diff options
-rw-r--r-- | composer.json | 1 | ||||
-rw-r--r-- | composer.lock | 2 | ||||
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/DownloadImages.php | 31 |
3 files changed, 16 insertions, 18 deletions
diff --git a/composer.json b/composer.json index 55e7f765..b0c17385 100644 --- a/composer.json +++ b/composer.json | |||
@@ -63,7 +63,6 @@ | |||
63 | "nelmio/api-doc-bundle": "^2.13.2", | 63 | "nelmio/api-doc-bundle": "^2.13.2", |
64 | "mgargano/simplehtmldom": "~1.5", | 64 | "mgargano/simplehtmldom": "~1.5", |
65 | "wallabag/tcpdf": "^6.2.26", | 65 | "wallabag/tcpdf": "^6.2.26", |
66 | "simplepie/simplepie": "~1.5", | ||
67 | "willdurand/hateoas-bundle": "~1.3", | 66 | "willdurand/hateoas-bundle": "~1.3", |
68 | "liip/theme-bundle": "^1.4.6", | 67 | "liip/theme-bundle": "^1.4.6", |
69 | "lexik/form-filter-bundle": "^5.0.4", | 68 | "lexik/form-filter-bundle": "^5.0.4", |
diff --git a/composer.lock b/composer.lock index cbb9265d..a2a48c1e 100644 --- a/composer.lock +++ b/composer.lock | |||
@@ -4,7 +4,7 @@ | |||
4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", | 4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", |
5 | "This file is @generated automatically" | 5 | "This file is @generated automatically" |
6 | ], | 6 | ], |
7 | "content-hash": "883f44eda34a48c8ddabc3294498d996", | 7 | "content-hash": "c42e1b50f4a2b8a59ca06c5ccb24e6a3", |
8 | "packages": [ | 8 | "packages": [ |
9 | { | 9 | { |
10 | "name": "bdunogier/guzzle-site-authenticator", | 10 | "name": "bdunogier/guzzle-site-authenticator", |
diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index 7a39a2e4..1d361d6d 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php | |||
@@ -2,6 +2,8 @@ | |||
2 | 2 | ||
3 | namespace Wallabag\CoreBundle\Helper; | 3 | namespace Wallabag\CoreBundle\Helper; |
4 | 4 | ||
5 | use GuzzleHttp\Psr7\Uri; | ||
6 | use GuzzleHttp\Psr7\UriResolver; | ||
5 | use Http\Client\Common\HttpMethodsClient; | 7 | use Http\Client\Common\HttpMethodsClient; |
6 | use Http\Client\Common\Plugin\ErrorPlugin; | 8 | use Http\Client\Common\Plugin\ErrorPlugin; |
7 | use Http\Client\Common\PluginClient; | 9 | use Http\Client\Common\PluginClient; |
@@ -45,10 +47,8 @@ class DownloadImages | |||
45 | public static function extractImagesUrlsFromHtml($html) | 47 | public static function extractImagesUrlsFromHtml($html) |
46 | { | 48 | { |
47 | $crawler = new Crawler($html); | 49 | $crawler = new Crawler($html); |
48 | $imagesCrawler = $crawler | 50 | $imagesCrawler = $crawler->filterXpath('//img'); |
49 | ->filterXpath('//img'); | 51 | $imagesUrls = $imagesCrawler->extract(['src']); |
50 | $imagesUrls = $imagesCrawler | ||
51 | ->extract(['src']); | ||
52 | $imagesSrcsetUrls = self::getSrcsetUrls($imagesCrawler); | 52 | $imagesSrcsetUrls = self::getSrcsetUrls($imagesCrawler); |
53 | 53 | ||
54 | return array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); | 54 | return array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); |
@@ -220,22 +220,25 @@ class DownloadImages | |||
220 | private static function getSrcsetUrls(Crawler $imagesCrawler) | 220 | private static function getSrcsetUrls(Crawler $imagesCrawler) |
221 | { | 221 | { |
222 | $urls = []; | 222 | $urls = []; |
223 | $iterator = $imagesCrawler | 223 | $iterator = $imagesCrawler->getIterator(); |
224 | ->getIterator(); | 224 | |
225 | while ($iterator->valid()) { | 225 | while ($iterator->valid()) { |
226 | $srcsetAttribute = $iterator->current()->getAttribute('srcset'); | 226 | $srcsetAttribute = $iterator->current()->getAttribute('srcset'); |
227 | |||
227 | if ('' !== $srcsetAttribute) { | 228 | if ('' !== $srcsetAttribute) { |
228 | // Couldn't start with " OR ' OR a white space | 229 | // Couldn't start with " OR ' OR a white space |
229 | // Could be one or more white space | 230 | // Could be one or more white space |
230 | // Must be one or more digits followed by w OR x | 231 | // Must be one or more digits followed by w OR x |
231 | $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; | 232 | $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; |
232 | preg_match_all($pattern, $srcsetAttribute, $matches); | 233 | preg_match_all($pattern, $srcsetAttribute, $matches); |
234 | |||
233 | $srcset = \call_user_func_array('array_merge', $matches); | 235 | $srcset = \call_user_func_array('array_merge', $matches); |
234 | $srcsetUrls = array_map(function ($src) { | 236 | $srcsetUrls = array_map(function ($src) { |
235 | return trim(explode(' ', $src, 2)[0]); | 237 | return trim(explode(' ', $src, 2)[0]); |
236 | }, $srcset); | 238 | }, $srcset); |
237 | $urls = array_merge($srcsetUrls, $urls); | 239 | $urls = array_merge($srcsetUrls, $urls); |
238 | } | 240 | } |
241 | |||
239 | $iterator->next(); | 242 | $iterator->next(); |
240 | } | 243 | } |
241 | 244 | ||
@@ -292,20 +295,16 @@ class DownloadImages | |||
292 | return $url; | 295 | return $url; |
293 | } | 296 | } |
294 | 297 | ||
295 | $base = new \SimplePie_IRI($base); | 298 | $base = new Uri($base); |
296 | 299 | ||
297 | // remove '//' in URL path (causes URLs not to resolve properly) | 300 | // in case the url has no scheme & host |
298 | if (isset($base->ipath)) { | 301 | if ('' === $base->getAuthority() || '' === $base->getScheme()) { |
299 | $base->ipath = preg_replace('!//+!', '/', $base->ipath); | 302 | $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); |
300 | } | ||
301 | 303 | ||
302 | if ($absolute = \SimplePie_IRI::absolutize($base, $url)) { | 304 | return false; |
303 | return $absolute->get_uri(); | ||
304 | } | 305 | } |
305 | 306 | ||
306 | $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); | 307 | return (string) UriResolver::resolve($base, new Uri($url)); |
307 | |||
308 | return false; | ||
309 | } | 308 | } |
310 | 309 | ||
311 | /** | 310 | /** |