From e6f12c073416eba6fc620f0ff38a343bda428280 Mon Sep 17 00:00:00 2001
From: Simounet
Date: Wed, 11 Jul 2018 19:57:34 +0200
Subject: More robust srcset image attribute handling

Linked to HTMLawed PR https://github.com/kesar/HTMLawed/pull/17
---
 src/Wallabag/CoreBundle/Helper/DownloadImages.php | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
index f91cdf5e..487a3a23 100644
--- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php
+++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
@@ -185,7 +185,7 @@ class DownloadImages
      *
      * @return array An array of urls
      */
-    protected function getSrcsetUrls(Crawler $imagesCrawler)
+    private function getSrcsetUrls(Crawler $imagesCrawler)
     {
         $urls = [];
         $iterator = $imagesCrawler
@@ -193,9 +193,14 @@ class DownloadImages
         while ($iterator->valid()) {
             $srcsetAttribute = $iterator->current()->getAttribute('srcset');
             if ('' !== $srcsetAttribute) {
-                $srcset = array_map('trim', explode(',', $srcsetAttribute));
+                // Couldn't start with " OR ' OR a white space
+                // Could be one or more white space
+                // Must be one or more digits followed by w OR x
+                $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/";
+                preg_match_all($pattern, $srcsetAttribute, $matches);
+                $srcset = call_user_func_array('array_merge', $matches);
                 $srcsetUrls = array_map(function ($src) {
-                    return explode(' ', $src)[0];
+                    return trim(explode(' ', $src, 2)[0]);
                 }, $srcset);
                 $urls = array_merge($srcsetUrls, $urls);
             }
-- 
cgit v1.2.3