diff options
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/DownloadImages.php')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/DownloadImages.php | 75 |
1 files changed, 46 insertions, 29 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index 9a7e9828..1d361d6d 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php | |||
@@ -2,8 +2,15 @@ | |||
2 | 2 | ||
3 | namespace Wallabag\CoreBundle\Helper; | 3 | namespace Wallabag\CoreBundle\Helper; |
4 | 4 | ||
5 | use GuzzleHttp\Client; | 5 | use GuzzleHttp\Psr7\Uri; |
6 | use GuzzleHttp\Message\Response; | 6 | use GuzzleHttp\Psr7\UriResolver; |
7 | use Http\Client\Common\HttpMethodsClient; | ||
8 | use Http\Client\Common\Plugin\ErrorPlugin; | ||
9 | use Http\Client\Common\PluginClient; | ||
10 | use Http\Client\HttpClient; | ||
11 | use Http\Discovery\MessageFactoryDiscovery; | ||
12 | use Http\Message\MessageFactory; | ||
13 | use Psr\Http\Message\ResponseInterface; | ||
7 | use Psr\Log\LoggerInterface; | 14 | use Psr\Log\LoggerInterface; |
8 | use Symfony\Component\DomCrawler\Crawler; | 15 | use Symfony\Component\DomCrawler\Crawler; |
9 | use Symfony\Component\Finder\Finder; | 16 | use Symfony\Component\Finder\Finder; |
@@ -19,9 +26,9 @@ class DownloadImages | |||
19 | private $mimeGuesser; | 26 | private $mimeGuesser; |
20 | private $wallabagUrl; | 27 | private $wallabagUrl; |
21 | 28 | ||
22 | public function __construct(Client $client, $baseFolder, $wallabagUrl, LoggerInterface $logger) | 29 | public function __construct(HttpClient $client, $baseFolder, $wallabagUrl, LoggerInterface $logger, MessageFactory $messageFactory = null) |
23 | { | 30 | { |
24 | $this->client = $client; | 31 | $this->client = new HttpMethodsClient(new PluginClient($client, [new ErrorPlugin()]), $messageFactory ?: MessageFactoryDiscovery::find()); |
25 | $this->baseFolder = $baseFolder; | 32 | $this->baseFolder = $baseFolder; |
26 | $this->wallabagUrl = rtrim($wallabagUrl, '/'); | 33 | $this->wallabagUrl = rtrim($wallabagUrl, '/'); |
27 | $this->logger = $logger; | 34 | $this->logger = $logger; |
@@ -31,6 +38,23 @@ class DownloadImages | |||
31 | } | 38 | } |
32 | 39 | ||
33 | /** | 40 | /** |
41 | * Process the html and extract images URLs from it. | ||
42 | * | ||
43 | * @param string $html | ||
44 | * | ||
45 | * @return string[] | ||
46 | */ | ||
47 | public static function extractImagesUrlsFromHtml($html) | ||
48 | { | ||
49 | $crawler = new Crawler($html); | ||
50 | $imagesCrawler = $crawler->filterXpath('//img'); | ||
51 | $imagesUrls = $imagesCrawler->extract(['src']); | ||
52 | $imagesSrcsetUrls = self::getSrcsetUrls($imagesCrawler); | ||
53 | |||
54 | return array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); | ||
55 | } | ||
56 | |||
57 | /** | ||
34 | * Process the html and extract image from it, save them to local and return the updated html. | 58 | * Process the html and extract image from it, save them to local and return the updated html. |
35 | * | 59 | * |
36 | * @param int $entryId ID of the entry | 60 | * @param int $entryId ID of the entry |
@@ -41,13 +65,7 @@ class DownloadImages | |||
41 | */ | 65 | */ |
42 | public function processHtml($entryId, $html, $url) | 66 | public function processHtml($entryId, $html, $url) |
43 | { | 67 | { |
44 | $crawler = new Crawler($html); | 68 | $imagesUrls = self::extractImagesUrlsFromHtml($html); |
45 | $imagesCrawler = $crawler | ||
46 | ->filterXpath('//img'); | ||
47 | $imagesUrls = $imagesCrawler | ||
48 | ->extract(['src']); | ||
49 | $imagesSrcsetUrls = $this->getSrcsetUrls($imagesCrawler); | ||
50 | $imagesUrls = array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); | ||
51 | 69 | ||
52 | $relativePath = $this->getRelativePath($entryId); | 70 | $relativePath = $this->getRelativePath($entryId); |
53 | 71 | ||
@@ -122,7 +140,7 @@ class DownloadImages | |||
122 | $localPath = $folderPath . '/' . $hashImage . '.' . $ext; | 140 | $localPath = $folderPath . '/' . $hashImage . '.' . $ext; |
123 | 141 | ||
124 | try { | 142 | try { |
125 | $im = imagecreatefromstring($res->getBody()); | 143 | $im = imagecreatefromstring((string) $res->getBody()); |
126 | } catch (\Exception $e) { | 144 | } catch (\Exception $e) { |
127 | $im = false; | 145 | $im = false; |
128 | } | 146 | } |
@@ -199,25 +217,28 @@ class DownloadImages | |||
199 | * | 217 | * |
200 | * @return array An array of urls | 218 | * @return array An array of urls |
201 | */ | 219 | */ |
202 | private function getSrcsetUrls(Crawler $imagesCrawler) | 220 | private static function getSrcsetUrls(Crawler $imagesCrawler) |
203 | { | 221 | { |
204 | $urls = []; | 222 | $urls = []; |
205 | $iterator = $imagesCrawler | 223 | $iterator = $imagesCrawler->getIterator(); |
206 | ->getIterator(); | 224 | |
207 | while ($iterator->valid()) { | 225 | while ($iterator->valid()) { |
208 | $srcsetAttribute = $iterator->current()->getAttribute('srcset'); | 226 | $srcsetAttribute = $iterator->current()->getAttribute('srcset'); |
227 | |||
209 | if ('' !== $srcsetAttribute) { | 228 | if ('' !== $srcsetAttribute) { |
210 | // Couldn't start with " OR ' OR a white space | 229 | // Couldn't start with " OR ' OR a white space |
211 | // Could be one or more white space | 230 | // Could be one or more white space |
212 | // Must be one or more digits followed by w OR x | 231 | // Must be one or more digits followed by w OR x |
213 | $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; | 232 | $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; |
214 | preg_match_all($pattern, $srcsetAttribute, $matches); | 233 | preg_match_all($pattern, $srcsetAttribute, $matches); |
234 | |||
215 | $srcset = \call_user_func_array('array_merge', $matches); | 235 | $srcset = \call_user_func_array('array_merge', $matches); |
216 | $srcsetUrls = array_map(function ($src) { | 236 | $srcsetUrls = array_map(function ($src) { |
217 | return trim(explode(' ', $src, 2)[0]); | 237 | return trim(explode(' ', $src, 2)[0]); |
218 | }, $srcset); | 238 | }, $srcset); |
219 | $urls = array_merge($srcsetUrls, $urls); | 239 | $urls = array_merge($srcsetUrls, $urls); |
220 | } | 240 | } |
241 | |||
221 | $iterator->next(); | 242 | $iterator->next(); |
222 | } | 243 | } |
223 | 244 | ||
@@ -274,33 +295,29 @@ class DownloadImages | |||
274 | return $url; | 295 | return $url; |
275 | } | 296 | } |
276 | 297 | ||
277 | $base = new \SimplePie_IRI($base); | 298 | $base = new Uri($base); |
278 | 299 | ||
279 | // remove '//' in URL path (causes URLs not to resolve properly) | 300 | // in case the url has no scheme & host |
280 | if (isset($base->ipath)) { | 301 | if ('' === $base->getAuthority() || '' === $base->getScheme()) { |
281 | $base->ipath = preg_replace('!//+!', '/', $base->ipath); | 302 | $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); |
282 | } | ||
283 | 303 | ||
284 | if ($absolute = \SimplePie_IRI::absolutize($base, $url)) { | 304 | return false; |
285 | return $absolute->get_uri(); | ||
286 | } | 305 | } |
287 | 306 | ||
288 | $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); | 307 | return (string) UriResolver::resolve($base, new Uri($url)); |
289 | |||
290 | return false; | ||
291 | } | 308 | } |
292 | 309 | ||
293 | /** | 310 | /** |
294 | * Retrieve and validate the extension from the response of the url of the image. | 311 | * Retrieve and validate the extension from the response of the url of the image. |
295 | * | 312 | * |
296 | * @param Response $res Guzzle Response | 313 | * @param ResponseInterface $res Http Response |
297 | * @param string $imagePath Path from the src image from the content (used for log only) | 314 | * @param string $imagePath Path from the src image from the content (used for log only) |
298 | * | 315 | * |
299 | * @return string|false Extension name or false if validation failed | 316 | * @return string|false Extension name or false if validation failed |
300 | */ | 317 | */ |
301 | private function getExtensionFromResponse(Response $res, $imagePath) | 318 | private function getExtensionFromResponse(ResponseInterface $res, $imagePath) |
302 | { | 319 | { |
303 | $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); | 320 | $ext = $this->mimeGuesser->guess(current($res->getHeader('content-type'))); |
304 | $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); | 321 | $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); |
305 | 322 | ||
306 | // ok header doesn't have the extension, try a different way | 323 | // ok header doesn't have the extension, try a different way |