aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Wallabag/CoreBundle/Helper/DownloadImages.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/DownloadImages.php')
-rw-r--r--src/Wallabag/CoreBundle/Helper/DownloadImages.php75
1 files changed, 46 insertions, 29 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
index 9a7e9828..1d361d6d 100644
--- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php
+++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
@@ -2,8 +2,15 @@
2 2
3namespace Wallabag\CoreBundle\Helper; 3namespace Wallabag\CoreBundle\Helper;
4 4
5use GuzzleHttp\Client; 5use GuzzleHttp\Psr7\Uri;
6use GuzzleHttp\Message\Response; 6use GuzzleHttp\Psr7\UriResolver;
7use Http\Client\Common\HttpMethodsClient;
8use Http\Client\Common\Plugin\ErrorPlugin;
9use Http\Client\Common\PluginClient;
10use Http\Client\HttpClient;
11use Http\Discovery\MessageFactoryDiscovery;
12use Http\Message\MessageFactory;
13use Psr\Http\Message\ResponseInterface;
7use Psr\Log\LoggerInterface; 14use Psr\Log\LoggerInterface;
8use Symfony\Component\DomCrawler\Crawler; 15use Symfony\Component\DomCrawler\Crawler;
9use Symfony\Component\Finder\Finder; 16use Symfony\Component\Finder\Finder;
@@ -19,9 +26,9 @@ class DownloadImages
19 private $mimeGuesser; 26 private $mimeGuesser;
20 private $wallabagUrl; 27 private $wallabagUrl;
21 28
22 public function __construct(Client $client, $baseFolder, $wallabagUrl, LoggerInterface $logger) 29 public function __construct(HttpClient $client, $baseFolder, $wallabagUrl, LoggerInterface $logger, MessageFactory $messageFactory = null)
23 { 30 {
24 $this->client = $client; 31 $this->client = new HttpMethodsClient(new PluginClient($client, [new ErrorPlugin()]), $messageFactory ?: MessageFactoryDiscovery::find());
25 $this->baseFolder = $baseFolder; 32 $this->baseFolder = $baseFolder;
26 $this->wallabagUrl = rtrim($wallabagUrl, '/'); 33 $this->wallabagUrl = rtrim($wallabagUrl, '/');
27 $this->logger = $logger; 34 $this->logger = $logger;
@@ -31,6 +38,23 @@ class DownloadImages
31 } 38 }
32 39
33 /** 40 /**
41 * Process the html and extract images URLs from it.
42 *
43 * @param string $html
44 *
45 * @return string[]
46 */
47 public static function extractImagesUrlsFromHtml($html)
48 {
49 $crawler = new Crawler($html);
50 $imagesCrawler = $crawler->filterXpath('//img');
51 $imagesUrls = $imagesCrawler->extract(['src']);
52 $imagesSrcsetUrls = self::getSrcsetUrls($imagesCrawler);
53
54 return array_unique(array_merge($imagesUrls, $imagesSrcsetUrls));
55 }
56
57 /**
34 * Process the html and extract image from it, save them to local and return the updated html. 58 * Process the html and extract image from it, save them to local and return the updated html.
35 * 59 *
36 * @param int $entryId ID of the entry 60 * @param int $entryId ID of the entry
@@ -41,13 +65,7 @@ class DownloadImages
41 */ 65 */
42 public function processHtml($entryId, $html, $url) 66 public function processHtml($entryId, $html, $url)
43 { 67 {
44 $crawler = new Crawler($html); 68 $imagesUrls = self::extractImagesUrlsFromHtml($html);
45 $imagesCrawler = $crawler
46 ->filterXpath('//img');
47 $imagesUrls = $imagesCrawler
48 ->extract(['src']);
49 $imagesSrcsetUrls = $this->getSrcsetUrls($imagesCrawler);
50 $imagesUrls = array_unique(array_merge($imagesUrls, $imagesSrcsetUrls));
51 69
52 $relativePath = $this->getRelativePath($entryId); 70 $relativePath = $this->getRelativePath($entryId);
53 71
@@ -122,7 +140,7 @@ class DownloadImages
122 $localPath = $folderPath . '/' . $hashImage . '.' . $ext; 140 $localPath = $folderPath . '/' . $hashImage . '.' . $ext;
123 141
124 try { 142 try {
125 $im = imagecreatefromstring($res->getBody()); 143 $im = imagecreatefromstring((string) $res->getBody());
126 } catch (\Exception $e) { 144 } catch (\Exception $e) {
127 $im = false; 145 $im = false;
128 } 146 }
@@ -199,25 +217,28 @@ class DownloadImages
199 * 217 *
200 * @return array An array of urls 218 * @return array An array of urls
201 */ 219 */
202 private function getSrcsetUrls(Crawler $imagesCrawler) 220 private static function getSrcsetUrls(Crawler $imagesCrawler)
203 { 221 {
204 $urls = []; 222 $urls = [];
205 $iterator = $imagesCrawler 223 $iterator = $imagesCrawler->getIterator();
206 ->getIterator(); 224
207 while ($iterator->valid()) { 225 while ($iterator->valid()) {
208 $srcsetAttribute = $iterator->current()->getAttribute('srcset'); 226 $srcsetAttribute = $iterator->current()->getAttribute('srcset');
227
209 if ('' !== $srcsetAttribute) { 228 if ('' !== $srcsetAttribute) {
210 // Couldn't start with " OR ' OR a white space 229 // Couldn't start with " OR ' OR a white space
211 // Could be one or more white space 230 // Could be one or more white space
212 // Must be one or more digits followed by w OR x 231 // Must be one or more digits followed by w OR x
213 $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; 232 $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/";
214 preg_match_all($pattern, $srcsetAttribute, $matches); 233 preg_match_all($pattern, $srcsetAttribute, $matches);
234
215 $srcset = \call_user_func_array('array_merge', $matches); 235 $srcset = \call_user_func_array('array_merge', $matches);
216 $srcsetUrls = array_map(function ($src) { 236 $srcsetUrls = array_map(function ($src) {
217 return trim(explode(' ', $src, 2)[0]); 237 return trim(explode(' ', $src, 2)[0]);
218 }, $srcset); 238 }, $srcset);
219 $urls = array_merge($srcsetUrls, $urls); 239 $urls = array_merge($srcsetUrls, $urls);
220 } 240 }
241
221 $iterator->next(); 242 $iterator->next();
222 } 243 }
223 244
@@ -274,33 +295,29 @@ class DownloadImages
274 return $url; 295 return $url;
275 } 296 }
276 297
277 $base = new \SimplePie_IRI($base); 298 $base = new Uri($base);
278 299
279 // remove '//' in URL path (causes URLs not to resolve properly) 300 // in case the url has no scheme & host
280 if (isset($base->ipath)) { 301 if ('' === $base->getAuthority() || '' === $base->getScheme()) {
281 $base->ipath = preg_replace('!//+!', '/', $base->ipath); 302 $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]);
282 }
283 303
284 if ($absolute = \SimplePie_IRI::absolutize($base, $url)) { 304 return false;
285 return $absolute->get_uri();
286 } 305 }
287 306
288 $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); 307 return (string) UriResolver::resolve($base, new Uri($url));
289
290 return false;
291 } 308 }
292 309
293 /** 310 /**
294 * Retrieve and validate the extension from the response of the url of the image. 311 * Retrieve and validate the extension from the response of the url of the image.
295 * 312 *
296 * @param Response $res Guzzle Response 313 * @param ResponseInterface $res Http Response
297 * @param string $imagePath Path from the src image from the content (used for log only) 314 * @param string $imagePath Path from the src image from the content (used for log only)
298 * 315 *
299 * @return string|false Extension name or false if validation failed 316 * @return string|false Extension name or false if validation failed
300 */ 317 */
301 private function getExtensionFromResponse(Response $res, $imagePath) 318 private function getExtensionFromResponse(ResponseInterface $res, $imagePath)
302 { 319 {
303 $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); 320 $ext = $this->mimeGuesser->guess(current($res->getHeader('content-type')));
304 $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); 321 $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);
305 322
306 // ok header doesn't have the extension, try a different way 323 // ok header doesn't have the extension, try a different way