X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=src%2FWallabag%2FCoreBundle%2FHelper%2FDownloadImages.php;h=1d98fd1a93135dbca275a721558749fda3bdd4c6;hb=48b0163d247554d7e2f1ec63b717c8216ea9ec59;hp=ed888cdb031ae52afd2b6624378c7927a0366e4d;hpb=577c0b6dd82c421c377c37295c59dee147068132;p=github%2Fwallabag%2Fwallabag.git diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index ed888cdb..1d98fd1a 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php @@ -2,12 +2,19 @@ namespace Wallabag\CoreBundle\Helper; +use GuzzleHttp\Psr7\Uri; +use GuzzleHttp\Psr7\UriResolver; +use Http\Client\Common\HttpMethodsClient; +use Http\Client\Common\Plugin\ErrorPlugin; +use Http\Client\Common\PluginClient; +use Http\Client\HttpClient; +use Http\Discovery\MessageFactoryDiscovery; +use Http\Message\MessageFactory; +use Psr\Http\Message\ResponseInterface; use Psr\Log\LoggerInterface; use Symfony\Component\DomCrawler\Crawler; -use GuzzleHttp\Client; -use GuzzleHttp\Message\Response; -use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; use Symfony\Component\Finder\Finder; +use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; class DownloadImages { @@ -19,9 +26,9 @@ class DownloadImages private $mimeGuesser; private $wallabagUrl; - public function __construct(Client $client, $baseFolder, $wallabagUrl, LoggerInterface $logger) + public function __construct(HttpClient $client, $baseFolder, $wallabagUrl, LoggerInterface $logger, MessageFactory $messageFactory = null) { - $this->client = $client; + $this->client = new HttpMethodsClient(new PluginClient($client, [new ErrorPlugin()]), $messageFactory ?: MessageFactoryDiscovery::find()); $this->baseFolder = $baseFolder; $this->wallabagUrl = rtrim($wallabagUrl, '/'); $this->logger = $logger; @@ -31,14 +38,20 @@ class DownloadImages } /** - * Setup base folder where all images are going to be saved. + * Process the html and extract images URLs from it. + * + * @param string $html + * + * @return string[] */ - private function setFolder() + public static function extractImagesUrlsFromHtml($html) { - // if folder doesn't exist, attempt to create one and store the folder name in property $folder - if (!file_exists($this->baseFolder)) { - mkdir($this->baseFolder, 0755, true); - } + $crawler = new Crawler($html); + $imagesCrawler = $crawler->filterXpath('//img'); + $imagesUrls = $imagesCrawler->extract(['src']); + $imagesSrcsetUrls = self::getSrcsetUrls($imagesCrawler); + + return array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); } /** @@ -52,15 +65,12 @@ class DownloadImages */ public function processHtml($entryId, $html, $url) { - $crawler = new Crawler($html); - $result = $crawler - ->filterXpath('//img') - ->extract(['src']); + $imagesUrls = self::extractImagesUrlsFromHtml($html); $relativePath = $this->getRelativePath($entryId); // download and save the image to the folder - foreach ($result as $image) { + foreach ($imagesUrls as $image) { $imagePath = $this->processSingleImage($entryId, $image, $url, $relativePath); if (false === $imagePath) { @@ -93,13 +103,17 @@ class DownloadImages */ public function processSingleImage($entryId, $imagePath, $url, $relativePath = null) { + if (null === $imagePath) { + return false; + } + if (null === $relativePath) { $relativePath = $this->getRelativePath($entryId); } - $this->logger->debug('DownloadImages: working on image: '.$imagePath); + $this->logger->debug('DownloadImages: working on image: ' . $imagePath); - $folderPath = $this->baseFolder.'/'.$relativePath; + $folderPath = $this->baseFolder . '/' . $relativePath; // build image path $absolutePath = $this->getAbsoluteLink($url, $imagePath); @@ -123,10 +137,10 @@ class DownloadImages } $hashImage = hash('crc32', $absolutePath); - $localPath = $folderPath.'/'.$hashImage.'.'.$ext; + $localPath = $folderPath . '/' . $hashImage . '.' . $ext; try { - $im = imagecreatefromstring($res->getBody()); + $im = imagecreatefromstring((string) $res->getBody()); } catch (\Exception $e) { $im = false; } @@ -139,7 +153,21 @@ class DownloadImages switch ($ext) { case 'gif': - imagegif($im, $localPath); + // use Imagick if available to keep GIF animation + if (class_exists('\\Imagick')) { + try { + $imagick = new \Imagick(); + $imagick->readImageBlob($res->getBody()); + $imagick->setImageFormat('gif'); + $imagick->writeImages($localPath, true); + } catch (\Exception $e) { + // if Imagick fail, fallback to the default solution + imagegif($im, $localPath); + } + } else { + imagegif($im, $localPath); + } + $this->logger->debug('DownloadImages: Re-creating gif'); break; case 'jpeg': @@ -156,7 +184,7 @@ class DownloadImages imagedestroy($im); - return $this->wallabagUrl.'/assets/images/'.$relativePath.'/'.$hashImage.'.'.$ext; + return $this->wallabagUrl . '/assets/images/' . $relativePath . '/' . $hashImage . '.' . $ext; } /** @@ -167,7 +195,7 @@ class DownloadImages public function removeImages($entryId) { $relativePath = $this->getRelativePath($entryId); - $folderPath = $this->baseFolder.'/'.$relativePath; + $folderPath = $this->baseFolder . '/' . $relativePath; $finder = new Finder(); $finder @@ -182,6 +210,50 @@ class DownloadImages @rmdir($folderPath); } + /** + * Get images urls from the srcset image attribute. + * + * @return array An array of urls + */ + private static function getSrcsetUrls(Crawler $imagesCrawler) + { + $urls = []; + $iterator = $imagesCrawler->getIterator(); + + while ($iterator->valid()) { + $srcsetAttribute = $iterator->current()->getAttribute('srcset'); + + if ('' !== $srcsetAttribute) { + // Couldn't start with " OR ' OR a white space + // Could be one or more white space + // Must be one or more digits followed by w OR x + $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; + preg_match_all($pattern, $srcsetAttribute, $matches); + + $srcset = \call_user_func_array('array_merge', $matches); + $srcsetUrls = array_map(function ($src) { + return trim(explode(' ', $src, 2)[0]); + }, $srcset); + $urls = array_merge($srcsetUrls, $urls); + } + + $iterator->next(); + } + + return $urls; + } + + /** + * Setup base folder where all images are going to be saved. + */ + private function setFolder() + { + // if folder doesn't exist, attempt to create one and store the folder name in property $folder + if (!file_exists($this->baseFolder)) { + mkdir($this->baseFolder, 0755, true); + } + } + /** * Generate the folder where we are going to save images based on the entry url. * @@ -192,8 +264,8 @@ class DownloadImages private function getRelativePath($entryId) { $hashId = hash('crc32', $entryId); - $relativePath = $hashId[0].'/'.$hashId[1].'/'.$hashId; - $folderPath = $this->baseFolder.'/'.$relativePath; + $relativePath = $hashId[0] . '/' . $hashId[1] . '/' . $hashId; + $folderPath = $this->baseFolder . '/' . $relativePath; if (!file_exists($folderPath)) { mkdir($folderPath, 0777, true); @@ -221,33 +293,29 @@ class DownloadImages return $url; } - $base = new \SimplePie_IRI($base); + $base = new Uri($base); - // remove '//' in URL path (causes URLs not to resolve properly) - if (isset($base->ipath)) { - $base->ipath = preg_replace('!//+!', '/', $base->ipath); - } + // in case the url has no scheme & host + if ('' === $base->getAuthority() || '' === $base->getScheme()) { + $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); - if ($absolute = \SimplePie_IRI::absolutize($base, $url)) { - return $absolute->get_uri(); + return false; } - $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); - - return false; + return (string) UriResolver::resolve($base, new Uri($url)); } /** * Retrieve and validate the extension from the response of the url of the image. * - * @param Response $res Guzzle Response - * @param string $imagePath Path from the src image from the content (used for log only) + * @param ResponseInterface $res Http Response + * @param string $imagePath Path from the src image from the content (used for log only) * * @return string|false Extension name or false if validation failed */ - private function getExtensionFromResponse(Response $res, $imagePath) + private function getExtensionFromResponse(ResponseInterface $res, $imagePath) { - $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); + $ext = $this->mimeGuesser->guess(current($res->getHeader('content-type'))); $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); // ok header doesn't have the extension, try a different way @@ -269,8 +337,8 @@ class DownloadImages $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]); } - if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) { - $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath); + if (!\in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) { + $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: ' . $imagePath); return false; }