X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=src%2FWallabag%2FCoreBundle%2FHelper%2FDownloadImages.php;h=8c1c208f57772c811abb409af2ebd75b60fb8f9a;hb=refs%2Fheads%2Fphp73;hp=264bc6a3eba7cb30ddedcad79821de83dac8341a;hpb=001cc7168aa1a7e9b8290b9c29566c586ac8b511;p=github%2Fwallabag%2Fwallabag.git diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index 264bc6a3..8c1c208f 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php @@ -2,11 +2,12 @@ namespace Wallabag\CoreBundle\Helper; +use GuzzleHttp\Client; +use GuzzleHttp\Message\Response; use Psr\Log\LoggerInterface; use Symfony\Component\DomCrawler\Crawler; -use GuzzleHttp\Client; -use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; use Symfony\Component\Finder\Finder; +use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; class DownloadImages { @@ -29,17 +30,6 @@ class DownloadImages $this->setFolder(); } - /** - * Setup base folder where all images are going to be saved. - */ - private function setFolder() - { - // if folder doesn't exist, attempt to create one and store the folder name in property $folder - if (!file_exists($this->baseFolder)) { - mkdir($this->baseFolder, 0777, true); - } - } - /** * Process the html and extract image from it, save them to local and return the updated html. * @@ -52,18 +42,26 @@ class DownloadImages public function processHtml($entryId, $html, $url) { $crawler = new Crawler($html); - $result = $crawler - ->filterXpath('//img') - ->extract(array('src')); + $imagesCrawler = $crawler + ->filterXpath('//img'); + $imagesUrls = $imagesCrawler + ->extract(['src']); + $imagesSrcsetUrls = $this->getSrcsetUrls($imagesCrawler); + $imagesUrls = array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); $relativePath = $this->getRelativePath($entryId); // download and save the image to the folder - foreach ($result as $image) { + foreach ($imagesUrls as $image) { $imagePath = $this->processSingleImage($entryId, $image, $url, $relativePath); if (false === $imagePath) { - continue; + break; + } + + // if image contains "&" and we can't find it in the html it might be because it's encoded as & + if (false !== stripos($image, '&') && false === stripos($html, $image)) { + $image = str_replace('&', '&', $image); } $html = str_replace($image, $imagePath, $html); @@ -87,13 +85,17 @@ class DownloadImages */ public function processSingleImage($entryId, $imagePath, $url, $relativePath = null) { + if (null === $imagePath) { + return false; + } + if (null === $relativePath) { $relativePath = $this->getRelativePath($entryId); } - $this->logger->debug('DownloadImages: working on image: '.$imagePath); + $this->logger->debug('DownloadImages: working on image: ' . $imagePath); - $folderPath = $this->baseFolder.'/'.$relativePath; + $folderPath = $this->baseFolder . '/' . $relativePath; // build image path $absolutePath = $this->getAbsoluteLink($url, $imagePath); @@ -111,15 +113,13 @@ class DownloadImages return false; } - $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); - $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); - if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) { - $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping '.$imagePath); - + $ext = $this->getExtensionFromResponse($res, $imagePath); + if (false === $res) { return false; } + $hashImage = hash('crc32', $absolutePath); - $localPath = $folderPath.'/'.$hashImage.'.'.$ext; + $localPath = $folderPath . '/' . $hashImage . '.' . $ext; try { $im = imagecreatefromstring($res->getBody()); @@ -144,13 +144,15 @@ class DownloadImages $this->logger->debug('DownloadImages: Re-creating jpg'); break; case 'png': + imagealphablending($im, false); + imagesavealpha($im, true); imagepng($im, $localPath, ceil(self::REGENERATE_PICTURES_QUALITY / 100 * 9)); $this->logger->debug('DownloadImages: Re-creating png'); } imagedestroy($im); - return $this->wallabagUrl.'/assets/images/'.$relativePath.'/'.$hashImage.'.'.$ext; + return $this->wallabagUrl . '/assets/images/' . $relativePath . '/' . $hashImage . '.' . $ext; } /** @@ -161,7 +163,7 @@ class DownloadImages public function removeImages($entryId) { $relativePath = $this->getRelativePath($entryId); - $folderPath = $this->baseFolder.'/'.$relativePath; + $folderPath = $this->baseFolder . '/' . $relativePath; $finder = new Finder(); $finder @@ -176,6 +178,49 @@ class DownloadImages @rmdir($folderPath); } + /** + * Get images urls from the srcset image attribute. + * + * @param Crawler $imagesCrawler + * + * @return array An array of urls + */ + private function getSrcsetUrls(Crawler $imagesCrawler) + { + $urls = []; + $iterator = $imagesCrawler + ->getIterator(); + while ($iterator->valid()) { + $srcsetAttribute = $iterator->current()->getAttribute('srcset'); + if ('' !== $srcsetAttribute) { + // Couldn't start with " OR ' OR a white space + // Could be one or more white space + // Must be one or more digits followed by w OR x + $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; + preg_match_all($pattern, $srcsetAttribute, $matches); + $srcset = \call_user_func_array('array_merge', $matches); + $srcsetUrls = array_map(function ($src) { + return trim(explode(' ', $src, 2)[0]); + }, $srcset); + $urls = array_merge($srcsetUrls, $urls); + } + $iterator->next(); + } + + return $urls; + } + + /** + * Setup base folder where all images are going to be saved. + */ + private function setFolder() + { + // if folder doesn't exist, attempt to create one and store the folder name in property $folder + if (!file_exists($this->baseFolder)) { + mkdir($this->baseFolder, 0755, true); + } + } + /** * Generate the folder where we are going to save images based on the entry url. * @@ -186,8 +231,8 @@ class DownloadImages private function getRelativePath($entryId) { $hashId = hash('crc32', $entryId); - $relativePath = $hashId[0].'/'.$hashId[1].'/'.$hashId; - $folderPath = $this->baseFolder.'/'.$relativePath; + $relativePath = $hashId[0] . '/' . $hashId[1] . '/' . $hashId; + $folderPath = $this->baseFolder . '/' . $relativePath; if (!file_exists($folderPath)) { mkdir($folderPath, 0777, true); @@ -230,4 +275,45 @@ class DownloadImages return false; } + + /** + * Retrieve and validate the extension from the response of the url of the image. + * + * @param Response $res Guzzle Response + * @param string $imagePath Path from the src image from the content (used for log only) + * + * @return string|false Extension name or false if validation failed + */ + private function getExtensionFromResponse(Response $res, $imagePath) + { + $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); + $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); + + // ok header doesn't have the extension, try a different way + if (empty($ext)) { + $types = [ + 'jpeg' => "\xFF\xD8\xFF", + 'gif' => 'GIF', + 'png' => "\x89\x50\x4e\x47\x0d\x0a", + ]; + $bytes = substr((string) $res->getBody(), 0, 8); + + foreach ($types as $type => $header) { + if (0 === strpos($bytes, $header)) { + $ext = $type; + break; + } + } + + $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]); + } + + if (!\in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) { + $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: ' . $imagePath); + + return false; + } + + return $ext; + } }