namespace Wallabag\CoreBundle\Helper;

use GuzzleHttp\Client;
use GuzzleHttp\Message\Response;
use Psr\Log\LoggerInterface;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\Finder\Finder;
use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;

/**
 * Downloads the images referenced by an entry's HTML, re-encodes them with GD
 * and rewrites the HTML so that it points at the locally stored copies.
 */
class DownloadImages
{
    /**
     * Quality used when re-encoding pictures: passed as-is to imagejpeg()
     * and scaled to a 0-9 level for imagepng().
     */
    const REGENERATE_PICTURES_QUALITY = 80;

    private $client;
    private $baseFolder;
    private $logger;
    private $mimeGuesser;
    private $wallabagUrl;

    /**
     * @param Client          $client      HTTP client used to fetch the images
     * @param string          $baseFolder  Folder under which all images are stored (created if missing)
     * @param string          $wallabagUrl Base url prepended to the returned image urls (trailing "/" is stripped)
     * @param LoggerInterface $logger
     */
    public function __construct(Client $client, $baseFolder, $wallabagUrl, LoggerInterface $logger)
    {
        $this->client = $client;
        $this->baseFolder = $baseFolder;
        $this->wallabagUrl = rtrim($wallabagUrl, '/');
        $this->logger = $logger;
        $this->mimeGuesser = new MimeTypeExtensionGuesser();

        $this->setFolder();
    }

    /**
     * Process the html, extract the images from it, save them locally and return the updated html.
     *
     * Both the "src" and the "srcset" attributes of every <img> are handled.
     * Images that can not be processed are left untouched in the html.
     *
     * @param int    $entryId ID of the entry
     * @param string $html
     * @param string $url     Used as a base path for relative images and folder
     *
     * @return string
     */
    public function processHtml($entryId, $html, $url)
    {
        $crawler = new Crawler($html);
        $imagesCrawler = $crawler->filterXpath('//img');
        $imagesUrls = $imagesCrawler->extract(['src']);
        $imagesSrcsetUrls = $this->getSrcsetUrls($imagesCrawler);
        $imagesUrls = array_unique(array_merge($imagesUrls, $imagesSrcsetUrls));

        $relativePath = $this->getRelativePath($entryId);

        // download and save each image to the folder
        foreach ($imagesUrls as $image) {
            $imagePath = $this->processSingleImage($entryId, $image, $url, $relativePath);

            if (false === $imagePath) {
                continue;
            }

            // if image contains "&" and we can't find it in the html it might be because it's encoded as &amp;
            if (false !== stripos($image, '&') && false === stripos($html, $image)) {
                $image = str_replace('&', '&amp;', $image);
            }

            $html = str_replace($image, $imagePath, $html);
        }

        return $html;
    }

    /**
     * Process a single image:
     *     - retrieve it
     *     - re-save it (for security reasons)
     *     - return the new local path.
     *
     * @param int    $entryId      ID of the entry
     * @param string $imagePath    Path to the image to retrieve
     * @param string $url          Url where the image was found
     * @param string $relativePath Relative local path where the image is saved (computed from $entryId when null)
     *
     * @return string|false Url to access the image from the web, or false when the image could not be processed
     */
    public function processSingleImage($entryId, $imagePath, $url, $relativePath = null)
    {
        if (null === $imagePath) {
            return false;
        }

        if (null === $relativePath) {
            $relativePath = $this->getRelativePath($entryId);
        }

        $this->logger->debug('DownloadImages: working on image: ' . $imagePath);

        $folderPath = $this->baseFolder . '/' . $relativePath;

        // build image path
        $absolutePath = $this->getAbsoluteLink($url, $imagePath);
        if (false === $absolutePath) {
            $this->logger->error('DownloadImages: Can not determine the absolute path for that image, skipping.');

            return false;
        }

        try {
            $res = $this->client->get($absolutePath);
        } catch (\Exception $e) {
            $this->logger->error('DownloadImages: Can not retrieve image, skipping.', ['exception' => $e]);

            return false;
        }

        // the extension is the value that signals a rejected image, not the response
        $ext = $this->getExtensionFromResponse($res, $imagePath);
        if (false === $ext) {
            return false;
        }

        $hashImage = hash('crc32', $absolutePath);
        $localPath = $folderPath . '/' . $hashImage . '.' . $ext;

        try {
            $im = imagecreatefromstring((string) $res->getBody());
        } catch (\Exception $e) {
            $im = false;
        }

        if (false === $im) {
            $this->logger->error('DownloadImages: Error while regenerating image', ['path' => $localPath]);

            return false;
        }

        $saved = false;

        switch ($ext) {
            case 'gif':
                $saved = imagegif($im, $localPath);
                $this->logger->debug('DownloadImages: Re-creating gif');
                break;
            case 'jpeg':
            case 'jpg':
                $saved = imagejpeg($im, $localPath, self::REGENERATE_PICTURES_QUALITY);
                $this->logger->debug('DownloadImages: Re-creating jpg');
                break;
            case 'png':
                // keep the alpha channel of the source picture
                imagealphablending($im, false);
                imagesavealpha($im, true);
                $saved = imagepng($im, $localPath, (int) ceil(self::REGENERATE_PICTURES_QUALITY / 100 * 9));
                $this->logger->debug('DownloadImages: Re-creating png');
                break;
        }

        imagedestroy($im);

        // do not hand out an url for a file that was never written
        if (false === $saved) {
            $this->logger->error('DownloadImages: Error while saving regenerated image', ['path' => $localPath]);

            return false;
        }

        return $this->wallabagUrl . '/assets/images/' . $relativePath . '/' . $hashImage . '.' . $ext;
    }

    /**
     * Remove all images for the given entry id.
     *
     * Failures while deleting files or the folder are silently ignored.
     *
     * @param int $entryId ID of the entry
     */
    public function removeImages($entryId)
    {
        $relativePath = $this->getRelativePath($entryId);
        $folderPath = $this->baseFolder . '/' . $relativePath;

        $finder = new Finder();
        $finder
            ->files()
            ->ignoreDotFiles(true)
            ->in($folderPath);

        foreach ($finder as $file) {
            @unlink($file->getRealPath());
        }

        @rmdir($folderPath);
    }

    /**
     * Get images urls from the srcset image attribute.
     *
     * @param Crawler $imagesCrawler
     *
     * @return array An array of urls
     */
    private function getSrcsetUrls(Crawler $imagesCrawler)
    {
        // Couldn't start with " OR ' OR a white space
        // Could be one or more white space
        // Must be one or more digits followed by w OR x
        $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/";
        $urls = [];

        foreach ($imagesCrawler->getIterator() as $imageNode) {
            $srcsetAttribute = $imageNode->getAttribute('srcset');
            if ('' === $srcsetAttribute) {
                continue;
            }

            preg_match_all($pattern, $srcsetAttribute, $matches);
            // the pattern has no capturing group, so only the full matches exist
            $candidates = isset($matches[0]) ? $matches[0] : [];

            // each candidate looks like "<url> <descriptor>": keep only the url
            $srcsetUrls = array_map(function ($src) {
                return trim(explode(' ', $src, 2)[0]);
            }, $candidates);

            $urls = array_merge($srcsetUrls, $urls);
        }

        return $urls;
    }

    /**
     * Setup base folder where all images are going to be saved.
     */
    private function setFolder()
    {
        // if the base folder doesn't exist, attempt to create it (recursively)
        if (!file_exists($this->baseFolder)) {
            mkdir($this->baseFolder, 0755, true);
        }
    }

    /**
     * Generate the folder where we are going to save images, based on the entry id.
     *
     * The path is built from the crc32 hash of the id: "<1st char>/<2nd char>/<hash>".
     * The folder is created under the base folder when it does not exist yet.
     *
     * @param int $entryId ID of the entry
     *
     * @return string Path relative to the base folder
     */
    private function getRelativePath($entryId)
    {
        $hashId = hash('crc32', $entryId);
        $relativePath = $hashId[0] . '/' . $hashId[1] . '/' . $hashId;
        $folderPath = $this->baseFolder . '/' . $relativePath;

        if (!file_exists($folderPath)) {
            mkdir($folderPath, 0777, true);
        }

        $this->logger->debug('DownloadImages: Folder used for that Entry id', ['folder' => $folderPath, 'entryId' => $entryId]);

        return $relativePath;
    }

    /**
     * Make an $url absolute based on the $base.
     *
     * @see Graby->makeAbsoluteStr
     *
     * @param string $base Base url
     * @param string $url  Url to make it absolute
     *
     * @return false|string
     */
    private function getAbsoluteLink($base, $url)
    {
        if (preg_match('!^https?://!i', $url)) {
            // already absolute
            return $url;
        }

        $base = new \SimplePie_IRI($base);

        // remove '//' in URL path (causes URLs not to resolve properly)
        if (isset($base->ipath)) {
            $base->ipath = preg_replace('!//+!', '/', $base->ipath);
        }

        if ($absolute = \SimplePie_IRI::absolutize($base, $url)) {
            return $absolute->get_uri();
        }

        $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]);

        return false;
    }

    /**
     * Retrieve and validate the extension from the response of the url of the image.
     *
     * The extension is first guessed from the "content-type" header; when that gives
     * nothing, the first bytes of the body are compared with known image signatures.
     *
     * @param Response $res       Guzzle Response
     * @param string   $imagePath Path from the src image from the content (used for log only)
     *
     * @return string|false Extension name or false if validation failed
     */
    private function getExtensionFromResponse(Response $res, $imagePath)
    {
        $contentType = $res->getHeader('content-type');
        $ext = $this->mimeGuesser->guess($contentType);
        $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $contentType]);

        // ok header doesn't have the extension, try a different way
        if (empty($ext)) {
            $types = [
                'jpeg' => "\xFF\xD8\xFF",
                'gif' => 'GIF',
                'png' => "\x89\x50\x4e\x47\x0d\x0a",
            ];
            $bytes = substr((string) $res->getBody(), 0, 8);

            foreach ($types as $type => $header) {
                if (0 === strpos($bytes, $header)) {
                    $ext = $type;
                    break;
                }
            }

            $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]);
        }

        if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
            $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: ' . $imagePath);

            return false;
        }

        return $ext;
    }
}