From 577c0b6dd82c421c377c37295c59dee147068132 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Mon, 5 Jun 2017 22:54:02 +0200 Subject: Use an alternative way to detect image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When parsing content to retrieve images to save locally, we only check for the content-type of the image response. In some case, that value is empty. Now we’re also checking for the first few bytes of the content as an alternative to detect if it’s an image wallabag can handle. We might get higher image supports using that alternative method. --- src/Wallabag/CoreBundle/Helper/DownloadImages.php | 50 ++++++++++++++++++--- .../CoreBundle/Helper/DownloadImagesTest.php | 25 +++++++++++ .../CoreBundle/fixtures/image-no-content-type.jpg | Bin 0 -> 354067 bytes 3 files changed, 70 insertions(+), 5 deletions(-) create mode 100644 tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index 54e23a05..ed888cdb 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php @@ -5,6 +5,7 @@ namespace Wallabag\CoreBundle\Helper; use Psr\Log\LoggerInterface; use Symfony\Component\DomCrawler\Crawler; use GuzzleHttp\Client; +use GuzzleHttp\Message\Response; use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; use Symfony\Component\Finder\Finder; @@ -116,13 +117,11 @@ class DownloadImages return false; } - $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); - $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); - if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) { - $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath); - + $ext = $this->getExtensionFromResponse($res, $imagePath); + if (false === $res) { return false; } + $hashImage = hash('crc32', $absolutePath); $localPath = $folderPath.'/'.$hashImage.'.'.$ext; @@ -237,4 +236,45 @@ class DownloadImages return false; } + + /** + * Retrieve and validate the extension from the response of the url of the image. + * + * @param Response $res Guzzle Response + * @param string $imagePath Path from the src image from the content (used for log only) + * + * @return string|false Extension name or false if validation failed + */ + private function getExtensionFromResponse(Response $res, $imagePath) + { + $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); + $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); + + // ok header doesn't have the extension, try a different way + if (empty($ext)) { + $types = [ + 'jpeg' => "\xFF\xD8\xFF", + 'gif' => 'GIF', + 'png' => "\x89\x50\x4e\x47\x0d\x0a", + ]; + $bytes = substr((string) $res->getBody(), 0, 8); + + foreach ($types as $type => $header) { + if (0 === strpos($bytes, $header)) { + $ext = $type; + break; + } + } + + $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]); + } + + if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) { + $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath); + + return false; + } + + return $ext; + } } diff --git a/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php b/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php index 9125f8dc..c02f9658 100644 --- a/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php +++ b/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php @@ -157,4 +157,29 @@ class DownloadImagesTest extends \PHPUnit_Framework_TestCase $this->assertFalse($res, 'Absolute image can not be determined, so it will not be replaced'); } + + public function testProcessRealImage() + { + $client = new Client(); + + $mock = new Mock([ + new Response(200, ['content-type' => null], Stream::factory(file_get_contents(__DIR__.'/../fixtures/image-no-content-type.jpg'))), + ]); + + $client->getEmitter()->attach($mock); + + $logHandler = new TestHandler(); + $logger = new Logger('test', array($logHandler)); + + $download = new DownloadImages($client, sys_get_temp_dir().'/wallabag_test', 'http://wallabag.io/', $logger); + + $res = $download->processSingleImage( + 123, + 'https://cdn.theconversation.com/files/157200/article/width926/gsj2rjp2-1487348607.jpg', + 'https://theconversation.com/conversation-avec-gerald-bronner-ce-nest-pas-la-post-verite-qui-nous-menace-mais-lextension-de-notre-credulite-73089' + ); + + $this->assertContains('http://wallabag.io/assets/images/9/b/9b0ead26/', $res, 'Content-Type was empty but data is ok for an image'); + $this->assertContains('DownloadImages: Checking extension (alternative)', $logHandler->getRecords()[3]['message']); + } } diff --git a/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg b/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg new file mode 100644 index 00000000..0c60e952 Binary files /dev/null and b/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg differ -- cgit v1.2.3