]> git.immae.eu Git - github/wallabag/wallabag.git/commitdiff
Use an alternative way to detect image 3184/head
authorJeremy Benoist <jeremy.benoist@gmail.com>
Mon, 5 Jun 2017 20:54:02 +0000 (22:54 +0200)
committerJeremy Benoist <jeremy.benoist@gmail.com>
Mon, 5 Jun 2017 20:54:02 +0000 (22:54 +0200)
When parsing content to retrieve images to save locally, we only check for the content-type of the image response.
In some case, that value is empty.
Now we’re also checking for the first few bytes of the content as an alternative to detect if it’s an image wallabag can handle.
We might get higher image supports using that alternative method.

src/Wallabag/CoreBundle/Helper/DownloadImages.php
tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php
tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg [new file with mode: 0644]

index 54e23a052684bc87c3508fd2dbc632d15294c891..ed888cdb031ae52afd2b6624378c7927a0366e4d 100644 (file)
@@ -5,6 +5,7 @@ namespace Wallabag\CoreBundle\Helper;
 use Psr\Log\LoggerInterface;
 use Symfony\Component\DomCrawler\Crawler;
 use GuzzleHttp\Client;
+use GuzzleHttp\Message\Response;
 use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;
 use Symfony\Component\Finder\Finder;
 
@@ -116,13 +117,11 @@ class DownloadImages
             return false;
         }
 
-        $ext = $this->mimeGuesser->guess($res->getHeader('content-type'));
-        $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);
-        if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
-            $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath);
-
+        $ext = $this->getExtensionFromResponse($res, $imagePath);
+        if (false === $res) {
             return false;
         }
+
         $hashImage = hash('crc32', $absolutePath);
         $localPath = $folderPath.'/'.$hashImage.'.'.$ext;
 
@@ -237,4 +236,45 @@ class DownloadImages
 
         return false;
     }
+
+    /**
+     * Retrieve and validate the extension from the response of the url of the image.
+     *
+     * @param Response $res       Guzzle Response
+     * @param string   $imagePath Path from the src image from the content (used for log only)
+     *
+     * @return string|false Extension name or false if validation failed
+     */
+    private function getExtensionFromResponse(Response $res, $imagePath)
+    {
+        $ext = $this->mimeGuesser->guess($res->getHeader('content-type'));
+        $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);
+
+        // ok header doesn't have the extension, try a different way
+        if (empty($ext)) {
+            $types = [
+                'jpeg' => "\xFF\xD8\xFF",
+                'gif' => 'GIF',
+                'png' => "\x89\x50\x4e\x47\x0d\x0a",
+            ];
+            $bytes = substr((string) $res->getBody(), 0, 8);
+
+            foreach ($types as $type => $header) {
+                if (0 === strpos($bytes, $header)) {
+                    $ext = $type;
+                    break;
+                }
+            }
+
+            $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]);
+        }
+
+        if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
+            $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath);
+
+            return false;
+        }
+
+        return $ext;
+    }
 }
index 9125f8dcb2da20fbeb0ebd61a67efbdba11f00dc..c02f96587cc663eab3caf0eefeb8af5cd387e5f4 100644 (file)
@@ -157,4 +157,29 @@ class DownloadImagesTest extends \PHPUnit_Framework_TestCase
 
         $this->assertFalse($res, 'Absolute image can not be determined, so it will not be replaced');
     }
+
+    public function testProcessRealImage()
+    {
+        $client = new Client();
+
+        $mock = new Mock([
+            new Response(200, ['content-type' => null], Stream::factory(file_get_contents(__DIR__.'/../fixtures/image-no-content-type.jpg'))),
+        ]);
+
+        $client->getEmitter()->attach($mock);
+
+        $logHandler = new TestHandler();
+        $logger = new Logger('test', array($logHandler));
+
+        $download = new DownloadImages($client, sys_get_temp_dir().'/wallabag_test', 'http://wallabag.io/', $logger);
+
+        $res = $download->processSingleImage(
+            123,
+            'https://cdn.theconversation.com/files/157200/article/width926/gsj2rjp2-1487348607.jpg',
+            'https://theconversation.com/conversation-avec-gerald-bronner-ce-nest-pas-la-post-verite-qui-nous-menace-mais-lextension-de-notre-credulite-73089'
+        );
+
+        $this->assertContains('http://wallabag.io/assets/images/9/b/9b0ead26/', $res, 'Content-Type was empty but data is ok for an image');
+        $this->assertContains('DownloadImages: Checking extension (alternative)', $logHandler->getRecords()[3]['message']);
+    }
 }
diff --git a/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg b/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg
new file mode 100644 (file)
index 0000000..0c60e95
Binary files /dev/null and b/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg differ