aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJeremy Benoist <jeremy.benoist@gmail.com>2017-06-05 22:54:02 +0200
committerJeremy Benoist <jeremy.benoist@gmail.com>2017-06-05 22:54:02 +0200
commit577c0b6dd82c421c377c37295c59dee147068132 (patch)
treee63b2d0a64b446934cd85cd89eb3611a6b386a46
parenta687c8d915276eee0c0494156700f7d0c0606735 (diff)
downloadwallabag-577c0b6dd82c421c377c37295c59dee147068132.tar.gz
wallabag-577c0b6dd82c421c377c37295c59dee147068132.tar.zst
wallabag-577c0b6dd82c421c377c37295c59dee147068132.zip
Use an alternative way to detect image
When parsing content to retrieve images to save locally, we only check for the content-type of the image response. In some case, that value is empty. Now we’re also checking for the first few bytes of the content as an alternative to detect if it’s an image wallabag can handle. We might get higher image supports using that alternative method.
-rw-r--r--src/Wallabag/CoreBundle/Helper/DownloadImages.php50
-rw-r--r--tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php25
-rw-r--r--tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpgbin0 -> 354067 bytes
3 files changed, 70 insertions, 5 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
index 54e23a05..ed888cdb 100644
--- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php
+++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
@@ -5,6 +5,7 @@ namespace Wallabag\CoreBundle\Helper;
5use Psr\Log\LoggerInterface; 5use Psr\Log\LoggerInterface;
6use Symfony\Component\DomCrawler\Crawler; 6use Symfony\Component\DomCrawler\Crawler;
7use GuzzleHttp\Client; 7use GuzzleHttp\Client;
8use GuzzleHttp\Message\Response;
8use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; 9use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;
9use Symfony\Component\Finder\Finder; 10use Symfony\Component\Finder\Finder;
10 11
@@ -116,13 +117,11 @@ class DownloadImages
116 return false; 117 return false;
117 } 118 }
118 119
119 $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); 120 $ext = $this->getExtensionFromResponse($res, $imagePath);
120 $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); 121 if (false === $res) {
121 if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
122 $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath);
123
124 return false; 122 return false;
125 } 123 }
124
126 $hashImage = hash('crc32', $absolutePath); 125 $hashImage = hash('crc32', $absolutePath);
127 $localPath = $folderPath.'/'.$hashImage.'.'.$ext; 126 $localPath = $folderPath.'/'.$hashImage.'.'.$ext;
128 127
@@ -237,4 +236,45 @@ class DownloadImages
237 236
238 return false; 237 return false;
239 } 238 }
239
240 /**
241 * Retrieve and validate the extension from the response of the url of the image.
242 *
243 * @param Response $res Guzzle Response
244 * @param string $imagePath Path from the src image from the content (used for log only)
245 *
246 * @return string|false Extension name or false if validation failed
247 */
248 private function getExtensionFromResponse(Response $res, $imagePath)
249 {
250 $ext = $this->mimeGuesser->guess($res->getHeader('content-type'));
251 $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);
252
253 // ok header doesn't have the extension, try a different way
254 if (empty($ext)) {
255 $types = [
256 'jpeg' => "\xFF\xD8\xFF",
257 'gif' => 'GIF',
258 'png' => "\x89\x50\x4e\x47\x0d\x0a",
259 ];
260 $bytes = substr((string) $res->getBody(), 0, 8);
261
262 foreach ($types as $type => $header) {
263 if (0 === strpos($bytes, $header)) {
264 $ext = $type;
265 break;
266 }
267 }
268
269 $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]);
270 }
271
272 if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
273 $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: '.$imagePath);
274
275 return false;
276 }
277
278 return $ext;
279 }
240} 280}
diff --git a/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php b/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php
index 9125f8dc..c02f9658 100644
--- a/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php
+++ b/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php
@@ -157,4 +157,29 @@ class DownloadImagesTest extends \PHPUnit_Framework_TestCase
157 157
158 $this->assertFalse($res, 'Absolute image can not be determined, so it will not be replaced'); 158 $this->assertFalse($res, 'Absolute image can not be determined, so it will not be replaced');
159 } 159 }
160
161 public function testProcessRealImage()
162 {
163 $client = new Client();
164
165 $mock = new Mock([
166 new Response(200, ['content-type' => null], Stream::factory(file_get_contents(__DIR__.'/../fixtures/image-no-content-type.jpg'))),
167 ]);
168
169 $client->getEmitter()->attach($mock);
170
171 $logHandler = new TestHandler();
172 $logger = new Logger('test', array($logHandler));
173
174 $download = new DownloadImages($client, sys_get_temp_dir().'/wallabag_test', 'http://wallabag.io/', $logger);
175
176 $res = $download->processSingleImage(
177 123,
178 'https://cdn.theconversation.com/files/157200/article/width926/gsj2rjp2-1487348607.jpg',
179 'https://theconversation.com/conversation-avec-gerald-bronner-ce-nest-pas-la-post-verite-qui-nous-menace-mais-lextension-de-notre-credulite-73089'
180 );
181
182 $this->assertContains('http://wallabag.io/assets/images/9/b/9b0ead26/', $res, 'Content-Type was empty but data is ok for an image');
183 $this->assertContains('DownloadImages: Checking extension (alternative)', $logHandler->getRecords()[3]['message']);
184 }
160} 185}
diff --git a/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg b/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg
new file mode 100644
index 00000000..0c60e952
--- /dev/null
+++ b/tests/Wallabag/CoreBundle/fixtures/image-no-content-type.jpg
Binary files differ