diff options
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/DownloadImages.php')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/DownloadImages.php | 93 |
1 files changed, 61 insertions, 32 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index cc3dcfce..1d98fd1a 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php | |||
@@ -2,8 +2,15 @@ | |||
2 | 2 | ||
3 | namespace Wallabag\CoreBundle\Helper; | 3 | namespace Wallabag\CoreBundle\Helper; |
4 | 4 | ||
5 | use GuzzleHttp\Client; | 5 | use GuzzleHttp\Psr7\Uri; |
6 | use GuzzleHttp\Message\Response; | 6 | use GuzzleHttp\Psr7\UriResolver; |
7 | use Http\Client\Common\HttpMethodsClient; | ||
8 | use Http\Client\Common\Plugin\ErrorPlugin; | ||
9 | use Http\Client\Common\PluginClient; | ||
10 | use Http\Client\HttpClient; | ||
11 | use Http\Discovery\MessageFactoryDiscovery; | ||
12 | use Http\Message\MessageFactory; | ||
13 | use Psr\Http\Message\ResponseInterface; | ||
7 | use Psr\Log\LoggerInterface; | 14 | use Psr\Log\LoggerInterface; |
8 | use Symfony\Component\DomCrawler\Crawler; | 15 | use Symfony\Component\DomCrawler\Crawler; |
9 | use Symfony\Component\Finder\Finder; | 16 | use Symfony\Component\Finder\Finder; |
@@ -19,9 +26,9 @@ class DownloadImages | |||
19 | private $mimeGuesser; | 26 | private $mimeGuesser; |
20 | private $wallabagUrl; | 27 | private $wallabagUrl; |
21 | 28 | ||
22 | public function __construct(Client $client, $baseFolder, $wallabagUrl, LoggerInterface $logger) | 29 | public function __construct(HttpClient $client, $baseFolder, $wallabagUrl, LoggerInterface $logger, MessageFactory $messageFactory = null) |
23 | { | 30 | { |
24 | $this->client = $client; | 31 | $this->client = new HttpMethodsClient(new PluginClient($client, [new ErrorPlugin()]), $messageFactory ?: MessageFactoryDiscovery::find()); |
25 | $this->baseFolder = $baseFolder; | 32 | $this->baseFolder = $baseFolder; |
26 | $this->wallabagUrl = rtrim($wallabagUrl, '/'); | 33 | $this->wallabagUrl = rtrim($wallabagUrl, '/'); |
27 | $this->logger = $logger; | 34 | $this->logger = $logger; |
@@ -31,6 +38,23 @@ class DownloadImages | |||
31 | } | 38 | } |
32 | 39 | ||
33 | /** | 40 | /** |
41 | * Process the html and extract images URLs from it. | ||
42 | * | ||
43 | * @param string $html | ||
44 | * | ||
45 | * @return string[] | ||
46 | */ | ||
47 | public static function extractImagesUrlsFromHtml($html) | ||
48 | { | ||
49 | $crawler = new Crawler($html); | ||
50 | $imagesCrawler = $crawler->filterXpath('//img'); | ||
51 | $imagesUrls = $imagesCrawler->extract(['src']); | ||
52 | $imagesSrcsetUrls = self::getSrcsetUrls($imagesCrawler); | ||
53 | |||
54 | return array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); | ||
55 | } | ||
56 | |||
57 | /** | ||
34 | * Process the html and extract image from it, save them to local and return the updated html. | 58 | * Process the html and extract image from it, save them to local and return the updated html. |
35 | * | 59 | * |
36 | * @param int $entryId ID of the entry | 60 | * @param int $entryId ID of the entry |
@@ -41,13 +65,7 @@ class DownloadImages | |||
41 | */ | 65 | */ |
42 | public function processHtml($entryId, $html, $url) | 66 | public function processHtml($entryId, $html, $url) |
43 | { | 67 | { |
44 | $crawler = new Crawler($html); | 68 | $imagesUrls = self::extractImagesUrlsFromHtml($html); |
45 | $imagesCrawler = $crawler | ||
46 | ->filterXpath('//img'); | ||
47 | $imagesUrls = $imagesCrawler | ||
48 | ->extract(['src']); | ||
49 | $imagesSrcsetUrls = $this->getSrcsetUrls($imagesCrawler); | ||
50 | $imagesUrls = array_unique(array_merge($imagesUrls, $imagesSrcsetUrls)); | ||
51 | 69 | ||
52 | $relativePath = $this->getRelativePath($entryId); | 70 | $relativePath = $this->getRelativePath($entryId); |
53 | 71 | ||
@@ -122,7 +140,7 @@ class DownloadImages | |||
122 | $localPath = $folderPath . '/' . $hashImage . '.' . $ext; | 140 | $localPath = $folderPath . '/' . $hashImage . '.' . $ext; |
123 | 141 | ||
124 | try { | 142 | try { |
125 | $im = imagecreatefromstring($res->getBody()); | 143 | $im = imagecreatefromstring((string) $res->getBody()); |
126 | } catch (\Exception $e) { | 144 | } catch (\Exception $e) { |
127 | $im = false; | 145 | $im = false; |
128 | } | 146 | } |
@@ -135,7 +153,21 @@ class DownloadImages | |||
135 | 153 | ||
136 | switch ($ext) { | 154 | switch ($ext) { |
137 | case 'gif': | 155 | case 'gif': |
138 | imagegif($im, $localPath); | 156 | // use Imagick if available to keep GIF animation |
157 | if (class_exists('\\Imagick')) { | ||
158 | try { | ||
159 | $imagick = new \Imagick(); | ||
160 | $imagick->readImageBlob($res->getBody()); | ||
161 | $imagick->setImageFormat('gif'); | ||
162 | $imagick->writeImages($localPath, true); | ||
163 | } catch (\Exception $e) { | ||
164 | // if Imagick fail, fallback to the default solution | ||
165 | imagegif($im, $localPath); | ||
166 | } | ||
167 | } else { | ||
168 | imagegif($im, $localPath); | ||
169 | } | ||
170 | |||
139 | $this->logger->debug('DownloadImages: Re-creating gif'); | 171 | $this->logger->debug('DownloadImages: Re-creating gif'); |
140 | break; | 172 | break; |
141 | case 'jpeg': | 173 | case 'jpeg': |
@@ -181,29 +213,30 @@ class DownloadImages | |||
181 | /** | 213 | /** |
182 | * Get images urls from the srcset image attribute. | 214 | * Get images urls from the srcset image attribute. |
183 | * | 215 | * |
184 | * @param Crawler $imagesCrawler | ||
185 | * | ||
186 | * @return array An array of urls | 216 | * @return array An array of urls |
187 | */ | 217 | */ |
188 | private function getSrcsetUrls(Crawler $imagesCrawler) | 218 | private static function getSrcsetUrls(Crawler $imagesCrawler) |
189 | { | 219 | { |
190 | $urls = []; | 220 | $urls = []; |
191 | $iterator = $imagesCrawler | 221 | $iterator = $imagesCrawler->getIterator(); |
192 | ->getIterator(); | 222 | |
193 | while ($iterator->valid()) { | 223 | while ($iterator->valid()) { |
194 | $srcsetAttribute = $iterator->current()->getAttribute('srcset'); | 224 | $srcsetAttribute = $iterator->current()->getAttribute('srcset'); |
225 | |||
195 | if ('' !== $srcsetAttribute) { | 226 | if ('' !== $srcsetAttribute) { |
196 | // Couldn't start with " OR ' OR a white space | 227 | // Couldn't start with " OR ' OR a white space |
197 | // Could be one or more white space | 228 | // Could be one or more white space |
198 | // Must be one or more digits followed by w OR x | 229 | // Must be one or more digits followed by w OR x |
199 | $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; | 230 | $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; |
200 | preg_match_all($pattern, $srcsetAttribute, $matches); | 231 | preg_match_all($pattern, $srcsetAttribute, $matches); |
232 | |||
201 | $srcset = \call_user_func_array('array_merge', $matches); | 233 | $srcset = \call_user_func_array('array_merge', $matches); |
202 | $srcsetUrls = array_map(function ($src) { | 234 | $srcsetUrls = array_map(function ($src) { |
203 | return trim(explode(' ', $src, 2)[0]); | 235 | return trim(explode(' ', $src, 2)[0]); |
204 | }, $srcset); | 236 | }, $srcset); |
205 | $urls = array_merge($srcsetUrls, $urls); | 237 | $urls = array_merge($srcsetUrls, $urls); |
206 | } | 238 | } |
239 | |||
207 | $iterator->next(); | 240 | $iterator->next(); |
208 | } | 241 | } |
209 | 242 | ||
@@ -260,33 +293,29 @@ class DownloadImages | |||
260 | return $url; | 293 | return $url; |
261 | } | 294 | } |
262 | 295 | ||
263 | $base = new \SimplePie_IRI($base); | 296 | $base = new Uri($base); |
264 | 297 | ||
265 | // remove '//' in URL path (causes URLs not to resolve properly) | 298 | // in case the url has no scheme & host |
266 | if (isset($base->ipath)) { | 299 | if ('' === $base->getAuthority() || '' === $base->getScheme()) { |
267 | $base->ipath = preg_replace('!//+!', '/', $base->ipath); | 300 | $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); |
268 | } | ||
269 | 301 | ||
270 | if ($absolute = \SimplePie_IRI::absolutize($base, $url)) { | 302 | return false; |
271 | return $absolute->get_uri(); | ||
272 | } | 303 | } |
273 | 304 | ||
274 | $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]); | 305 | return (string) UriResolver::resolve($base, new Uri($url)); |
275 | |||
276 | return false; | ||
277 | } | 306 | } |
278 | 307 | ||
279 | /** | 308 | /** |
280 | * Retrieve and validate the extension from the response of the url of the image. | 309 | * Retrieve and validate the extension from the response of the url of the image. |
281 | * | 310 | * |
282 | * @param Response $res Guzzle Response | 311 | * @param ResponseInterface $res Http Response |
283 | * @param string $imagePath Path from the src image from the content (used for log only) | 312 | * @param string $imagePath Path from the src image from the content (used for log only) |
284 | * | 313 | * |
285 | * @return string|false Extension name or false if validation failed | 314 | * @return string|false Extension name or false if validation failed |
286 | */ | 315 | */ |
287 | private function getExtensionFromResponse(Response $res, $imagePath) | 316 | private function getExtensionFromResponse(ResponseInterface $res, $imagePath) |
288 | { | 317 | { |
289 | $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); | 318 | $ext = $this->mimeGuesser->guess(current($res->getHeader('content-type'))); |
290 | $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); | 319 | $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); |
291 | 320 | ||
292 | // ok header doesn't have the extension, try a different way | 321 | // ok header doesn't have the extension, try a different way |