diff options
author | Jeremy Benoist <jeremy.benoist@gmail.com> | 2016-10-30 10:48:29 +0100 |
---|---|---|
committer | Jeremy Benoist <jeremy.benoist@gmail.com> | 2016-10-30 10:48:29 +0100 |
commit | 7f55941856549a3f5f45c42fdc171d66ff7ee297 (patch) | |
tree | 32292162726d6c1d708a29e7495725cf7a58b40f /src | |
parent | 45fd7e09d75995bd0b9a731ffd70054b7ae6ee1f (diff) | |
download | wallabag-7f55941856549a3f5f45c42fdc171d66ff7ee297.tar.gz wallabag-7f55941856549a3f5f45c42fdc171d66ff7ee297.tar.zst wallabag-7f55941856549a3f5f45c42fdc171d66ff7ee297.zip |
Use doctrine event to download images
Diffstat (limited to 'src')
4 files changed, 274 insertions, 128 deletions
diff --git a/src/Wallabag/CoreBundle/Event/Subscriber/DownloadImagesSubscriber.php b/src/Wallabag/CoreBundle/Event/Subscriber/DownloadImagesSubscriber.php new file mode 100644 index 00000000..654edf31 --- /dev/null +++ b/src/Wallabag/CoreBundle/Event/Subscriber/DownloadImagesSubscriber.php | |||
@@ -0,0 +1,129 @@ | |||
1 | <?php | ||
2 | |||
3 | namespace Wallabag\CoreBundle\Event\Subscriber; | ||
4 | |||
5 | use Doctrine\Common\EventSubscriber; | ||
6 | use Doctrine\ORM\Event\LifecycleEventArgs; | ||
7 | use Psr\Log\LoggerInterface; | ||
8 | use Wallabag\CoreBundle\Helper\DownloadImages; | ||
9 | use Wallabag\CoreBundle\Entity\Entry; | ||
10 | use Doctrine\ORM\EntityManager; | ||
11 | use Craue\ConfigBundle\Util\Config; | ||
12 | |||
/**
 * Doctrine subscriber that downloads the images referenced by an Entry
 * (inline images of the content and the preview picture) whenever an entry
 * is created or updated.
 */
class DownloadImagesSubscriber implements EventSubscriber
{
    private $configClass;
    private $downloadImages;
    private $logger;

    /**
     * We inject the config class name instead of the config service, otherwise it
     * generates a circular reference with the EntityManager.
     * The config object is built on demand, once an event hands us the
     * EntityManager (see buildConfig()).
     *
     * @param DownloadImages  $downloadImages Helper that fetches and stores images
     * @param string          $configClass    Class name of the config service to instantiate
     * @param LoggerInterface $logger
     */
    public function __construct(DownloadImages $downloadImages, $configClass, LoggerInterface $logger)
    {
        $this->downloadImages = $downloadImages;
        $this->configClass = $configClass;
        $this->logger = $logger;
    }

    /**
     * @return string[] Names of the Doctrine events this subscriber listens to
     */
    public function getSubscribedEvents()
    {
        return [
            'prePersist',
            'preUpdate',
        ];
    }

    /**
     * When an existing entry is updated.
     * Only the fields reported as changed are processed again.
     *
     * NOTE(review): hasChangedField() / setNewValue() are expected to come from
     * Doctrine's PreUpdateEventArgs, which is what Doctrine passes for preUpdate.
     *
     * @param LifecycleEventArgs $args
     */
    public function preUpdate(LifecycleEventArgs $args)
    {
        $entity = $args->getEntity();

        if (!$entity instanceof Entry) {
            return;
        }

        // The download methods expect a Config object, not the EntityManager itself.
        $config = $this->buildConfig($args->getEntityManager());

        // field content has been updated
        if ($args->hasChangedField('content')) {
            $html = $this->downloadImages($config, $entity);

            if ($this->hasResult($html)) {
                $args->setNewValue('content', $html);
            }
        }

        // field preview picture has been updated
        if ($args->hasChangedField('previewPicture')) {
            $previewPicture = $this->downloadPreviewImage($config, $entity);

            if ($this->hasResult($previewPicture)) {
                $entity->setPreviewPicture($previewPicture);
            }
        }
    }

    /**
     * When a new entry is saved.
     *
     * @param LifecycleEventArgs $args
     */
    public function prePersist(LifecycleEventArgs $args)
    {
        $entity = $args->getEntity();

        if (!$entity instanceof Entry) {
            return;
        }

        $config = $this->buildConfig($args->getEntityManager());

        // update all images inside the html
        $html = $this->downloadImages($config, $entity);
        if ($this->hasResult($html)) {
            $entity->setContent($html);
        }

        // update preview picture
        $previewPicture = $this->downloadPreviewImage($config, $entity);
        if ($this->hasResult($previewPicture)) {
            $entity->setPreviewPicture($previewPicture);
        }
    }

    /**
     * Download every image found in the entry content.
     *
     * $config is not read yet: it is reserved for choosing an asynchronous
     * backend (RabbitMQ / Redis) instead of downloading inline.
     *
     * @param Config $config
     * @param Entry  $entry
     *
     * @return string Result of DownloadImages::processHtml() for the entry content
     */
    public function downloadImages(Config $config, Entry $entry)
    {
        return $this->downloadImages->processHtml(
            $entry->getContent(),
            $entry->getUrl()
        );
    }

    /**
     * Download the preview picture of the entry.
     *
     * $config is not read yet: it is reserved for choosing an asynchronous
     * backend (RabbitMQ / Redis) instead of downloading inline.
     *
     * @param Config $config
     * @param Entry  $entry
     *
     * @return string|false Result of DownloadImages::processSingleImage(), false on failure
     */
    public function downloadPreviewImage(Config $config, Entry $entry)
    {
        return $this->downloadImages->processSingleImage(
            $entry->getPreviewPicture(),
            $entry->getUrl()
        );
    }

    /**
     * Instantiate the config service by hand and give it the EntityManager
     * received from the event (avoids the circular reference described on the constructor).
     *
     * @param EntityManager $em
     *
     * @return Config
     */
    private function buildConfig($em)
    {
        $config = new $this->configClass();
        $config->setEntityManager($em);

        return $config;
    }

    /**
     * Tell whether a download helper produced a usable value.
     * The helper signals failure with false, so both null and false are rejected
     * to avoid overwriting an entry field with a failure marker.
     *
     * @param mixed $value
     *
     * @return bool
     */
    private function hasResult($value)
    {
        return null !== $value && false !== $value;
    }
}
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 219b90d3..d90d3dc8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -75,12 +75,6 @@ class ContentProxy | |||
75 | $entry->setDomainName($domainName); | 75 | $entry->setDomainName($domainName); |
76 | } | 76 | } |
77 | 77 | ||
78 | if (true) { | ||
79 | $this->logger->log('debug', 'Starting to download images'); | ||
80 | $downloadImages = new DownloadImages($html, $url, $this->logger); | ||
81 | $html = $downloadImages->process(); | ||
82 | } | ||
83 | |||
84 | $entry->setContent($html); | 78 | $entry->setContent($html); |
85 | 79 | ||
86 | if (isset($content['open_graph']['og_image'])) { | 80 | if (isset($content['open_graph']['og_image'])) { |
diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index e23e0c55..426cbe48 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php | |||
@@ -2,193 +2,197 @@ | |||
2 | 2 | ||
3 | namespace Wallabag\CoreBundle\Helper; | 3 | namespace Wallabag\CoreBundle\Helper; |
4 | 4 | ||
5 | use Psr\Log\LoggerInterface as Logger; | 5 | use Psr\Log\LoggerInterface; |
6 | use Symfony\Component\DomCrawler\Crawler; | 6 | use Symfony\Component\DomCrawler\Crawler; |
7 | 7 | use GuzzleHttp\Client; | |
8 | define('REGENERATE_PICTURES_QUALITY', 75); | 8 | use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; |
9 | define('HTTP_PORT', 80); | ||
10 | define('SSL_PORT', 443); | ||
11 | define('BASE_URL', ''); | ||
12 | 9 | ||
13 | class DownloadImages | 10 | class DownloadImages |
14 | { | 11 | { |
15 | private $folder; | 12 | const REGENERATE_PICTURES_QUALITY = 80; |
16 | private $url; | 13 | |
17 | private $html; | 14 | private $client; |
18 | private $fileName; | 15 | private $baseFolder; |
19 | private $logger; | 16 | private $logger; |
17 | private $mimeGuesser; | ||
20 | 18 | ||
21 | public function __construct($html, $url, Logger $logger) | 19 | public function __construct(Client $client, $baseFolder, LoggerInterface $logger) |
22 | { | 20 | { |
23 | $this->html = $html; | 21 | $this->client = $client; |
24 | $this->url = $url; | 22 | $this->baseFolder = $baseFolder; |
25 | $this->setFolder(); | ||
26 | $this->logger = $logger; | 23 | $this->logger = $logger; |
24 | $this->mimeGuesser = new MimeTypeExtensionGuesser(); | ||
25 | |||
26 | $this->setFolder(); | ||
27 | } | 27 | } |
28 | 28 | ||
29 | public function setFolder($folder = 'assets/images') | 29 | /** |
30 | * Setup base folder where all images are going to be saved. | ||
31 | */ | ||
32 | private function setFolder() | ||
30 | { | 33 | { |
31 | // if folder doesn't exist, attempt to create one and store the folder name in property $folder | 34 | // if folder doesn't exist, attempt to create one and store the folder name in property $folder |
32 | if (!file_exists($folder)) { | 35 | if (!file_exists($this->baseFolder)) { |
33 | mkdir($folder); | 36 | mkdir($this->baseFolder, 0777, true); |
34 | } | 37 | } |
35 | $this->folder = $folder; | ||
36 | } | 38 | } |
37 | 39 | ||
38 | public function process() | 40 | /** |
41 | * Process the html and extract image from it, save them to local and return the updated html. | ||
42 | * | ||
43 | * @param string $html | ||
44 | * @param string $url Used as a base path for relative image and folder | ||
45 | * | ||
46 | * @return string | ||
47 | */ | ||
48 | public function processHtml($html, $url) | ||
39 | { | 49 | { |
40 | //instantiate the symfony DomCrawler Component | 50 | $crawler = new Crawler($html); |
41 | $crawler = new Crawler($this->html); | ||
42 | // create an array of all scrapped image links | ||
43 | $this->logger->log('debug', 'Finding images inside document'); | ||
44 | $result = $crawler | 51 | $result = $crawler |
45 | ->filterXpath('//img') | 52 | ->filterXpath('//img') |
46 | ->extract(array('src')); | 53 | ->extract(array('src')); |
47 | 54 | ||
55 | $relativePath = $this->getRelativePath($url); | ||
56 | |||
48 | // download and save the image to the folder | 57 | // download and save the image to the folder |
49 | foreach ($result as $image) { | 58 | foreach ($result as $image) { |
50 | $file = file_get_contents($image); | 59 | $imagePath = $this->processSingleImage($image, $url, $relativePath); |
51 | 60 | ||
52 | // Checks | 61 | if (false === $imagePath) { |
53 | $absolute_path = self::getAbsoluteLink($image, $this->url); | 62 | continue; |
54 | $filename = basename(parse_url($absolute_path, PHP_URL_PATH)); | 63 | } |
55 | $fullpath = $this->folder.'/'.$filename; | 64 | |
56 | self::checks($file, $fullpath, $absolute_path); | 65 | $html = str_replace($image, $imagePath, $html); |
57 | $this->html = str_replace($image, self::getPocheUrl().'/'.$fullpath, $this->html); | ||
58 | } | 66 | } |
59 | 67 | ||
60 | return $this->html; | 68 | return $html; |
61 | } | 69 | } |
62 | 70 | ||
63 | private function checks($rawdata, $fullpath, $absolute_path) | 71 | /** |
72 | * Process a single image: | ||
73 | * - retrieve it | ||
74 | * - re-saved it (for security reason) | ||
75 | * - return the new local path. | ||
76 | * | ||
77 | * @param string $imagePath Path to the image to retrieve | ||
78 | * @param string $url Url from where the image were found | ||
79 | * @param string $relativePath Relative local path to saved the image | ||
80 | * | ||
81 | * @return string Relative url to access the image from the web | ||
82 | */ | ||
83 | public function processSingleImage($imagePath, $url, $relativePath = null) | ||
64 | { | 84 | { |
65 | $fullpath = urldecode($fullpath); | 85 | if (null == $relativePath) { |
66 | 86 | $relativePath = $this->getRelativePath($url); | |
67 | if (file_exists($fullpath)) { | ||
68 | unlink($fullpath); | ||
69 | } | 87 | } |
70 | 88 | ||
71 | // check extension | 89 | $folderPath = $this->baseFolder.'/'.$relativePath; |
72 | $this->logger->log('debug', 'Checking extension'); | ||
73 | 90 | ||
74 | $file_ext = strrchr($fullpath, '.'); | 91 | // build image path |
75 | $whitelist = array('.jpg', '.jpeg', '.gif', '.png'); | 92 | $absolutePath = $this->getAbsoluteLink($url, $imagePath); |
76 | if (!(in_array($file_ext, $whitelist))) { | 93 | if (false === $absolutePath) { |
77 | $this->logger->log('debug', 'processed image with not allowed extension. Skipping '.$fullpath); | 94 | $this->logger->log('debug', 'Can not determine the absolute path for that image, skipping.'); |
78 | 95 | ||
79 | return false; | 96 | return false; |
80 | } | 97 | } |
81 | 98 | ||
82 | // check headers | 99 | $res = $this->client->get( |
83 | $this->logger->log('debug', 'Checking headers'); | 100 | $absolutePath, |
84 | $imageinfo = getimagesize($absolute_path); | 101 | ['exceptions' => false] |
85 | if ($imageinfo['mime'] != 'image/gif' && $imageinfo['mime'] != 'image/jpeg' && $imageinfo['mime'] != 'image/jpg' && $imageinfo['mime'] != 'image/png') { | 102 | ); |
86 | $this->logger->log('debug', 'processed image with bad header. Skipping '.$fullpath); | 103 | |
104 | $ext = $this->mimeGuesser->guess($res->getHeader('content-type')); | ||
105 | $this->logger->log('debug', 'Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]); | ||
106 | if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'])) { | ||
107 | $this->logger->log('debug', 'Processed image with not allowed extension. Skipping '.$imagePath); | ||
87 | 108 | ||
88 | return false; | 109 | return false; |
89 | } | 110 | } |
111 | $hashImage = hash('crc32', $absolutePath); | ||
112 | $localPath = $folderPath.'/'.$hashImage.'.'.$ext; | ||
113 | |||
114 | try { | ||
115 | $im = imagecreatefromstring($res->getBody()); | ||
116 | } catch (\Exception $e) { | ||
117 | $im = false; | ||
118 | } | ||
90 | 119 | ||
91 | // regenerate image | ||
92 | $this->logger->log('debug', 'regenerating image'); | ||
93 | $im = imagecreatefromstring($rawdata); | ||
94 | if ($im === false) { | 120 | if ($im === false) { |
95 | $this->logger->log('error', 'error while regenerating image '.$fullpath); | 121 | $this->logger->log('error', 'Error while regenerating image', ['path' => $localPath]); |
96 | 122 | ||
97 | return false; | 123 | return false; |
98 | } | 124 | } |
99 | 125 | ||
100 | switch ($imageinfo['mime']) { | 126 | switch ($ext) { |
101 | case 'image/gif': | 127 | case 'gif': |
102 | $result = imagegif($im, $fullpath); | 128 | $result = imagegif($im, $localPath); |
103 | $this->logger->log('debug', 'Re-creating gif'); | 129 | $this->logger->log('debug', 'Re-creating gif'); |
104 | break; | 130 | break; |
105 | case 'image/jpeg': | 131 | case 'jpeg': |
106 | case 'image/jpg': | 132 | case 'jpg': |
107 | $result = imagejpeg($im, $fullpath, REGENERATE_PICTURES_QUALITY); | 133 | $result = imagejpeg($im, $localPath, self::REGENERATE_PICTURES_QUALITY); |
108 | $this->logger->log('debug', 'Re-creating jpg'); | 134 | $this->logger->log('debug', 'Re-creating jpg'); |
109 | break; | 135 | break; |
110 | case 'image/png': | 136 | case 'png': |
137 | $result = imagepng($im, $localPath, ceil(self::REGENERATE_PICTURES_QUALITY / 100 * 9)); | ||
111 | $this->logger->log('debug', 'Re-creating png'); | 138 | $this->logger->log('debug', 'Re-creating png'); |
112 | $result = imagepng($im, $fullpath, ceil(REGENERATE_PICTURES_QUALITY / 100 * 9)); | ||
113 | break; | ||
114 | } | 139 | } |
140 | |||
115 | imagedestroy($im); | 141 | imagedestroy($im); |
116 | 142 | ||
117 | return $result; | 143 | return '/assets/images/'.$relativePath.'/'.$hashImage.'.'.$ext; |
118 | } | 144 | } |
119 | 145 | ||
120 | private static function getAbsoluteLink($relativeLink, $url) | 146 | /** |
147 | * Generate the folder where we are going to save images based on the entry url. | ||
148 | * | ||
149 | * @param string $url | ||
150 | * | ||
151 | * @return string | ||
152 | */ | ||
153 | private function getRelativePath($url) | ||
121 | { | 154 | { |
122 | /* return if already absolute URL */ | 155 | $hashUrl = hash('crc32', $url); |
123 | if (parse_url($relativeLink, PHP_URL_SCHEME) != '') { | 156 | $relativePath = $hashUrl[0].'/'.$hashUrl[1].'/'.$hashUrl; |
124 | return $relativeLink; | 157 | $folderPath = $this->baseFolder.'/'.$relativePath; |
125 | } | ||
126 | 158 | ||
127 | /* queries and anchors */ | 159 | if (!file_exists($folderPath)) { |
128 | if ($relativeLink[0] == '#' || $relativeLink[0] == '?') { | 160 | mkdir($folderPath, 0777, true); |
129 | return $url.$relativeLink; | ||
130 | } | 161 | } |
131 | 162 | ||
132 | /* parse base URL and convert to local variables: | 163 | $this->logger->log('debug', 'Folder used for that url', ['folder' => $folderPath, 'url' => $url]); |
133 | $scheme, $host, $path */ | ||
134 | extract(parse_url($url)); | ||
135 | 164 | ||
136 | /* remove non-directory element from path */ | 165 | return $relativePath; |
137 | $path = preg_replace('#/[^/]*$#', '', $path); | 166 | } |
138 | 167 | ||
139 | /* destroy path if relative url points to root */ | 168 | /** |
140 | if ($relativeLink[0] == '/') { | 169 | * Make an $url absolute based on the $base. |
141 | $path = ''; | 170 | * |
171 | * @see Graby->makeAbsoluteStr | ||
172 | * | ||
173 | * @param string $base Base url | ||
174 | * @param string $url Url to make it absolute | ||
175 | * | ||
176 | * @return false|string | ||
177 | */ | ||
178 | private function getAbsoluteLink($base, $url) | ||
179 | { | ||
180 | if (preg_match('!^https?://!i', $url)) { | ||
181 | // already absolute | ||
182 | return $url; | ||
142 | } | 183 | } |
143 | 184 | ||
144 | /* dirty absolute URL */ | 185 | $base = new \SimplePie_IRI($base); |
145 | $abs = $host.$path.'/'.$relativeLink; | ||
146 | 186 | ||
147 | /* replace '//' or '/./' or '/foo/../' with '/' */ | 187 | // remove '//' in URL path (causes URLs not to resolve properly) |
148 | $re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#'); | 188 | if (isset($base->ipath)) { |
149 | for ($n = 1; $n > 0; $abs = preg_replace($re, '/', $abs, -1, $n)) { | 189 | $base->ipath = preg_replace('!//+!', '/', $base->ipath); |
150 | } | 190 | } |
151 | 191 | ||
152 | /* absolute URL is ready! */ | 192 | if ($absolute = \SimplePie_IRI::absolutize($base, $url)) { |
153 | return $scheme.'://'.$abs; | 193 | return $absolute->get_uri(); |
154 | } | ||
155 | |||
156 | public static function getPocheUrl() | ||
157 | { | ||
158 | $baseUrl = ''; | ||
159 | $https = (!empty($_SERVER['HTTPS']) | ||
160 | && (strtolower($_SERVER['HTTPS']) == 'on')) | ||
161 | || (isset($_SERVER['SERVER_PORT']) | ||
162 | && $_SERVER['SERVER_PORT'] == '443') // HTTPS detection. | ||
163 | || (isset($_SERVER['SERVER_PORT']) //Custom HTTPS port detection | ||
164 | && $_SERVER['SERVER_PORT'] == SSL_PORT) | ||
165 | || (isset($_SERVER['HTTP_X_FORWARDED_PROTO']) | ||
166 | && $_SERVER['HTTP_X_FORWARDED_PROTO'] == 'https'); | ||
167 | $serverport = (!isset($_SERVER['SERVER_PORT']) | ||
168 | || $_SERVER['SERVER_PORT'] == '80' | ||
169 | || $_SERVER['SERVER_PORT'] == HTTP_PORT | ||
170 | || ($https && $_SERVER['SERVER_PORT'] == '443') | ||
171 | || ($https && $_SERVER['SERVER_PORT'] == SSL_PORT) //Custom HTTPS port detection | ||
172 | ? '' : ':'.$_SERVER['SERVER_PORT']); | ||
173 | |||
174 | if (isset($_SERVER['HTTP_X_FORWARDED_PORT'])) { | ||
175 | $serverport = ':'.$_SERVER['HTTP_X_FORWARDED_PORT']; | ||
176 | } | ||
177 | // $scriptname = str_replace('/index.php', '/', $_SERVER["SCRIPT_NAME"]); | ||
178 | // if (!isset($_SERVER["HTTP_HOST"])) { | ||
179 | // return $scriptname; | ||
180 | // } | ||
181 | $host = (isset($_SERVER['HTTP_X_FORWARDED_HOST']) ? $_SERVER['HTTP_X_FORWARDED_HOST'] : (isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : $_SERVER['SERVER_NAME'])); | ||
182 | if (strpos($host, ':') !== false) { | ||
183 | $serverport = ''; | ||
184 | } | ||
185 | // check if BASE_URL is configured | ||
186 | if (BASE_URL) { | ||
187 | $baseUrl = BASE_URL; | ||
188 | } else { | ||
189 | $baseUrl = 'http'.($https ? 's' : '').'://'.$host.$serverport; | ||
190 | } | 194 | } |
191 | 195 | ||
192 | return $baseUrl; | 196 | return false; |
193 | } | 197 | } |
194 | } | 198 | } |
diff --git a/src/Wallabag/CoreBundle/Resources/config/services.yml b/src/Wallabag/CoreBundle/Resources/config/services.yml index 4b7751fe..1fb81a46 100644 --- a/src/Wallabag/CoreBundle/Resources/config/services.yml +++ b/src/Wallabag/CoreBundle/Resources/config/services.yml | |||
@@ -136,3 +136,22 @@ services: | |||
136 | - "@doctrine" | 136 | - "@doctrine" |
137 | tags: | 137 | tags: |
138 | - { name: doctrine.event_subscriber } | 138 | - { name: doctrine.event_subscriber } |
139 | |||
140 | wallabag_core.subscriber.download_images: | ||
141 | class: Wallabag\CoreBundle\Event\Subscriber\DownloadImagesSubscriber | ||
142 | arguments: | ||
143 | - "@wallabag_core.entry.download_images" | ||
144 | - "%craue_config.config.class%" | ||
145 | - "@logger" | ||
146 | tags: | ||
147 | - { name: doctrine.event_subscriber } | ||
148 | |||
149 | wallabag_core.entry.download_images: | ||
150 | class: Wallabag\CoreBundle\Helper\DownloadImages | ||
151 | arguments: | ||
152 | - "@wallabag_core.entry.download_images.client" | ||
153 | - "%kernel.root_dir%/../web/assets/images" | ||
154 | - "@logger" | ||
155 | |||
156 | wallabag_core.entry.download_images.client: | ||
157 | class: GuzzleHttp\Client | ||