aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorJeremy Benoist <jeremy.benoist@gmail.com>2016-10-30 10:48:29 +0100
committerJeremy Benoist <jeremy.benoist@gmail.com>2016-10-30 10:48:29 +0100
commit7f55941856549a3f5f45c42fdc171d66ff7ee297 (patch)
tree32292162726d6c1d708a29e7495725cf7a58b40f /src
parent45fd7e09d75995bd0b9a731ffd70054b7ae6ee1f (diff)
downloadwallabag-7f55941856549a3f5f45c42fdc171d66ff7ee297.tar.gz
wallabag-7f55941856549a3f5f45c42fdc171d66ff7ee297.tar.zst
wallabag-7f55941856549a3f5f45c42fdc171d66ff7ee297.zip
Use doctrine event to download images
Diffstat (limited to 'src')
-rw-r--r--src/Wallabag/CoreBundle/Event/Subscriber/DownloadImagesSubscriber.php129
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php6
-rw-r--r--src/Wallabag/CoreBundle/Helper/DownloadImages.php248
-rw-r--r--src/Wallabag/CoreBundle/Resources/config/services.yml19
4 files changed, 274 insertions, 128 deletions
diff --git a/src/Wallabag/CoreBundle/Event/Subscriber/DownloadImagesSubscriber.php b/src/Wallabag/CoreBundle/Event/Subscriber/DownloadImagesSubscriber.php
new file mode 100644
index 00000000..654edf31
--- /dev/null
+++ b/src/Wallabag/CoreBundle/Event/Subscriber/DownloadImagesSubscriber.php
@@ -0,0 +1,129 @@
1<?php
2
3namespace Wallabag\CoreBundle\Event\Subscriber;
4
5use Doctrine\Common\EventSubscriber;
6use Doctrine\ORM\Event\LifecycleEventArgs;
7use Psr\Log\LoggerInterface;
8use Wallabag\CoreBundle\Helper\DownloadImages;
9use Wallabag\CoreBundle\Entity\Entry;
10use Doctrine\ORM\EntityManager;
11use Craue\ConfigBundle\Util\Config;
12
class DownloadImagesSubscriber implements EventSubscriber
{
    private $configClass;
    private $downloadImages;
    private $logger;

    /**
     * The Config *class name* is injected instead of the Config service, because injecting
     * the service generates a circular reference with the EntityManager. The Config object
     * is therefore built on demand, once an event gives us the EntityManager
     * (see buildConfig()).
     *
     * @param DownloadImages  $downloadImages Helper that fetches images and stores them locally
     * @param string          $configClass    Fully qualified class name of the Config implementation
     * @param LoggerInterface $logger
     */
    public function __construct(DownloadImages $downloadImages, $configClass, LoggerInterface $logger)
    {
        $this->downloadImages = $downloadImages;
        $this->configClass = $configClass;
        $this->logger = $logger;
    }

    /**
     * Doctrine events this subscriber reacts to.
     *
     * @return array
     */
    public function getSubscribedEvents()
    {
        return array(
            'prePersist',
            'preUpdate',
        );
    }

    /**
     * When an existing entry is updated.
     *
     * Images are only (re)downloaded for the fields that actually changed, so an update
     * that touches neither the content nor the preview picture does no work.
     *
     * Doctrine dispatches this event with a PreUpdateEventArgs (a LifecycleEventArgs
     * subclass); that is where hasChangedField() and setNewValue() come from.
     *
     * @param LifecycleEventArgs $args
     */
    public function preUpdate(LifecycleEventArgs $args)
    {
        $entity = $args->getEntity();

        if (!$entity instanceof Entry) {
            return;
        }

        $contentChanged = $args->hasChangedField('content');
        $previewPictureChanged = $args->hasChangedField('previewPicture');

        if (!$contentChanged && !$previewPictureChanged) {
            return;
        }

        // The download methods are type-hinted with Config: handing them the EntityManager
        // directly (as this handler used to do) fails the type check at call time.
        $config = $this->buildConfig($args->getEntityManager());

        if ($contentChanged) {
            $html = $this->downloadImages($config, $entity);

            // only overwrite the pending value when we really got html back
            if (is_string($html)) {
                $args->setNewValue('content', $html);
            }
        }

        if ($previewPictureChanged) {
            $previewPicture = $this->downloadPreviewImage($config, $entity);

            // processSingleImage() reports failure with `false`, which must never be
            // stored as the preview picture
            // NOTE(review): the value is set on the entity while content goes through the
            // change set; confirm this is persisted during preUpdate.
            if (is_string($previewPicture)) {
                $entity->setPreviewPicture($previewPicture);
            }
        }
    }

    /**
     * When a new entry is saved.
     *
     * Both the html content and the preview picture are processed; each one is only
     * replaced on the entity when the download helper returned a usable string.
     *
     * @param LifecycleEventArgs $args
     */
    public function prePersist(LifecycleEventArgs $args)
    {
        $entity = $args->getEntity();

        if (!$entity instanceof Entry) {
            return;
        }

        $config = $this->buildConfig($args->getEntityManager());

        // update all images inside the html
        $html = $this->downloadImages($config, $entity);
        if (is_string($html)) {
            $entity->setContent($html);
        }

        // update preview picture (a `false` result means the download failed)
        $previewPicture = $this->downloadPreviewImage($config, $entity);
        if (is_string($previewPicture)) {
            $entity->setPreviewPicture($previewPicture);
        }
    }

    /**
     * Download every image referenced by the entry content and return the rewritten html.
     *
     * $config is not read yet: it is kept in the signature so the download can later be
     * dispatched asynchronously (RabbitMQ / Redis) depending on settings such as
     * `download_images_with_rabbitmq` or `download_images_with_redis`.
     *
     * @param Config $config
     * @param Entry  $entry
     *
     * @return string Html as returned by DownloadImages::processHtml()
     */
    public function downloadImages(Config $config, Entry $entry)
    {
        return $this->downloadImages->processHtml(
            $entry->getContent(),
            $entry->getUrl()
        );
    }

    /**
     * Download the preview picture of the entry and return its new local path.
     *
     * $config is not read yet (see downloadImages() for the intended use).
     *
     * NOTE(review): entries without a preview picture are still handed to
     * processSingleImage(); confirm whether they should be skipped up front.
     *
     * @param Config $config
     * @param Entry  $entry
     *
     * @return string|false Local path as returned by DownloadImages::processSingleImage(),
     *                      false when the image could not be processed
     */
    public function downloadPreviewImage(Config $config, Entry $entry)
    {
        return $this->downloadImages->processSingleImage(
            $entry->getPreviewPicture(),
            $entry->getUrl()
        );
    }

    /**
     * Build the Config object from the injected class name and bind it to the EntityManager
     * provided by the current Doctrine event.
     *
     * @param EntityManager $em
     *
     * @return Config
     */
    private function buildConfig($em)
    {
        $config = new $this->configClass();
        $config->setEntityManager($em);

        return $config;
    }
}
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index 219b90d3..d90d3dc8 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -75,12 +75,6 @@ class ContentProxy
75 $entry->setDomainName($domainName); 75 $entry->setDomainName($domainName);
76 } 76 }
77 77
78 if (true) {
79 $this->logger->log('debug', 'Starting to download images');
80 $downloadImages = new DownloadImages($html, $url, $this->logger);
81 $html = $downloadImages->process();
82 }
83
84 $entry->setContent($html); 78 $entry->setContent($html);
85 79
86 if (isset($content['open_graph']['og_image'])) { 80 if (isset($content['open_graph']['og_image'])) {
diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
index e23e0c55..426cbe48 100644
--- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php
+++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php
@@ -2,193 +2,197 @@
2 2
3namespace Wallabag\CoreBundle\Helper; 3namespace Wallabag\CoreBundle\Helper;
4 4
5use Psr\Log\LoggerInterface as Logger; 5use Psr\Log\LoggerInterface;
6use Symfony\Component\DomCrawler\Crawler; 6use Symfony\Component\DomCrawler\Crawler;
7 7use GuzzleHttp\Client;
8define('REGENERATE_PICTURES_QUALITY', 75); 8use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;
9define('HTTP_PORT', 80);
10define('SSL_PORT', 443);
11define('BASE_URL', '');
12 9
13class DownloadImages 10class DownloadImages
14{ 11{
15 private $folder; 12 const REGENERATE_PICTURES_QUALITY = 80;
16 private $url; 13
17 private $html; 14 private $client;
18 private $fileName; 15 private $baseFolder;
19 private $logger; 16 private $logger;
17 private $mimeGuesser;
20 18
21 public function __construct($html, $url, Logger $logger) 19 public function __construct(Client $client, $baseFolder, LoggerInterface $logger)
22 { 20 {
23 $this->html = $html; 21 $this->client = $client;
24 $this->url = $url; 22 $this->baseFolder = $baseFolder;
25 $this->setFolder();
26 $this->logger = $logger; 23 $this->logger = $logger;
24 $this->mimeGuesser = new MimeTypeExtensionGuesser();
25
26 $this->setFolder();
27 } 27 }
28 28
29 public function setFolder($folder = 'assets/images') 29 /**
30 * Setup base folder where all images are going to be saved.
31 */
32 private function setFolder()
30 { 33 {
31 // if folder doesn't exist, attempt to create one and store the folder name in property $folder 34 // if folder doesn't exist, attempt to create one and store the folder name in property $folder
32 if (!file_exists($folder)) { 35 if (!file_exists($this->baseFolder)) {
33 mkdir($folder); 36 mkdir($this->baseFolder, 0777, true);
34 } 37 }
35 $this->folder = $folder;
36 } 38 }
37 39
38 public function process() 40 /**
41 * Process the html and extract image from it, save them to local and return the updated html.
42 *
43 * @param string $html
44 * @param string $url Used as a base path for relative image and folder
45 *
46 * @return string
47 */
48 public function processHtml($html, $url)
39 { 49 {
40 //instantiate the symfony DomCrawler Component 50 $crawler = new Crawler($html);
41 $crawler = new Crawler($this->html);
42 // create an array of all scrapped image links
43 $this->logger->log('debug', 'Finding images inside document');
44 $result = $crawler 51 $result = $crawler
45 ->filterXpath('//img') 52 ->filterXpath('//img')
46 ->extract(array('src')); 53 ->extract(array('src'));
47 54
55 $relativePath = $this->getRelativePath($url);
56
48 // download and save the image to the folder 57 // download and save the image to the folder
49 foreach ($result as $image) { 58 foreach ($result as $image) {
50 $file = file_get_contents($image); 59 $imagePath = $this->processSingleImage($image, $url, $relativePath);
51 60
52 // Checks 61 if (false === $imagePath) {
53 $absolute_path = self::getAbsoluteLink($image, $this->url); 62 continue;
54 $filename = basename(parse_url($absolute_path, PHP_URL_PATH)); 63 }
55 $fullpath = $this->folder.'/'.$filename; 64
56 self::checks($file, $fullpath, $absolute_path); 65 $html = str_replace($image, $imagePath, $html);
57 $this->html = str_replace($image, self::getPocheUrl().'/'.$fullpath, $this->html);
58 } 66 }
59 67
60 return $this->html; 68 return $html;
61 } 69 }
62 70
63 private function checks($rawdata, $fullpath, $absolute_path) 71 /**
72 * Process a single image:
73 * - retrieve it
74 * - re-saved it (for security reason)
75 * - return the new local path.
76 *
77 * @param string $imagePath Path to the image to retrieve
78 * @param string $url Url from where the image were found
79 * @param string $relativePath Relative local path to saved the image
80 *
81 * @return string Relative url to access the image from the web
82 */
83 public function processSingleImage($imagePath, $url, $relativePath = null)
64 { 84 {
65 $fullpath = urldecode($fullpath); 85 if (null == $relativePath) {
66 86 $relativePath = $this->getRelativePath($url);
67 if (file_exists($fullpath)) {
68 unlink($fullpath);
69 } 87 }
70 88
71 // check extension 89 $folderPath = $this->baseFolder.'/'.$relativePath;
72 $this->logger->log('debug', 'Checking extension');
73 90
74 $file_ext = strrchr($fullpath, '.'); 91 // build image path
75 $whitelist = array('.jpg', '.jpeg', '.gif', '.png'); 92 $absolutePath = $this->getAbsoluteLink($url, $imagePath);
76 if (!(in_array($file_ext, $whitelist))) { 93 if (false === $absolutePath) {
77 $this->logger->log('debug', 'processed image with not allowed extension. Skipping '.$fullpath); 94 $this->logger->log('debug', 'Can not determine the absolute path for that image, skipping.');
78 95
79 return false; 96 return false;
80 } 97 }
81 98
82 // check headers 99 $res = $this->client->get(
83 $this->logger->log('debug', 'Checking headers'); 100 $absolutePath,
84 $imageinfo = getimagesize($absolute_path); 101 ['exceptions' => false]
85 if ($imageinfo['mime'] != 'image/gif' && $imageinfo['mime'] != 'image/jpeg' && $imageinfo['mime'] != 'image/jpg' && $imageinfo['mime'] != 'image/png') { 102 );
86 $this->logger->log('debug', 'processed image with bad header. Skipping '.$fullpath); 103
104 $ext = $this->mimeGuesser->guess($res->getHeader('content-type'));
105 $this->logger->log('debug', 'Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);
106 if (!in_array($ext, ['jpeg', 'jpg', 'gif', 'png'])) {
107 $this->logger->log('debug', 'Processed image with not allowed extension. Skipping '.$imagePath);
87 108
88 return false; 109 return false;
89 } 110 }
111 $hashImage = hash('crc32', $absolutePath);
112 $localPath = $folderPath.'/'.$hashImage.'.'.$ext;
113
114 try {
115 $im = imagecreatefromstring($res->getBody());
116 } catch (\Exception $e) {
117 $im = false;
118 }
90 119
91 // regenerate image
92 $this->logger->log('debug', 'regenerating image');
93 $im = imagecreatefromstring($rawdata);
94 if ($im === false) { 120 if ($im === false) {
95 $this->logger->log('error', 'error while regenerating image '.$fullpath); 121 $this->logger->log('error', 'Error while regenerating image', ['path' => $localPath]);
96 122
97 return false; 123 return false;
98 } 124 }
99 125
100 switch ($imageinfo['mime']) { 126 switch ($ext) {
101 case 'image/gif': 127 case 'gif':
102 $result = imagegif($im, $fullpath); 128 $result = imagegif($im, $localPath);
103 $this->logger->log('debug', 'Re-creating gif'); 129 $this->logger->log('debug', 'Re-creating gif');
104 break; 130 break;
105 case 'image/jpeg': 131 case 'jpeg':
106 case 'image/jpg': 132 case 'jpg':
107 $result = imagejpeg($im, $fullpath, REGENERATE_PICTURES_QUALITY); 133 $result = imagejpeg($im, $localPath, self::REGENERATE_PICTURES_QUALITY);
108 $this->logger->log('debug', 'Re-creating jpg'); 134 $this->logger->log('debug', 'Re-creating jpg');
109 break; 135 break;
110 case 'image/png': 136 case 'png':
137 $result = imagepng($im, $localPath, ceil(self::REGENERATE_PICTURES_QUALITY / 100 * 9));
111 $this->logger->log('debug', 'Re-creating png'); 138 $this->logger->log('debug', 'Re-creating png');
112 $result = imagepng($im, $fullpath, ceil(REGENERATE_PICTURES_QUALITY / 100 * 9));
113 break;
114 } 139 }
140
115 imagedestroy($im); 141 imagedestroy($im);
116 142
117 return $result; 143 return '/assets/images/'.$relativePath.'/'.$hashImage.'.'.$ext;
118 } 144 }
119 145
120 private static function getAbsoluteLink($relativeLink, $url) 146 /**
147 * Generate the folder where we are going to save images based on the entry url.
148 *
149 * @param string $url
150 *
151 * @return string
152 */
153 private function getRelativePath($url)
121 { 154 {
122 /* return if already absolute URL */ 155 $hashUrl = hash('crc32', $url);
123 if (parse_url($relativeLink, PHP_URL_SCHEME) != '') { 156 $relativePath = $hashUrl[0].'/'.$hashUrl[1].'/'.$hashUrl;
124 return $relativeLink; 157 $folderPath = $this->baseFolder.'/'.$relativePath;
125 }
126 158
127 /* queries and anchors */ 159 if (!file_exists($folderPath)) {
128 if ($relativeLink[0] == '#' || $relativeLink[0] == '?') { 160 mkdir($folderPath, 0777, true);
129 return $url.$relativeLink;
130 } 161 }
131 162
132 /* parse base URL and convert to local variables: 163 $this->logger->log('debug', 'Folder used for that url', ['folder' => $folderPath, 'url' => $url]);
133 $scheme, $host, $path */
134 extract(parse_url($url));
135 164
136 /* remove non-directory element from path */ 165 return $relativePath;
137 $path = preg_replace('#/[^/]*$#', '', $path); 166 }
138 167
139 /* destroy path if relative url points to root */ 168 /**
140 if ($relativeLink[0] == '/') { 169 * Make an $url absolute based on the $base.
141 $path = ''; 170 *
171 * @see Graby->makeAbsoluteStr
172 *
173 * @param string $base Base url
174 * @param string $url Url to make it absolute
175 *
176 * @return false|string
177 */
178 private function getAbsoluteLink($base, $url)
179 {
180 if (preg_match('!^https?://!i', $url)) {
181 // already absolute
182 return $url;
142 } 183 }
143 184
144 /* dirty absolute URL */ 185 $base = new \SimplePie_IRI($base);
145 $abs = $host.$path.'/'.$relativeLink;
146 186
147 /* replace '//' or '/./' or '/foo/../' with '/' */ 187 // remove '//' in URL path (causes URLs not to resolve properly)
148 $re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#'); 188 if (isset($base->ipath)) {
149 for ($n = 1; $n > 0; $abs = preg_replace($re, '/', $abs, -1, $n)) { 189 $base->ipath = preg_replace('!//+!', '/', $base->ipath);
150 } 190 }
151 191
152 /* absolute URL is ready! */ 192 if ($absolute = \SimplePie_IRI::absolutize($base, $url)) {
153 return $scheme.'://'.$abs; 193 return $absolute->get_uri();
154 }
155
156 public static function getPocheUrl()
157 {
158 $baseUrl = '';
159 $https = (!empty($_SERVER['HTTPS'])
160 && (strtolower($_SERVER['HTTPS']) == 'on'))
161 || (isset($_SERVER['SERVER_PORT'])
162 && $_SERVER['SERVER_PORT'] == '443') // HTTPS detection.
163 || (isset($_SERVER['SERVER_PORT']) //Custom HTTPS port detection
164 && $_SERVER['SERVER_PORT'] == SSL_PORT)
165 || (isset($_SERVER['HTTP_X_FORWARDED_PROTO'])
166 && $_SERVER['HTTP_X_FORWARDED_PROTO'] == 'https');
167 $serverport = (!isset($_SERVER['SERVER_PORT'])
168 || $_SERVER['SERVER_PORT'] == '80'
169 || $_SERVER['SERVER_PORT'] == HTTP_PORT
170 || ($https && $_SERVER['SERVER_PORT'] == '443')
171 || ($https && $_SERVER['SERVER_PORT'] == SSL_PORT) //Custom HTTPS port detection
172 ? '' : ':'.$_SERVER['SERVER_PORT']);
173
174 if (isset($_SERVER['HTTP_X_FORWARDED_PORT'])) {
175 $serverport = ':'.$_SERVER['HTTP_X_FORWARDED_PORT'];
176 }
177 // $scriptname = str_replace('/index.php', '/', $_SERVER["SCRIPT_NAME"]);
178 // if (!isset($_SERVER["HTTP_HOST"])) {
179 // return $scriptname;
180 // }
181 $host = (isset($_SERVER['HTTP_X_FORWARDED_HOST']) ? $_SERVER['HTTP_X_FORWARDED_HOST'] : (isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : $_SERVER['SERVER_NAME']));
182 if (strpos($host, ':') !== false) {
183 $serverport = '';
184 }
185 // check if BASE_URL is configured
186 if (BASE_URL) {
187 $baseUrl = BASE_URL;
188 } else {
189 $baseUrl = 'http'.($https ? 's' : '').'://'.$host.$serverport;
190 } 194 }
191 195
192 return $baseUrl; 196 return false;
193 } 197 }
194} 198}
diff --git a/src/Wallabag/CoreBundle/Resources/config/services.yml b/src/Wallabag/CoreBundle/Resources/config/services.yml
index 4b7751fe..1fb81a46 100644
--- a/src/Wallabag/CoreBundle/Resources/config/services.yml
+++ b/src/Wallabag/CoreBundle/Resources/config/services.yml
@@ -136,3 +136,22 @@ services:
136 - "@doctrine" 136 - "@doctrine"
137 tags: 137 tags:
138 - { name: doctrine.event_subscriber } 138 - { name: doctrine.event_subscriber }
139
140 wallabag_core.subscriber.download_images:
141 class: Wallabag\CoreBundle\Event\Subscriber\DownloadImagesSubscriber
142 arguments:
143 - "@wallabag_core.entry.download_images"
144 - "%craue_config.config.class%"
145 - "@logger"
146 tags:
147 - { name: doctrine.event_subscriber }
148
149 wallabag_core.entry.download_images:
150 class: Wallabag\CoreBundle\Helper\DownloadImages
151 arguments:
152 - "@wallabag_core.entry.download_images.client"
153 - "%kernel.root_dir%/../web/assets/images"
154 - "@logger"
155
156 wallabag_core.entry.download_images.client:
157 class: GuzzleHttp\Client