aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/Wallabag/CoreBundle/Helper/ContentProxy.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php')
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php172
1 files changed, 171 insertions, 1 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index 3fe31c2c..d38811a2 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -53,6 +53,7 @@ class ContentProxy
53 53
54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { 54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
55 $fetchedContent = $this->graby->fetchContent($url); 55 $fetchedContent = $this->graby->fetchContent($url);
56 $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);
56 57
57 // when content is imported, we have information in $content 58 // when content is imported, we have information in $content
58 // in case fetching content goes bad, we'll keep the imported information instead of overriding them 59 // in case fetching content goes bad, we'll keep the imported information instead of overriding them
@@ -65,6 +66,13 @@ class ContentProxy
65 // so we'll be able to refetch it in the future 66 // so we'll be able to refetch it in the future
66 $content['url'] = !empty($content['url']) ? $content['url'] : $url; 67 $content['url'] = !empty($content['url']) ? $content['url'] : $url;
67 68
69 // In one case (at least in tests), url is empty here
70 // so we set it using $url provided in the updateEntry call.
71 // Not sure what are the other possible cases where this property is empty
72 if (empty($entry->getUrl()) && !empty($url)) {
73 $entry->setUrl($url);
74 }
75
68 $this->stockEntry($entry, $content); 76 $this->stockEntry($entry, $content);
69 } 77 }
70 78
@@ -177,6 +185,59 @@ class ContentProxy
177 } 185 }
178 186
/**
 * Clean up the title of freshly fetched content.
 *
 * Titles of PDF documents are first handed to convertPdfEncodingToUTF8();
 * whatever the content type, the title is then passed through
 * sanitizeUTF8Text() before being returned.
 *
 * @param string $title       Title of the fetched content
 * @param string $contentType Content type of the fetched content
 *
 * @return string
 */
private function sanitizeContentTitle($title, $contentType)
{
    $comesFromPdf = 'application/pdf' === $contentType;
    $candidateTitle = $comesFromPdf ? $this->convertPdfEncodingToUTF8($title) : $title;

    return $this->sanitizeUTF8Text($candidateTitle);
}
203
/**
 * Best-effort conversion of a PDF title to UTF-8.
 *
 * A title coming from a PDF is quite possibly not UTF-8 encoded. The candidate
 * encodings are probed in a fixed order and the first one the title validates
 * against is used as the source encoding of the conversion.
 *
 * @param string $title
 *
 * @return string the converted title, or the title as given when no candidate
 *                encoding matches (it may then still contain invalid UTF-8 characters)
 */
private function convertPdfEncodingToUTF8($title)
{
    // UTF-8 is probed first because its presence/absence is the easiest to detect
    $candidateEncodings = ['UTF-8', 'UTF-16BE', 'WINDOWS-1252'];

    foreach ($candidateEncodings as $sourceEncoding) {
        if (!mb_check_encoding($title, $sourceEncoding)) {
            continue;
        }

        return mb_convert_encoding($title, 'UTF-8', $sourceEncoding);
    }

    return $title;
}
223
/**
 * Remove invalid UTF-8 characters from the given string.
 *
 * A string that is already valid UTF-8 is returned as is. Otherwise every
 * invalid byte sequence is dropped and the remaining text is returned.
 *
 * @param string $rawText
 *
 * @return string
 */
private function sanitizeUTF8Text($rawText)
{
    if (mb_check_encoding($rawText, 'UTF-8')) {
        return $rawText;
    }

    // iconv() with "//IGNORE" is implementation dependent: on some builds it
    // raises a notice and returns false on illegal input instead of the
    // cleaned string. mbstring drops the invalid sequences reliably once its
    // substitute character is set to "none".
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character('none');

    try {
        return mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');
    } finally {
        // the substitute character is a global mbstring setting: restore it
        mb_substitute_character($previousSubstitute);
    }
}
239
240 /**
180 * Stock entry with fetched or imported content. 241 * Stock entry with fetched or imported content.
181 * Will fall back to OpenGraph data if available. 242 * Will fall back to OpenGraph data if available.
182 * 243 *
@@ -185,7 +246,7 @@ class ContentProxy
185 */ 246 */
186 private function stockEntry(Entry $entry, array $content) 247 private function stockEntry(Entry $entry, array $content)
187 { 248 {
188 $entry->setUrl($content['url']); 249 $this->updateOriginUrl($entry, $content['url']);
189 250
190 $this->setEntryDomainName($entry); 251 $this->setEntryDomainName($entry);
191 252
@@ -252,6 +313,115 @@ class ContentProxy
252 } 313 }
253 314
/**
 * Reconcile the url of the entry with the url reported for its content and
 * keep track of the original url when the change looks like a redirection.
 *
 * - Nothing happens when $url is empty or identical to the entry url.
 * - When the entry url matches the ignore list (see ignoreUrl()), it is
 *   replaced by $url and origin_url is left untouched.
 * - Otherwise the list of url parts that changed decides what happens:
 *   only the path, only the scheme, only the fragment, or anything else.
 *   In the last case origin_url receives the previous entry url, unless
 *   origin_url is already set.
 *
 * @param Entry  $entry
 * @param string $url   url of the content
 *
 * @return false|null false when nothing had to be compared (empty or identical
 *                    url, ignored entry url), nothing otherwise
 */
private function updateOriginUrl(Entry $entry, $url)
{
    if (empty($url) || $entry->getUrl() === $url) {
        return false;
    }

    // checked before the diff below, which has no side effect and would be wasted work here
    if ($this->ignoreUrl($entry->getUrl())) {
        $entry->setUrl($url);

        return false;
    }

    $parsed_entry_url = parse_url($entry->getUrl());
    $parsed_content_url = parse_url($url);

    /**
     * The following part computes the list of parts changing between the
     * two parse_url arrays.
     *
     * As array_diff_assoc only computes changes to go from the left array
     * to the right one, both directions are computed and merged; keys are
     * then sorted before being passed to the switch.
     *
     * The resulting array gives us all changing parts between the two
     * urls: scheme, host, path, query and/or fragment.
     *
     * parse_url returns false on seriously malformed urls: in that case no
     * diff can be computed and the list is left empty, which leads to the
     * default case of the switch.
     */
    $diff_keys = [];
    if (false !== $parsed_entry_url && false !== $parsed_content_url) {
        $diff = array_merge(
            array_diff_assoc($parsed_entry_url, $parsed_content_url),
            array_diff_assoc($parsed_content_url, $parsed_entry_url)
        );
        $diff_keys = array_keys($diff);
        sort($diff_keys);
    }

    /**
     * This switch case lets us apply different behaviors according to
     * changing parts of urls.
     *
     * As $diff_keys is an array, we provide arrays as cases. ['path'] means
     * 'only the path is different between the two urls' whereas
     * ['fragment', 'query'] means 'only fragment and query string parts are
     * different between the two urls'.
     *
     * Note that values in $diff_keys are sorted.
     */
    switch ($diff_keys) {
        case ['path']:
            // 'path' is also reported as changed when only one of the two urls
            // has a path at all (http://example.com vs http://example.com/),
            // so the key may be missing on either side
            $entry_path = isset($parsed_entry_url['path']) ? $parsed_entry_url['path'] : '';
            $content_path = isset($parsed_content_url['path']) ? $parsed_content_url['path'] : '';

            if (($entry_path . '/' === $content_path) // diff is trailing slash, we only replace the url of the entry
                || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
                $entry->setUrl($url);
            }
            break;
        case ['scheme']:
            $entry->setUrl($url);
            break;
        case ['fragment']:
            // noop
            break;
        default:
            if (empty($entry->getOriginUrl())) {
                $entry->setOriginUrl($entry->getUrl());
            }
            $entry->setUrl($url);
            break;
    }
}
388
/**
 * Check entry url against an ignore list to replace with content url.
 *
 * The url matches when its host is exactly one of the ignored hosts, or when
 * the whole url matches one of the ignored patterns (case-insensitively).
 * A url without a host (empty, relative, malformed) can only match a pattern.
 *
 * XXX: move the ignore list in the database to let users handle it
 *
 * @param string $url url to test
 *
 * @return bool true if url matches ignore list otherwise false
 */
private function ignoreUrl($url)
{
    $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
    $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*'];

    $url = (string) $url;

    // null when the url has no host, false when it cannot be parsed at all
    $host = parse_url($url, PHP_URL_HOST);

    if (is_string($host) && in_array($host, $ignored_hosts, true)) {
        return true;
    }

    foreach ($ignored_patterns as $pattern) {
        if (1 === preg_match("`$pattern`i", $url)) {
            return true;
        }
    }

    return false;
}
423
424 /**
255 * Validate that the given content has at least a title, an html and a url. 425 * Validate that the given content has at least a title, an html and a url.
256 * 426 *
257 * @param array $content 427 * @param array $content