aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorKevin Decherf <kevin@kdecherf.com>2018-10-22 23:39:31 +0200
committerKevin Decherf <kevin@kdecherf.com>2018-10-22 23:42:09 +0200
commitb49c87acf12f22e38db751fb35be5da2436abc45 (patch)
tree6b4abfd62e3b54cbe289ccb79891e14bf1fe80bf
parentfc040c749dec0275e562182562c1c1cb89e6cfa1 (diff)
downloadwallabag-b49c87acf12f22e38db751fb35be5da2436abc45.tar.gz
wallabag-b49c87acf12f22e38db751fb35be5da2436abc45.tar.zst
wallabag-b49c87acf12f22e38db751fb35be5da2436abc45.zip
ignoreOriginUrl: add initial support of ignore lists
Add the ability to specify lists of hosts and patterns for which the given entry url is ignored and replaced with the fetched content url, without touching origin_url. This initial support should be reworked in the following months to move the hardcoded ignore lists into the database. Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php79
-rw-r--r--tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php34
2 files changed, 92 insertions, 21 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index 1a2a330f..2dc436f8 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -332,32 +332,71 @@ class ContentProxy
332 $diff_keys = array_keys($diff); 332 $diff_keys = array_keys($diff);
333 sort($diff_keys); 333 sort($diff_keys);
334 334
335 switch ($diff_keys) { 335 if ($this->ignoreUrl($entry->getUrl())) {
336 case ['path']: 336 $entry->setUrl($url);
337 if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry 337 } else {
338 || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId 338 switch ($diff_keys) {
339 case ['path']:
340 if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
341 || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
342 $entry->setUrl($url);
343 }
344 break;
345 case ['scheme']:
339 $entry->setUrl($url); 346 $entry->setUrl($url);
340 } 347 break;
341 break; 348 case ['fragment']:
342 case ['scheme']: 349 case ['query']:
343 $entry->setUrl($url); 350 case ['fragment', 'query']:
344 break; 351 // noop
345 case ['fragment']: 352 break;
346 case ['query']: 353 default:
347 case ['fragment', 'query']: 354 if (empty($entry->getOriginUrl())) {
348 // noop 355 $entry->setOriginUrl($entry->getUrl());
349 break; 356 }
350 default: 357 $entry->setUrl($url);
351 if (empty($entry->getOriginUrl())) { 358 break;
352 $entry->setOriginUrl($entry->getUrl()); 359 }
353 }
354 $entry->setUrl($url);
355 break;
356 } 360 }
357 } 361 }
358 } 362 }
359 363
360 /** 364 /**
365 * Check entry url against an ignore list to replace with content url.
366 *
367 * XXX: move the ignore list in the database to let users handle it
368 *
369 * @param string $url url to test
370 *
371 * @return bool true if url matches ignore list otherwise false
372 */
373 private function ignoreUrl($url)
374 {
375 $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
376 $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*'];
377
378 $parsed_url = parse_url($url);
379
380 $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) {
381 return $var === $parsed_url['host'];
382 });
383
384 if ([] !== $filtered) {
385 return true;
386 }
387
388 $filtered = array_filter($ignored_patterns, function ($var) use ($url) {
389 return preg_match("`$var`i", $url);
390 });
391
392 if ([] !== $filtered) {
393 return true;
394 }
395
396 return false;
397 }
398
399 /**
361 * Validate that the given content has at least a title, an html and a url. 400 * Validate that the given content has at least a title, an html and a url.
362 * 401 *
363 * @param array $content 402 * @param array $content
diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
index 3debc457..a60aec5b 100644
--- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
+++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
@@ -808,7 +808,39 @@ class ContentProxyTest extends TestCase
808 'https://example.org/hello', 808 'https://example.org/hello',
809 null, 809 null,
810 'example.org', 810 'example.org',
811 ] 811 ],
812 'different path and query string in fetch content' => [
813 'https://example.org/hello',
814 null,
815 'https://example.org/world?foo',
816 'https://example.org/world?foo',
817 'https://example.org/hello',
818 'example.org',
819 ],
820 'feedproxy ignore list test' => [
821 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
822 null,
823 'https://example.org/hello-wallabag',
824 'https://example.org/hello-wallabag',
825 null,
826 'example.org',
827 ],
828 'feedproxy ignore list test with origin url already set' => [
829 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
830 'https://example.org/this-is-source',
831 'https://example.org/hello-wallabag',
832 'https://example.org/hello-wallabag',
833 'https://example.org/this-is-source',
834 'example.org',
835 ],
836 'lemonde ignore pattern test' => [
837 'http://www.lemonde.fr/tiny/url',
838 null,
839 'http://example.com/hello-world',
840 'http://example.com/hello-world',
841 null,
842 'example.com',
843 ],
812 ]; 844 ];
813 } 845 }
814 846