diff options
author | Kevin Decherf <kevin@kdecherf.com> | 2018-10-22 23:39:31 +0200 |
---|---|---|
committer | Kevin Decherf <kevin@kdecherf.com> | 2018-10-22 23:42:09 +0200 |
commit | b49c87acf12f22e38db751fb35be5da2436abc45 (patch) | |
tree | 6b4abfd62e3b54cbe289ccb79891e14bf1fe80bf /src/Wallabag | |
parent | fc040c749dec0275e562182562c1c1cb89e6cfa1 (diff) | |
download | wallabag-b49c87acf12f22e38db751fb35be5da2436abc45.tar.gz wallabag-b49c87acf12f22e38db751fb35be5da2436abc45.tar.zst wallabag-b49c87acf12f22e38db751fb35be5da2436abc45.zip |
ignoreOriginUrl: add initial support of ignore lists
Add the ability to specify lists of hosts and patterns for which the given
entry url is ignored and replaced with the fetched content url, without
touching origin_url.
This initial support should be reworked in the coming months to move
the hardcoded ignore lists into the database.
Signed-off-by: Kevin Decherf <kevin@kdecherf.com>
Diffstat (limited to 'src/Wallabag')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 79 |
1 files changed, 59 insertions, 20 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 1a2a330f..2dc436f8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -332,32 +332,71 @@ class ContentProxy | |||
332 | $diff_keys = array_keys($diff); | 332 | $diff_keys = array_keys($diff); |
333 | sort($diff_keys); | 333 | sort($diff_keys); |
334 | 334 | ||
335 | switch ($diff_keys) { | 335 | if ($this->ignoreUrl($entry->getUrl())) { |
336 | case ['path']: | 336 | $entry->setUrl($url); |
337 | if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry | 337 | } else { |
338 | || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId | 338 | switch ($diff_keys) { |
339 | case ['path']: | ||
340 | if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry | ||
341 | || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId | ||
342 | $entry->setUrl($url); | ||
343 | } | ||
344 | break; | ||
345 | case ['scheme']: | ||
339 | $entry->setUrl($url); | 346 | $entry->setUrl($url); |
340 | } | 347 | break; |
341 | break; | 348 | case ['fragment']: |
342 | case ['scheme']: | 349 | case ['query']: |
343 | $entry->setUrl($url); | 350 | case ['fragment', 'query']: |
344 | break; | 351 | // noop |
345 | case ['fragment']: | 352 | break; |
346 | case ['query']: | 353 | default: |
347 | case ['fragment', 'query']: | 354 | if (empty($entry->getOriginUrl())) { |
348 | // noop | 355 | $entry->setOriginUrl($entry->getUrl()); |
349 | break; | 356 | } |
350 | default: | 357 | $entry->setUrl($url); |
351 | if (empty($entry->getOriginUrl())) { | 358 | break; |
352 | $entry->setOriginUrl($entry->getUrl()); | 359 | } |
353 | } | ||
354 | $entry->setUrl($url); | ||
355 | break; | ||
356 | } | 360 | } |
357 | } | 361 | } |
358 | } | 362 | } |
359 | 363 | ||
360 | /** | 364 | /** |
365 | * Check entry url against an ignore list to replace with content url. | ||
366 | * | ||
367 | * XXX: move the ignore list in the database to let users handle it | ||
368 | * | ||
369 | * @param string $url url to test | ||
370 | * | ||
371 | * @return bool true if url matches ignore list otherwise false | ||
372 | */ | ||
373 | private function ignoreUrl($url) | ||
374 | { | ||
375 | $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com']; | ||
376 | $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*']; | ||
377 | |||
378 | $parsed_url = parse_url($url); | ||
379 | |||
380 | $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) { | ||
381 | return $var === $parsed_url['host']; | ||
382 | }); | ||
383 | |||
384 | if ([] !== $filtered) { | ||
385 | return true; | ||
386 | } | ||
387 | |||
388 | $filtered = array_filter($ignored_patterns, function ($var) use ($url) { | ||
389 | return preg_match("`$var`i", $url); | ||
390 | }); | ||
391 | |||
392 | if ([] !== $filtered) { | ||
393 | return true; | ||
394 | } | ||
395 | |||
396 | return false; | ||
397 | } | ||
398 | |||
399 | /** | ||
361 | * Validate that the given content has at least a title, an html and a url. | 400 | * Validate that the given content has at least a title, an html and a url. |
362 | * | 401 | * |
363 | * @param array $content | 402 | * @param array $content |