$diff_keys = array_keys($diff);
sort($diff_keys);
- switch ($diff_keys) {
- case ['path']:
- if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
- || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
+ if ($this->ignoreUrl($entry->getUrl())) {
+ $entry->setUrl($url);
+ } else {
+ switch ($diff_keys) {
+ case ['path']:
+ if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
+ || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
+ $entry->setUrl($url);
+ }
+ break;
+ case ['scheme']:
$entry->setUrl($url);
- }
- break;
- case ['scheme']:
- $entry->setUrl($url);
- break;
- case ['fragment']:
- case ['query']:
- case ['fragment', 'query']:
- // noop
- break;
- default:
- if (empty($entry->getOriginUrl())) {
- $entry->setOriginUrl($entry->getUrl());
- }
- $entry->setUrl($url);
- break;
+ break;
+ case ['fragment']:
+ case ['query']:
+ case ['fragment', 'query']:
+ // noop
+ break;
+ default:
+ if (empty($entry->getOriginUrl())) {
+ $entry->setOriginUrl($entry->getUrl());
+ }
+ $entry->setUrl($url);
+ break;
+ }
}
}
}
+ /**
+  * Check entry url against an ignore list to replace with content url.
+  *
+  * A url is ignored when its host is exactly one of the ignored hosts, or
+  * when the whole url matches one of the ignored patterns (case-insensitive).
+  * Urls without a host component, or urls that cannot be parsed at all, can
+  * only be matched through the patterns.
+  *
+  * XXX: move the ignore list to the database to let users manage it
+  *
+  * @param string $url url to test
+  *
+  * @return bool true if url matches ignore list otherwise false
+  */
+ private function ignoreUrl($url)
+ {
+     $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
+     $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*'];
+
+     $parsed_url = parse_url($url);
+
+     // parse_url() returns false on seriously malformed urls and omits the
+     // 'host' key for host-less urls, so the result must not be indexed blindly.
+     $host = isset($parsed_url['host']) ? $parsed_url['host'] : null;
+
+     if (null !== $host && in_array($host, $ignored_hosts, true)) {
+         return true;
+     }
+
+     foreach ($ignored_patterns as $pattern) {
+         // Backtick delimiters keep the "/" characters of the url patterns unescaped.
+         if (1 === preg_match("`$pattern`i", $url)) {
+             return true;
+         }
+     }
+
+     return false;
+ }
+
/**
* Validate that the given content has at least a title, an html and a url.
*
'https://example.org/hello',
null,
'example.org',
- ]
+ ],
+ 'different path and query string in fetch content' => [
+ 'https://example.org/hello',
+ null,
+ 'https://example.org/world?foo',
+ 'https://example.org/world?foo',
+ 'https://example.org/hello',
+ 'example.org',
+ ],
+ 'feedproxy ignore list test' => [
+ 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
+ null,
+ 'https://example.org/hello-wallabag',
+ 'https://example.org/hello-wallabag',
+ null,
+ 'example.org',
+ ],
+ 'feedproxy ignore list test with origin url already set' => [
+ 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
+ 'https://example.org/this-is-source',
+ 'https://example.org/hello-wallabag',
+ 'https://example.org/hello-wallabag',
+ 'https://example.org/this-is-source',
+ 'example.org',
+ ],
+ 'lemonde ignore pattern test' => [
+ 'http://www.lemonde.fr/tiny/url',
+ null,
+ 'http://example.com/hello-world',
+ 'http://example.com/hello-world',
+ null,
+ 'example.com',
+ ],
];
}