X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=src%2FWallabag%2FCoreBundle%2FHelper%2FContentProxy.php;h=74130be8ecc4fc8fbdf66a86cbda37d179757bc6;hb=60599679519e819301ce36185c3dd5ca7aa7f4ec;hp=007ee8bbe9cb8fd8049ec830836532e0e9c1fbee;hpb=e07fadea76aa7329c4b955a59e74cb867c733706;p=github%2Fwallabag%2Fwallabag.git diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 007ee8bb..74130be8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -321,41 +321,105 @@ class ContentProxy */ private function updateOriginUrl(Entry $entry, $url) { - if (!empty($url) && $entry->getUrl() !== $url) { - $parsed_entry_url = parse_url($entry->getUrl()); - $parsed_content_url = parse_url($url); - - $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); - $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); - - $diff = array_merge($diff_ec, $diff_ce); - $diff_keys = array_keys($diff); - sort($diff_keys); - - switch ($diff_keys) { - case ['path']: - if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry - || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId - $entry->setUrl($url); - } - break; - case ['scheme']: - $entry->setUrl($url); - break; - case ['fragment']: - case ['query']: - // noop - break; - default: - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } + if (empty($url) || $entry->getUrl() === $url) { + return false; + } + + $parsed_entry_url = parse_url($entry->getUrl()); + $parsed_content_url = parse_url($url); + + /** + * The following part computes the list of part changes between two + * parse_url arrays. + * + * As array_diff_assoc only computes changes to go from the left array + * to the right one, we make two differents arrays to have both + * directions. We merge these two arrays and sort keys before passing + * the result to the switch. + * + * The resulting array gives us all changing parts between the two + * urls: scheme, host, path, query and/or fragment. + */ + $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); + $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); + + $diff = array_merge($diff_ec, $diff_ce); + $diff_keys = array_keys($diff); + sort($diff_keys); + + if ($this->ignoreUrl($entry->getUrl())) { + $entry->setUrl($url); + return false; + } + + /** + * This switch case lets us apply different behaviors according to + * changing parts of urls. + * + * As $diff_keys is an array, we provide arrays as cases. ['path'] means + * 'only the path is different between the two urls' whereas + * ['fragment', 'query'] means 'only fragment and query string parts are + * different between the two urls'. + * + * Note that values in $diff_keys are sorted. + */ + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId $entry->setUrl($url); - break; - } + } + break; + case ['scheme']: + $entry->setUrl($url); + break; + case ['fragment']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($url); + break; } } + /** + * Check entry url against an ignore list to replace with content url. + * + * XXX: move the ignore list in the database to let users handle it + * + * @param string $url url to test + * + * @return bool true if url matches ignore list otherwise false + */ + private function ignoreUrl($url) + { + $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com']; + $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*']; + + $parsed_url = parse_url($url); + + $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) { + return $var === $parsed_url['host']; + }); + + if ([] !== $filtered) { + return true; + } + + $filtered = array_filter($ignored_patterns, function ($var) use ($url) { + return preg_match("`$var`i", $url); + }); + + if ([] !== $filtered) { + return true; + } + + return false; + } + /** * Validate that the given content has at least a title, an html and a url. *