From 4a81360efcdfe4bab8d75f7227c9cf5bfd514189 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sun, 7 Jan 2018 17:25:26 +0100 Subject: ContentProxy: fix a corner case when entry.url is empty in updateEntry Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index d4ea608f..f0d8c1b4 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -66,6 +66,14 @@ class ContentProxy // so we'll be able to refetch it in the future $content['url'] = !empty($content['url']) ? $content['url'] : $url; + // In one case (at least in tests), url is empty here + // so we set it using $url provided in the updateEntry call. + // Not sure what are the other possible cases where this property is empty + if (empty($entry->getUrl()) && !empty($url)) + { + $entry->setUrl($url); + } + $this->stockEntry($entry, $content); } -- cgit v1.2.3 From 781864b9546b0ff2d6fe42ce72f78b8f40b785e9 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sun, 7 Jan 2018 17:28:04 +0100 Subject: ContentProxy: swap entry url to origin_url and set new url according to graby content Closes #3529 Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index f0d8c1b4..da0ec5a3 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -68,9 +68,8 @@ class ContentProxy // In one case (at least in tests), url is empty here // so we set it using $url provided in the updateEntry call. - // Not sure what are the other possible cases where this property is empty - if (empty($entry->getUrl()) && !empty($url)) - { + // Not sure what are the other possible cases where this property is empty + if (empty($entry->getUrl()) && !empty($url)) { $entry->setUrl($url); } @@ -247,7 +246,15 @@ class ContentProxy */ private function stockEntry(Entry $entry, array $content) { - $entry->setUrl($content['url']); + // When a redirection occurs while fetching an entry + // we move the original url in origin_url property if empty + // and set the entry url with the final value + if (!empty($content['url']) && $entry->getUrl() !== $content['url']) { + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($content['url']); + } $this->setEntryDomainName($entry); -- cgit v1.2.3 From e07fadea76aa7329c4b955a59e74cb867c733706 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Thu, 6 Sep 2018 22:26:20 +0200 Subject: Refactor updateOriginUrl to include new behaviors behaviors - Leave origin_url unchanged if difference is an ending slash - Leave origin_url unchanged if difference is scheme - Ignore (noop) if difference is query string or fragment Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 54 ++++++++++++++++++++----- 1 file changed, 45 insertions(+), 9 deletions(-) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index da0ec5a3..007ee8bb 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -246,15 +246,7 @@ class ContentProxy */ private function stockEntry(Entry $entry, array $content) { - // When a redirection occurs while fetching an entry - // we move the original url in origin_url property if empty - // and set the entry url with the final value - if (!empty($content['url']) && $entry->getUrl() !== $content['url']) { - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } - $entry->setUrl($content['url']); - } + $this->updateOriginUrl($entry, $content['url']); $this->setEntryDomainName($entry); @@ -320,6 +312,50 @@ class ContentProxy } } + /** + * Update the origin_url field when a redirection occurs + * This field is set if it is empty and new url does not match ignore list. + * + * @param Entry $entry + * @param string $url + */ + private function updateOriginUrl(Entry $entry, $url) + { + if (!empty($url) && $entry->getUrl() !== $url) { + $parsed_entry_url = parse_url($entry->getUrl()); + $parsed_content_url = parse_url($url); + + $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); + $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); + + $diff = array_merge($diff_ec, $diff_ce); + $diff_keys = array_keys($diff); + sort($diff_keys); + + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + $entry->setUrl($url); + } + break; + case ['scheme']: + $entry->setUrl($url); + break; + case ['fragment']: + case ['query']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($url); + break; + } + } + } + /** * Validate that the given content has at least a title, an html and a url. * -- cgit v1.2.3 From fc040c749dec0275e562182562c1c1cb89e6cfa1 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Mon, 22 Oct 2018 23:08:58 +0200 Subject: updateOriginUrl: add behavior when diff is fragment and query Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 1 + 1 file changed, 1 insertion(+) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 007ee8bb..1a2a330f 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -344,6 +344,7 @@ class ContentProxy break; case ['fragment']: case ['query']: + case ['fragment', 'query']: // noop break; default: -- cgit v1.2.3 From b49c87acf12f22e38db751fb35be5da2436abc45 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Mon, 22 Oct 2018 23:39:31 +0200 Subject: ignoreOriginUrl: add initial support of ignore lists Add the ability to specify hosts and patterns lists to ignore the given entry url and replace it with the fetched content url without touching to origin_url. This initial support should be reworked in the following months to move the hardcoded ignore lists in the database. Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 79 ++++++++++++++++++------- 1 file changed, 59 insertions(+), 20 deletions(-) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 1a2a330f..2dc436f8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -332,31 +332,70 @@ class ContentProxy $diff_keys = array_keys($diff); sort($diff_keys); - switch ($diff_keys) { - case ['path']: - if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry - || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + if ($this->ignoreUrl($entry->getUrl())) { + $entry->setUrl($url); + } else { + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + $entry->setUrl($url); + } + break; + case ['scheme']: $entry->setUrl($url); - } - break; - case ['scheme']: - $entry->setUrl($url); - break; - case ['fragment']: - case ['query']: - case ['fragment', 'query']: - // noop - break; - default: - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } - $entry->setUrl($url); - break; + break; + case ['fragment']: + case ['query']: + case ['fragment', 'query']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($url); + break; + } } } } + /** + * Check entry url against an ignore list to replace with content url. + * + * XXX: move the ignore list in the database to let users handle it + * + * @param string $url url to test + * + * @return bool true if url matches ignore list otherwise false + */ + private function ignoreUrl($url) + { + $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com']; + $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*']; + + $parsed_url = parse_url($url); + + $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) { + return $var === $parsed_url['host']; + }); + + if ([] !== $filtered) { + return true; + } + + $filtered = array_filter($ignored_patterns, function ($var) use ($url) { + return preg_match("`$var`i", $url); + }); + + if ([] !== $filtered) { + return true; + } + + return false; + } + /** * Validate that the given content has at least a title, an html and a url. * -- cgit v1.2.3 From 5ba5e22a092068aeb12213578fd8fc4edb2399fe Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 21:54:09 +0200 Subject: updateOriginUrl: rewrite some if, resolving feedbacks from PR Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 67 +++++++++++++------------ 1 file changed, 35 insertions(+), 32 deletions(-) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 2dc436f8..92351986 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -321,43 +321,46 @@ class ContentProxy */ private function updateOriginUrl(Entry $entry, $url) { - if (!empty($url) && $entry->getUrl() !== $url) { - $parsed_entry_url = parse_url($entry->getUrl()); - $parsed_content_url = parse_url($url); + if (empty($url) || $entry->getUrl() === $url) { + return false; + } + + $parsed_entry_url = parse_url($entry->getUrl()); + $parsed_content_url = parse_url($url); - $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); - $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); + $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); + $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); - $diff = array_merge($diff_ec, $diff_ce); - $diff_keys = array_keys($diff); - sort($diff_keys); + $diff = array_merge($diff_ec, $diff_ce); + $diff_keys = array_keys($diff); + sort($diff_keys); + + if ($this->ignoreUrl($entry->getUrl())) { + $entry->setUrl($url); + return false; + } - if ($this->ignoreUrl($entry->getUrl())) { + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + $entry->setUrl($url); + } + break; + case ['scheme']: $entry->setUrl($url); - } else { - switch ($diff_keys) { - case ['path']: - if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry - || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId - $entry->setUrl($url); - } - break; - case ['scheme']: - $entry->setUrl($url); - break; - case ['fragment']: - case ['query']: - case ['fragment', 'query']: - // noop - break; - default: - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } - $entry->setUrl($url); - break; + break; + case ['fragment']: + case ['query']: + case ['fragment', 'query']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); } - } + $entry->setUrl($url); + break; } } -- cgit v1.2.3 From 44e63667d9cf331aeedef8cb964538823c0a145d Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 22:11:35 +0200 Subject: updateOriginUrl: add comment blocks for the parse_url diff check Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 92351986..a93f4a2d 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -328,6 +328,18 @@ class ContentProxy $parsed_entry_url = parse_url($entry->getUrl()); $parsed_content_url = parse_url($url); + /** + * The following part computes the list of part changes between two + * parse_url arrays. + * + * As array_diff_assoc only computes changes to go from the left array + * to the right one, we make two differents arrays to have both + * directions. We merge these two arrays and sort keys before passing + * the result to the switch. + * + * The resulting array gives us all changing parts between the two + * urls: scheme, host, path, query and/or fragment. + */ $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); @@ -340,6 +352,17 @@ class ContentProxy return false; } + /** + * This switch case lets us apply different behaviors according to + * changing parts of urls. + * + * As $diff_keys is an array, we provide arrays as cases. ['path'] means + * 'only the path is different between the two urls' whereas + * ['fragment', 'query'] means 'only fragment and query string parts are + * different between the two urls'. + * + * Note that values in $diff_keys are sorted. + */ switch ($diff_keys) { case ['path']: if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry -- cgit v1.2.3 From 60599679519e819301ce36185c3dd5ca7aa7f4ec Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 22:27:27 +0200 Subject: updateOriginUrl: remove 'query string' case from ignore list Two urls with a different query string may refer to two different pages so keep them both. Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 2 -- 1 file changed, 2 deletions(-) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index a93f4a2d..74130be8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -374,8 +374,6 @@ class ContentProxy $entry->setUrl($url); break; case ['fragment']: - case ['query']: - case ['fragment', 'query']: // noop break; default: -- cgit v1.2.3 From 1b220426e2e8139364b4a34678a2843c2e8bccf5 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 22:33:32 +0200 Subject: phpcs Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/Wallabag') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 74130be8..d38811a2 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -349,6 +349,7 @@ class ContentProxy if ($this->ignoreUrl($entry->getUrl())) { $entry->setUrl($url); + return false; } @@ -360,7 +361,7 @@ class ContentProxy * 'only the path is different between the two urls' whereas * ['fragment', 'query'] means 'only fragment and query string parts are * different between the two urls'. - * + * * Note that values in $diff_keys are sorted. */ switch ($diff_keys) { -- cgit v1.2.3