From d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Tue, 18 Sep 2018 15:04:19 +0200 Subject: Bugfix: Sanitize the title of a saved webpage from invalid UTF-8 characters --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 3fe31c2c..2628af19 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -53,6 +53,7 @@ class ContentProxy if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { $fetchedContent = $this->graby->fetchContent($url); + $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']); // when content is imported, we have information in $content // in case fetching content goes bad, we'll keep the imported information instead of overriding them @@ -68,6 +69,28 @@ class ContentProxy $this->stockEntry($entry, $content); } + /** + * Remove invalid UTF-8 characters from the given string in following steps: + * - try to interpret the given string as ISO-8859-1, convert it to UTF-8 and return it (if its valid) + * - simply remove every invalid UTF-8 character and return the result (https://stackoverflow.com/a/1433665) + * @param String $rawText + * @return string + */ + private function sanitizeUTF8Text(String $rawText) { + if (mb_check_encoding($rawText, 'utf-8')) { + return $rawText; // return because its valid utf-8 text + } + + // we assume that $text is encoded in ISO-8859-1 (and not the similar Windows-1252 or other encoding) + $convertedText = utf8_encode($rawText); + if (mb_check_encoding($convertedText, 'utf-8')) { + return $convertedText; + } + + // last resort: simply remove invalid UTF-8 character because $rawText can have some every exotic encoding + return iconv("UTF-8", "UTF-8//IGNORE", $rawText); + } + /** * Use a Symfony validator to ensure the language is well formatted. * -- cgit v1.2.3 From 8648f0c00534e8af83b2a5451269d79906db6c16 Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Wed, 19 Sep 2018 11:03:42 +0200 Subject: Remove type declaration for PHP 5 compatibility --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 2628af19..29259bbd 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -76,7 +76,7 @@ class ContentProxy * @param String $rawText * @return string */ - private function sanitizeUTF8Text(String $rawText) { + private function sanitizeUTF8Text($rawText) { if (mb_check_encoding($rawText, 'utf-8')) { return $rawText; // return because its valid utf-8 text } -- cgit v1.2.3 From f80f16dfc858ec90da76daacd405b0cfdaa32f74 Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Wed, 19 Sep 2018 12:30:26 +0200 Subject: Try to detect the character encoding in PDFs and try to translate the title from the PDF to UTF-8 --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 46 ++++++++++++++++++------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 29259bbd..fab05268 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -53,7 +53,7 @@ class ContentProxy if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { $fetchedContent = $this->graby->fetchContent($url); - $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']); + $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']); // when content is imported, we have information in $content // in case fetching content goes bad, we'll keep the imported information instead of overriding them @@ -70,24 +70,44 @@ class ContentProxy } /** - * Remove invalid UTF-8 characters from the given string in following steps: - * - try to interpret the given string as ISO-8859-1, convert it to UTF-8 and return it (if its valid) - * - simply remove every invalid UTF-8 character and return the result (https://stackoverflow.com/a/1433665) - * @param String $rawText + * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character. + * @param $title + * @param $contentType * @return string */ - private function sanitizeUTF8Text($rawText) { - if (mb_check_encoding($rawText, 'utf-8')) { - return $rawText; // return because its valid utf-8 text + private function sanitizeContentTitle($title, $contentType) { + if ('application/pdf' === $contentType) { + $convertedTitle = $this->convertPdfEncodingToUTF8($title); + return $this->sanitizeUTF8Text($convertedTitle); } + return $this->sanitizeUTF8Text($title); + } - // we assume that $text is encoded in ISO-8859-1 (and not the similar Windows-1252 or other encoding) - $convertedText = utf8_encode($rawText); - if (mb_check_encoding($convertedText, 'utf-8')) { - return $convertedText; + /** + * If the title from the fetched content comes from a PDF, then its very possible that the character encoding is not + * UTF-8. This methods tries to identify the character encoding and translate the title to UTF-8. + * @param $title + * @return string (maybe contains invalid UTF-8 character) + */ + private function convertPdfEncodingToUTF8($title) { + // first try UTF-16 (then UTF-8) because its easier to detect its present/absence + foreach (array('UTF-16BE', 'UTF-16LE', 'UTF-8', 'WINDOWS-1252') as $encoding) { + if (mb_check_encoding($title, $encoding)) { + return mb_convert_encoding($title, 'UTF-8', $encoding); + } } + return $title; + } - // last resort: simply remove invalid UTF-8 character because $rawText can have some every exotic encoding + /** + * Remove invalid UTF-8 characters from the given string. + * @param String $rawText + * @return string + */ + private function sanitizeUTF8Text($rawText) { + if (mb_check_encoding($rawText, 'UTF-8')) { + return $rawText; + } return iconv("UTF-8", "UTF-8//IGNORE", $rawText); } -- cgit v1.2.3 From c01d9532920ec5a298bb347dbb83a078d36d4841 Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Wed, 19 Sep 2018 13:59:07 +0200 Subject: Add tests for logic Try to translate the title of a PDF from UTF-8 (then UTF-16BE, then WINDOWS-1252) to UTF-8 --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index fab05268..50090100 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -90,8 +90,8 @@ class ContentProxy * @return string (maybe contains invalid UTF-8 character) */ private function convertPdfEncodingToUTF8($title) { - // first try UTF-16 (then UTF-8) because its easier to detect its present/absence - foreach (array('UTF-16BE', 'UTF-16LE', 'UTF-8', 'WINDOWS-1252') as $encoding) { + // first try UTF-8 because its easier to detect its present/absence + foreach (array('UTF-8', 'UTF-16BE', 'WINDOWS-1252') as $encoding) { if (mb_check_encoding($title, $encoding)) { return mb_convert_encoding($title, 'UTF-8', $encoding); } -- cgit v1.2.3 From 7a65c2017bf4dd47414df27d0a07829580392c96 Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Fri, 21 Sep 2018 13:23:39 +0200 Subject: Override the value of the given parameter ($title) with the (hopefully) correct (to UTF-8) converted PDF title --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 50090100..ce82f6bc 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -77,8 +77,7 @@ class ContentProxy */ private function sanitizeContentTitle($title, $contentType) { if ('application/pdf' === $contentType) { - $convertedTitle = $this->convertPdfEncodingToUTF8($title); - return $this->sanitizeUTF8Text($convertedTitle); + $title = $this->convertPdfEncodingToUTF8($title); } return $this->sanitizeUTF8Text($title); } -- cgit v1.2.3 From 83f1c3274f02ece12bd06c0b8df61d5e4b3236e7 Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Sun, 23 Sep 2018 22:20:43 +0200 Subject: Run php-cs-fixer for fixing coding standard issues --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 94 ++++++++++++++----------- 1 file changed, 53 insertions(+), 41 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index ce82f6bc..d4ea608f 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -69,47 +69,6 @@ class ContentProxy $this->stockEntry($entry, $content); } - /** - * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character. - * @param $title - * @param $contentType - * @return string - */ - private function sanitizeContentTitle($title, $contentType) { - if ('application/pdf' === $contentType) { - $title = $this->convertPdfEncodingToUTF8($title); - } - return $this->sanitizeUTF8Text($title); - } - - /** - * If the title from the fetched content comes from a PDF, then its very possible that the character encoding is not - * UTF-8. This methods tries to identify the character encoding and translate the title to UTF-8. - * @param $title - * @return string (maybe contains invalid UTF-8 character) - */ - private function convertPdfEncodingToUTF8($title) { - // first try UTF-8 because its easier to detect its present/absence - foreach (array('UTF-8', 'UTF-16BE', 'WINDOWS-1252') as $encoding) { - if (mb_check_encoding($title, $encoding)) { - return mb_convert_encoding($title, 'UTF-8', $encoding); - } - } - return $title; - } - - /** - * Remove invalid UTF-8 characters from the given string. - * @param String $rawText - * @return string - */ - private function sanitizeUTF8Text($rawText) { - if (mb_check_encoding($rawText, 'UTF-8')) { - return $rawText; - } - return iconv("UTF-8", "UTF-8//IGNORE", $rawText); - } - /** * Use a Symfony validator to ensure the language is well formatted. * @@ -218,6 +177,59 @@ class ContentProxy $entry->setTitle($path); } + /** + * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character. + * + * @param $title + * @param $contentType + * + * @return string + */ + private function sanitizeContentTitle($title, $contentType) + { + if ('application/pdf' === $contentType) { + $title = $this->convertPdfEncodingToUTF8($title); + } + + return $this->sanitizeUTF8Text($title); + } + + /** + * If the title from the fetched content comes from a PDF, then its very possible that the character encoding is not + * UTF-8. This methods tries to identify the character encoding and translate the title to UTF-8. + * + * @param $title + * + * @return string (maybe contains invalid UTF-8 character) + */ + private function convertPdfEncodingToUTF8($title) + { + // first try UTF-8 because its easier to detect its present/absence + foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) { + if (mb_check_encoding($title, $encoding)) { + return mb_convert_encoding($title, 'UTF-8', $encoding); + } + } + + return $title; + } + + /** + * Remove invalid UTF-8 characters from the given string. + * + * @param string $rawText + * + * @return string + */ + private function sanitizeUTF8Text($rawText) + { + if (mb_check_encoding($rawText, 'UTF-8')) { + return $rawText; + } + + return iconv('UTF-8', 'UTF-8//IGNORE', $rawText); + } + /** * Stock entry with fetched or imported content. * Will fall back to OpenGraph data if available. -- cgit v1.2.3 From 4a81360efcdfe4bab8d75f7227c9cf5bfd514189 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sun, 7 Jan 2018 17:25:26 +0100 Subject: ContentProxy: fix a corner case when entry.url is empty in updateEntry Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index d4ea608f..f0d8c1b4 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -66,6 +66,14 @@ class ContentProxy // so we'll be able to refetch it in the future $content['url'] = !empty($content['url']) ? $content['url'] : $url; + // In one case (at least in tests), url is empty here + // so we set it using $url provided in the updateEntry call. + // Not sure what are the other possible cases where this property is empty + if (empty($entry->getUrl()) && !empty($url)) + { + $entry->setUrl($url); + } + $this->stockEntry($entry, $content); } -- cgit v1.2.3 From 781864b9546b0ff2d6fe42ce72f78b8f40b785e9 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sun, 7 Jan 2018 17:28:04 +0100 Subject: ContentProxy: swap entry url to origin_url and set new url according to graby content Closes #3529 Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index f0d8c1b4..da0ec5a3 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -68,9 +68,8 @@ class ContentProxy // In one case (at least in tests), url is empty here // so we set it using $url provided in the updateEntry call. - // Not sure what are the other possible cases where this property is empty - if (empty($entry->getUrl()) && !empty($url)) - { + // Not sure what are the other possible cases where this property is empty + if (empty($entry->getUrl()) && !empty($url)) { $entry->setUrl($url); } @@ -247,7 +246,15 @@ class ContentProxy */ private function stockEntry(Entry $entry, array $content) { - $entry->setUrl($content['url']); + // When a redirection occurs while fetching an entry + // we move the original url in origin_url property if empty + // and set the entry url with the final value + if (!empty($content['url']) && $entry->getUrl() !== $content['url']) { + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($content['url']); + } $this->setEntryDomainName($entry); -- cgit v1.2.3 From e07fadea76aa7329c4b955a59e74cb867c733706 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Thu, 6 Sep 2018 22:26:20 +0200 Subject: Refactor updateOriginUrl to include new behaviors behaviors - Leave origin_url unchanged if difference is an ending slash - Leave origin_url unchanged if difference is scheme - Ignore (noop) if difference is query string or fragment Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 54 ++++++++++++++++++++----- 1 file changed, 45 insertions(+), 9 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index da0ec5a3..007ee8bb 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -246,15 +246,7 @@ class ContentProxy */ private function stockEntry(Entry $entry, array $content) { - // When a redirection occurs while fetching an entry - // we move the original url in origin_url property if empty - // and set the entry url with the final value - if (!empty($content['url']) && $entry->getUrl() !== $content['url']) { - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } - $entry->setUrl($content['url']); - } + $this->updateOriginUrl($entry, $content['url']); $this->setEntryDomainName($entry); @@ -320,6 +312,50 @@ class ContentProxy } } + /** + * Update the origin_url field when a redirection occurs + * This field is set if it is empty and new url does not match ignore list. + * + * @param Entry $entry + * @param string $url + */ + private function updateOriginUrl(Entry $entry, $url) + { + if (!empty($url) && $entry->getUrl() !== $url) { + $parsed_entry_url = parse_url($entry->getUrl()); + $parsed_content_url = parse_url($url); + + $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); + $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); + + $diff = array_merge($diff_ec, $diff_ce); + $diff_keys = array_keys($diff); + sort($diff_keys); + + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + $entry->setUrl($url); + } + break; + case ['scheme']: + $entry->setUrl($url); + break; + case ['fragment']: + case ['query']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($url); + break; + } + } + } + /** * Validate that the given content has at least a title, an html and a url. * -- cgit v1.2.3 From fc040c749dec0275e562182562c1c1cb89e6cfa1 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Mon, 22 Oct 2018 23:08:58 +0200 Subject: updateOriginUrl: add behavior when diff is fragment and query Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 1 + 1 file changed, 1 insertion(+) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 007ee8bb..1a2a330f 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -344,6 +344,7 @@ class ContentProxy break; case ['fragment']: case ['query']: + case ['fragment', 'query']: // noop break; default: -- cgit v1.2.3 From b49c87acf12f22e38db751fb35be5da2436abc45 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Mon, 22 Oct 2018 23:39:31 +0200 Subject: ignoreOriginUrl: add initial support of ignore lists Add the ability to specify hosts and patterns lists to ignore the given entry url and replace it with the fetched content url without touching to origin_url. This initial support should be reworked in the following months to move the hardcoded ignore lists in the database. Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 79 ++++++++++++++++++------- 1 file changed, 59 insertions(+), 20 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 1a2a330f..2dc436f8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -332,31 +332,70 @@ class ContentProxy $diff_keys = array_keys($diff); sort($diff_keys); - switch ($diff_keys) { - case ['path']: - if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry - || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + if ($this->ignoreUrl($entry->getUrl())) { + $entry->setUrl($url); + } else { + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + $entry->setUrl($url); + } + break; + case ['scheme']: $entry->setUrl($url); - } - break; - case ['scheme']: - $entry->setUrl($url); - break; - case ['fragment']: - case ['query']: - case ['fragment', 'query']: - // noop - break; - default: - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } - $entry->setUrl($url); - break; + break; + case ['fragment']: + case ['query']: + case ['fragment', 'query']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); + } + $entry->setUrl($url); + break; + } } } } + /** + * Check entry url against an ignore list to replace with content url. + * + * XXX: move the ignore list in the database to let users handle it + * + * @param string $url url to test + * + * @return bool true if url matches ignore list otherwise false + */ + private function ignoreUrl($url) + { + $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com']; + $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*']; + + $parsed_url = parse_url($url); + + $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) { + return $var === $parsed_url['host']; + }); + + if ([] !== $filtered) { + return true; + } + + $filtered = array_filter($ignored_patterns, function ($var) use ($url) { + return preg_match("`$var`i", $url); + }); + + if ([] !== $filtered) { + return true; + } + + return false; + } + /** * Validate that the given content has at least a title, an html and a url. * -- cgit v1.2.3 From 5ba5e22a092068aeb12213578fd8fc4edb2399fe Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 21:54:09 +0200 Subject: updateOriginUrl: rewrite some if, resolving feedbacks from PR Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 67 +++++++++++++------------ 1 file changed, 35 insertions(+), 32 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 2dc436f8..92351986 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -321,43 +321,46 @@ class ContentProxy */ private function updateOriginUrl(Entry $entry, $url) { - if (!empty($url) && $entry->getUrl() !== $url) { - $parsed_entry_url = parse_url($entry->getUrl()); - $parsed_content_url = parse_url($url); + if (empty($url) || $entry->getUrl() === $url) { + return false; + } + + $parsed_entry_url = parse_url($entry->getUrl()); + $parsed_content_url = parse_url($url); - $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); - $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); + $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); + $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); - $diff = array_merge($diff_ec, $diff_ce); - $diff_keys = array_keys($diff); - sort($diff_keys); + $diff = array_merge($diff_ec, $diff_ce); + $diff_keys = array_keys($diff); + sort($diff_keys); + + if ($this->ignoreUrl($entry->getUrl())) { + $entry->setUrl($url); + return false; + } - if ($this->ignoreUrl($entry->getUrl())) { + switch ($diff_keys) { + case ['path']: + if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry + || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId + $entry->setUrl($url); + } + break; + case ['scheme']: $entry->setUrl($url); - } else { - switch ($diff_keys) { - case ['path']: - if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry - || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId - $entry->setUrl($url); - } - break; - case ['scheme']: - $entry->setUrl($url); - break; - case ['fragment']: - case ['query']: - case ['fragment', 'query']: - // noop - break; - default: - if (empty($entry->getOriginUrl())) { - $entry->setOriginUrl($entry->getUrl()); - } - $entry->setUrl($url); - break; + break; + case ['fragment']: + case ['query']: + case ['fragment', 'query']: + // noop + break; + default: + if (empty($entry->getOriginUrl())) { + $entry->setOriginUrl($entry->getUrl()); } - } + $entry->setUrl($url); + break; } } -- cgit v1.2.3 From 44e63667d9cf331aeedef8cb964538823c0a145d Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 22:11:35 +0200 Subject: updateOriginUrl: add comment blocks for the parse_url diff check Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 92351986..a93f4a2d 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -328,6 +328,18 @@ class ContentProxy $parsed_entry_url = parse_url($entry->getUrl()); $parsed_content_url = parse_url($url); + /** + * The following part computes the list of part changes between two + * parse_url arrays. + * + * As array_diff_assoc only computes changes to go from the left array + * to the right one, we make two differents arrays to have both + * directions. We merge these two arrays and sort keys before passing + * the result to the switch. + * + * The resulting array gives us all changing parts between the two + * urls: scheme, host, path, query and/or fragment. + */ $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url); $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url); @@ -340,6 +352,17 @@ class ContentProxy return false; } + /** + * This switch case lets us apply different behaviors according to + * changing parts of urls. + * + * As $diff_keys is an array, we provide arrays as cases. ['path'] means + * 'only the path is different between the two urls' whereas + * ['fragment', 'query'] means 'only fragment and query string parts are + * different between the two urls'. + * + * Note that values in $diff_keys are sorted. + */ switch ($diff_keys) { case ['path']: if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry -- cgit v1.2.3 From 60599679519e819301ce36185c3dd5ca7aa7f4ec Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 22:27:27 +0200 Subject: updateOriginUrl: remove 'query string' case from ignore list Two urls with a different query string may refer to two different pages so keep them both. Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 2 -- 1 file changed, 2 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index a93f4a2d..74130be8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -374,8 +374,6 @@ class ContentProxy $entry->setUrl($url); break; case ['fragment']: - case ['query']: - case ['fragment', 'query']: // noop break; default: -- cgit v1.2.3 From 1b220426e2e8139364b4a34678a2843c2e8bccf5 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Wed, 24 Oct 2018 22:33:32 +0200 Subject: phpcs Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 74130be8..d38811a2 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -349,6 +349,7 @@ class ContentProxy if ($this->ignoreUrl($entry->getUrl())) { $entry->setUrl($url); + return false; } @@ -360,7 +361,7 @@ class ContentProxy * 'only the path is different between the two urls' whereas * ['fragment', 'query'] means 'only fragment and query string parts are * different between the two urls'. - * + * * Note that values in $diff_keys are sorted. */ switch ($diff_keys) { -- cgit v1.2.3