From d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Tue, 18 Sep 2018 15:04:19 +0200 Subject: Bugfix: Sanitize the title of a saved webpage from invalid UTF-8 characters --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 3fe31c2c..2628af19 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -53,6 +53,7 @@ class ContentProxy if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { $fetchedContent = $this->graby->fetchContent($url); + $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']); // when content is imported, we have information in $content // in case fetching content goes bad, we'll keep the imported information instead of overriding them @@ -68,6 +69,28 @@ class ContentProxy $this->stockEntry($entry, $content); } + /** + * Remove invalid UTF-8 characters from the given string in following steps: + * - try to interpret the given string as ISO-8859-1, convert it to UTF-8 and return it (if its valid) + * - simply remove every invalid UTF-8 character and return the result (https://stackoverflow.com/a/1433665) + * @param String $rawText + * @return string + */ + private function sanitizeUTF8Text(String $rawText) { + if (mb_check_encoding($rawText, 'utf-8')) { + return $rawText; // return because its valid utf-8 text + } + + // we assume that $text is encoded in ISO-8859-1 (and not the similar Windows-1252 or other encoding) + $convertedText = utf8_encode($rawText); + if (mb_check_encoding($convertedText, 'utf-8')) { + return $convertedText; + } + + // last resort: simply remove invalid UTF-8 character because $rawText can have some every exotic encoding + return iconv("UTF-8", "UTF-8//IGNORE", $rawText); + } + /** * Use a Symfony validator to ensure the language is well formatted. * -- cgit v1.2.3