X-Git-Url: https://git.immae.eu/?a=blobdiff_plain;f=src%2FWallabag%2FCoreBundle%2FHelper%2FContentProxy.php;h=29259bbdb6ca57e0f66927a1993b029e4036ceec;hb=8648f0c00534e8af83b2a5451269d79906db6c16;hp=3fe31c2c7d68b9115ab8bb412588270e87993873;hpb=ada5d5b2694ec95c6ca84aa91f22add1973343e0;p=github%2Fwallabag%2Fwallabag.git

diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index 3fe31c2c..29259bbd 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -53,6 +53,7 @@ class ContentProxy
 
         if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
             $fetchedContent = $this->graby->fetchContent($url);
+            $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']);
 
             // when content is imported, we have information in $content
             // in case fetching content goes bad, we'll keep the imported information instead of overriding them
@@ -68,6 +69,36 @@ class ContentProxy
         $this->stockEntry($entry, $content);
     }
 
+    /**
+     * Make sure the given text is valid UTF-8.
+     *
+     * - text that already is valid UTF-8 is returned unchanged
+     * - any other text is assumed to be ISO-8859-1 and is converted to UTF-8
+     *
+     * Every byte sequence is valid ISO-8859-1, so the conversion always yields valid UTF-8
+     * and no further fallback (such as stripping bytes with iconv) is needed.
+     * Empty and non-string values are returned untouched.
+     *
+     * @param string $rawText text in an unknown encoding
+     *
+     * @return string valid UTF-8 text
+     */
+    private function sanitizeUTF8Text($rawText)
+    {
+        // nothing to check or convert: hand empty and non-string values back as they are
+        if (!is_string($rawText) || '' === $rawText) {
+            return $rawText;
+        }
+
+        if (mb_check_encoding($rawText, 'UTF-8')) {
+            return $rawText;
+        }
+
+        // we assume that $rawText is encoded in ISO-8859-1 (and not the similar Windows-1252 or another encoding);
+        // mb_convert_encoding() produces the same result as utf8_encode(), which is deprecated as of PHP 8.2
+        return mb_convert_encoding($rawText, 'UTF-8', 'ISO-8859-1');
+    }
+
     /**
      * Use a Symfony validator to ensure the language is well formatted.
      *