diff options
author | Tobi823 <Tobi823@users.noreply.github.com> | 2018-09-18 15:04:19 +0200 |
---|---|---|
committer | Tobi823 <Tobi823@users.noreply.github.com> | 2018-09-21 13:15:00 +0200 |
commit | d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e (patch) | |
tree | 9366674205ce85ab700cfc8b6905d9f906c1f39a /src/Wallabag | |
parent | 759c91940b9c97fdbd21a729c707ad686ded1202 (diff) | |
download | wallabag-d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e.tar.gz wallabag-d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e.tar.zst wallabag-d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e.zip |
Bugfix: Sanitize the title of a saved webpage from invalid UTF-8 characters
Diffstat (limited to 'src/Wallabag')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 23 |
1 files changed, 23 insertions, 0 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 3fe31c2c..2628af19 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -53,6 +53,7 @@ class ContentProxy | |||
53 | 53 | ||
54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { | 54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { |
55 | $fetchedContent = $this->graby->fetchContent($url); | 55 | $fetchedContent = $this->graby->fetchContent($url); |
56 | $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']); | ||
56 | 57 | ||
57 | // when content is imported, we have information in $content | 58 | // when content is imported, we have information in $content |
58 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them | 59 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them |
@@ -69,6 +70,28 @@ class ContentProxy | |||
69 | } | 70 | } |
70 | 71 | ||
71 | /** | 72 | /** |
73 | * Remove invalid UTF-8 characters from the given string in following steps: | ||
74 | * - try to interpret the given string as ISO-8859-1, convert it to UTF-8 and return it (if it's valid) | |
75 | * - simply remove every invalid UTF-8 character and return the result (https://stackoverflow.com/a/1433665) | ||
76 | * @param String $rawText | ||
77 | * @return string | ||
78 | */ | ||
79 | private function sanitizeUTF8Text(String $rawText) { | ||
80 | if (mb_check_encoding($rawText, 'utf-8')) { | ||
81 | return $rawText; // return because it's valid UTF-8 text | |
82 | } | ||
83 | |||
84 | // we assume that $rawText is encoded in ISO-8859-1 (and not the similar Windows-1252 or other encoding) | |
85 | $convertedText = utf8_encode($rawText); | ||
86 | if (mb_check_encoding($convertedText, 'utf-8')) { | ||
87 | return $convertedText; | ||
88 | } | ||
89 | |||
90 | // last resort: simply remove invalid UTF-8 characters because $rawText can have a very exotic encoding | |
91 | return iconv("UTF-8", "UTF-8//IGNORE", $rawText); | ||
92 | } | ||
93 | |||
94 | /** | ||
72 | * Use a Symfony validator to ensure the language is well formatted. | 95 | * Use a Symfony validator to ensure the language is well formatted. |
73 | * | 96 | * |
74 | * @param Entry $entry | 97 | * @param Entry $entry |