aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorKevin Decherf <kevin@kdecherf.com>2018-09-24 18:34:16 +0200
committerGitHub <noreply@github.com>2018-09-24 18:34:16 +0200
commit0f5c15d5434ab1ce22f250aeb56271162a6deca0 (patch)
tree0888d976a97bebae1cdea6433f2f227a9f0ec95c /src
parent759c91940b9c97fdbd21a729c707ad686ded1202 (diff)
parent28cc645b93a3505f39f8b5655e5f860544c023b4 (diff)
downloadwallabag-0f5c15d5434ab1ce22f250aeb56271162a6deca0.tar.gz
wallabag-0f5c15d5434ab1ce22f250aeb56271162a6deca0.tar.zst
wallabag-0f5c15d5434ab1ce22f250aeb56271162a6deca0.zip
Merge pull request #3725 from Tobi823/master
Bugfix: Sanitize the title of a saved webpage from invalid UTF-8 characters.
Diffstat (limited to 'src')
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php54
1 files changed, 54 insertions, 0 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index 3fe31c2c..d4ea608f 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -53,6 +53,7 @@ class ContentProxy
53 53
54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { 54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
55 $fetchedContent = $this->graby->fetchContent($url); 55 $fetchedContent = $this->graby->fetchContent($url);
56 $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);
56 57
57 // when content is imported, we have information in $content 58 // when content is imported, we have information in $content
58 // in case fetching content goes bad, we'll keep the imported information instead of overriding them 59 // in case fetching content goes bad, we'll keep the imported information instead of overriding them
@@ -177,6 +178,59 @@ class ContentProxy
177 } 178 }
178 179
179 /** 180 /**
181 * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character.
182 *
183 * @param $title
184 * @param $contentType
185 *
186 * @return string
187 */
188 private function sanitizeContentTitle($title, $contentType)
189 {
190 if ('application/pdf' === $contentType) {
191 $title = $this->convertPdfEncodingToUTF8($title);
192 }
193
194 return $this->sanitizeUTF8Text($title);
195 }
196
/**
 * If the title of the fetched content comes from a PDF, it is quite possible that its character
 * encoding is not UTF-8. This method tries to identify the encoding and converts the title to UTF-8.
 *
 * NOTE(review): the UTF-16BE probe looks like it can also accept single-byte text of even length,
 * which would then be converted as UTF-16BE before WINDOWS-1252 is tried. Confirm against real
 * PDF titles.
 *
 * @param string $title
 *
 * @return string (may still contain invalid UTF-8 characters)
 */
private function convertPdfEncodingToUTF8($title)
{
    // Candidates are probed in order; UTF-8 goes first because its presence/absence is the easiest to detect.
    $candidateEncodings = ['UTF-8', 'UTF-16BE', 'WINDOWS-1252'];

    foreach ($candidateEncodings as $candidate) {
        if (!mb_check_encoding($title, $candidate)) {
            continue;
        }

        // The first encoding the title is valid in wins.
        return mb_convert_encoding($title, 'UTF-8', $candidate);
    }

    // No candidate matched: hand the title back untouched.
    return $title;
}
216
/**
 * Remove invalid UTF-8 characters from the given string.
 *
 * Text that is already valid UTF-8 is returned unchanged. Otherwise every invalid byte sequence is
 * dropped; nothing is substituted for it.
 *
 * @param string $rawText
 *
 * @return string
 */
private function sanitizeUTF8Text($rawText)
{
    if (mb_check_encoding($rawText, 'UTF-8')) {
        return $rawText;
    }

    // iconv()'s "//IGNORE" suffix depends on the platform's iconv implementation: some reject the
    // suffix, others return false and raise a notice on illegal input. mbstring behaves the same
    // everywhere, so it is used to drop the invalid sequences instead.
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character('none');

    try {
        return mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');
    } finally {
        // The substitute character is a process-wide setting, so it must be restored.
        mb_substitute_character($previousSubstitute);
    }
}
232
233 /**
180 * Stock entry with fetched or imported content. 234 * Stock entry with fetched or imported content.
181 * Will fall back to OpenGraph data if available. 235 * Will fall back to OpenGraph data if available.
182 * 236 *