aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorTobi823 <Tobi823@users.noreply.github.com>2018-09-18 15:04:19 +0200
committerTobi823 <Tobi823@users.noreply.github.com>2018-09-21 13:15:00 +0200
commitd76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e (patch)
tree9366674205ce85ab700cfc8b6905d9f906c1f39a /src
parent759c91940b9c97fdbd21a729c707ad686ded1202 (diff)
downloadwallabag-d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e.tar.gz
wallabag-d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e.tar.zst
wallabag-d76a5a6d60b6ee0d1f7efd0c8a70204f821ed99e.zip
Bugfix: Sanitize the title of a saved webpage from invalid UTF-8 characters
Diffstat (limited to 'src')
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php23
1 files changed, 23 insertions, 0 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index 3fe31c2c..2628af19 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -53,6 +53,7 @@ class ContentProxy
53 53
54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { 54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
55 $fetchedContent = $this->graby->fetchContent($url); 55 $fetchedContent = $this->graby->fetchContent($url);
56 $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']);
56 57
57 // when content is imported, we have information in $content 58 // when content is imported, we have information in $content
58 // in case fetching content goes bad, we'll keep the imported information instead of overriding them 59 // in case fetching content goes bad, we'll keep the imported information instead of overriding them
@@ -69,6 +70,28 @@ class ContentProxy
69 } 70 }
70 71
/**
 * Make sure the given text is valid UTF-8.
 *
 * - Text that already is valid UTF-8 is returned unchanged.
 * - Anything else is assumed to be ISO-8859-1 and is converted to UTF-8.
 *
 * Every byte value is a valid ISO-8859-1 character, so the conversion cannot
 * produce invalid UTF-8 and no further "strip invalid bytes" fallback is needed.
 *
 * @param string $rawText Text in an unknown encoding
 *
 * @return string Valid UTF-8 text
 */
private function sanitizeUTF8Text(string $rawText): string
{
    if (mb_check_encoding($rawText, 'UTF-8')) {
        // Already valid UTF-8: nothing to do.
        return $rawText;
    }

    // We assume that $rawText is encoded in ISO-8859-1 (and not in the similar
    // Windows-1252 or any other encoding).
    // mb_convert_encoding() produces the same bytes as utf8_encode(), which is
    // deprecated as of PHP 8.2.
    return mb_convert_encoding($rawText, 'UTF-8', 'ISO-8859-1');
}
93
94 /**
72 * Use a Symfony validator to ensure the language is well formatted. 95 * Use a Symfony validator to ensure the language is well formatted.
73 * 96 *
74 * @param Entry $entry 97 * @param Entry $entry