diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 54 |
1 files changed, 54 insertions, 0 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 3fe31c2c..d4ea608f 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -53,6 +53,7 @@ class ContentProxy | |||
53 | 53 | ||
54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { | 54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { |
55 | $fetchedContent = $this->graby->fetchContent($url); | 55 | $fetchedContent = $this->graby->fetchContent($url); |
56 | $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']); | ||
56 | 57 | ||
57 | // when content is imported, we have information in $content | 58 | // when content is imported, we have information in $content |
58 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them | 59 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them |
@@ -177,6 +178,59 @@ class ContentProxy | |||
177 | } | 178 | } |
178 | 179 | ||
179 | /** | 180 | /** |
181 | * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character. | ||
182 | * | ||
183 | * @param $title | ||
184 | * @param $contentType | ||
185 | * | ||
186 | * @return string | ||
187 | */ | ||
188 | private function sanitizeContentTitle($title, $contentType) | ||
189 | { | ||
190 | if ('application/pdf' === $contentType) { | ||
191 | $title = $this->convertPdfEncodingToUTF8($title); | ||
192 | } | ||
193 | |||
194 | return $this->sanitizeUTF8Text($title); | ||
195 | } | ||
196 | |||
197 | /** | ||
198 | * If the title from the fetched content comes from a PDF, then its very possible that the character encoding is not | ||
199 | * UTF-8. This methods tries to identify the character encoding and translate the title to UTF-8. | ||
200 | * | ||
201 | * @param $title | ||
202 | * | ||
203 | * @return string (maybe contains invalid UTF-8 character) | ||
204 | */ | ||
205 | private function convertPdfEncodingToUTF8($title) | ||
206 | { | ||
207 | // first try UTF-8 because its easier to detect its present/absence | ||
208 | foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) { | ||
209 | if (mb_check_encoding($title, $encoding)) { | ||
210 | return mb_convert_encoding($title, 'UTF-8', $encoding); | ||
211 | } | ||
212 | } | ||
213 | |||
214 | return $title; | ||
215 | } | ||
216 | |||
217 | /** | ||
218 | * Remove invalid UTF-8 characters from the given string. | ||
219 | * | ||
220 | * @param string $rawText | ||
221 | * | ||
222 | * @return string | ||
223 | */ | ||
224 | private function sanitizeUTF8Text($rawText) | ||
225 | { | ||
226 | if (mb_check_encoding($rawText, 'UTF-8')) { | ||
227 | return $rawText; | ||
228 | } | ||
229 | |||
230 | return iconv('UTF-8', 'UTF-8//IGNORE', $rawText); | ||
231 | } | ||
232 | |||
233 | /** | ||
180 | * Stock entry with fetched or imported content. | 234 | * Stock entry with fetched or imported content. |
181 | * Will fall back to OpenGraph data if available. | 235 | * Will fall back to OpenGraph data if available. |
182 | * | 236 | * |