diff options
Diffstat (limited to 'src/Wallabag/CoreBundle/Helper/ContentProxy.php')
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 62 |
1 files changed, 58 insertions, 4 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index fe795d42..d4ea608f 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -53,6 +53,7 @@ class ContentProxy | |||
53 | 53 | ||
54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { | 54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { |
55 | $fetchedContent = $this->graby->fetchContent($url); | 55 | $fetchedContent = $this->graby->fetchContent($url); |
56 | $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']); | ||
56 | 57 | ||
57 | // when content is imported, we have information in $content | 58 | // when content is imported, we have information in $content |
58 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them | 59 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them |
@@ -85,7 +86,7 @@ class ContentProxy | |||
85 | (new LocaleConstraint()) | 86 | (new LocaleConstraint()) |
86 | ); | 87 | ); |
87 | 88 | ||
88 | if (0 === count($errors)) { | 89 | if (0 === \count($errors)) { |
89 | $entry->setLanguage($value); | 90 | $entry->setLanguage($value); |
90 | 91 | ||
91 | return; | 92 | return; |
@@ -107,7 +108,7 @@ class ContentProxy | |||
107 | (new UrlConstraint()) | 108 | (new UrlConstraint()) |
108 | ); | 109 | ); |
109 | 110 | ||
110 | if (0 === count($errors)) { | 111 | if (0 === \count($errors)) { |
111 | $entry->setPreviewPicture($value); | 112 | $entry->setPreviewPicture($value); |
112 | 113 | ||
113 | return; | 114 | return; |
@@ -177,6 +178,59 @@ class ContentProxy | |||
177 | } | 178 | } |
178 | 179 | ||
/**
 * Clean up the title of freshly fetched content.
 *
 * Titles of PDF documents are first run through the PDF-specific encoding
 * conversion; every title is then passed through the UTF-8 sanitizer.
 *
 * @param string $title       Title as returned by the content fetcher
 * @param string $contentType Content type reported for the fetched content
 *
 * @return string
 */
private function sanitizeContentTitle($title, $contentType)
{
    // Only PDF titles go through the encoding conversion step.
    $decodedTitle = 'application/pdf' === $contentType
        ? $this->convertPdfEncodingToUTF8($title)
        : $title;

    return $this->sanitizeUTF8Text($decodedTitle);
}
196 | |||
/**
 * Convert a PDF title to UTF-8.
 *
 * The title is checked against a fixed list of candidate encodings, in
 * order; the first one it validates against is used as the source encoding
 * for the conversion. When no candidate matches, the title is returned as is.
 *
 * @param string $title
 *
 * @return string (may still contain invalid UTF-8 characters)
 */
private function convertPdfEncodingToUTF8($title)
{
    // UTF-8 is probed first because its presence/absence is the easiest to detect.
    $candidates = ['UTF-8', 'UTF-16BE', 'WINDOWS-1252'];

    $sourceEncoding = null;
    foreach ($candidates as $candidate) {
        if (mb_check_encoding($title, $candidate)) {
            $sourceEncoding = $candidate;
            break;
        }
    }

    if (null === $sourceEncoding) {
        return $title;
    }

    return mb_convert_encoding($title, 'UTF-8', $sourceEncoding);
}
216 | |||
/**
 * Remove invalid UTF-8 byte sequences from the given string.
 *
 * Input that already validates as UTF-8 is returned untouched. Otherwise the
 * invalid sequences are dropped with iconv(); if iconv() reports a failure
 * (it returns false), mbstring is used instead so that the method still
 * returns a string.
 *
 * @param string $rawText
 *
 * @return string
 */
private function sanitizeUTF8Text($rawText)
{
    if (mb_check_encoding($rawText, 'UTF-8')) {
        return $rawText;
    }

    $cleaned = iconv('UTF-8', 'UTF-8//IGNORE', $rawText);
    if (false !== $cleaned) {
        return $cleaned;
    }

    // iconv() signals failure by returning false; do not let that value
    // escape as the "sanitized" text. Fall back to mbstring, configured to
    // drop (rather than substitute) anything it cannot convert.
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character('none');

    try {
        return mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');
    } finally {
        // The substitute character is a process-wide setting: always restore it.
        mb_substitute_character($previousSubstitute);
    }
}
232 | |||
233 | /** | ||
180 | * Stock entry with fetched or imported content. | 234 | * Stock entry with fetched or imported content. |
181 | * Will fall back to OpenGraph data if available. | 235 | * Will fall back to OpenGraph data if available. |
182 | * | 236 | * |
@@ -212,7 +266,7 @@ class ContentProxy | |||
212 | $entry->setHttpStatus($content['status']); | 266 | $entry->setHttpStatus($content['status']); |
213 | } | 267 | } |
214 | 268 | ||
215 | if (!empty($content['authors']) && is_array($content['authors'])) { | 269 | if (!empty($content['authors']) && \is_array($content['authors'])) { |
216 | $entry->setPublishedBy($content['authors']); | 270 | $entry->setPublishedBy($content['authors']); |
217 | } | 271 | } |
218 | 272 | ||
@@ -233,7 +287,7 @@ class ContentProxy | |||
233 | } | 287 | } |
234 | 288 | ||
235 | // if content is an image, define it as a preview too | 289 | // if content is an image, define it as a preview too |
236 | if (!empty($content['content_type']) && in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) { | 290 | if (!empty($content['content_type']) && \in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) { |
237 | $this->updatePreviewPicture($entry, $content['url']); | 291 | $this->updatePreviewPicture($entry, $content['url']); |
238 | } | 292 | } |
239 | 293 | ||