diff options
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 94 |
1 files changed, 53 insertions, 41 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index ce82f6bc..d4ea608f 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -70,47 +70,6 @@ class ContentProxy | |||
70 | } | 70 | } |
71 | 71 | ||
72 | /** | 72 | /** |
73 | * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 characters. | ||
74 | * @param $title | ||
75 | * @param $contentType | ||
76 | * @return string | ||
77 | */ | ||
78 | private function sanitizeContentTitle($title, $contentType) { | ||
79 | if ('application/pdf' === $contentType) { | ||
80 | $title = $this->convertPdfEncodingToUTF8($title); | ||
81 | } | ||
82 | return $this->sanitizeUTF8Text($title); | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * If the title from the fetched content comes from a PDF, then it's very possible that the character encoding is not | ||
87 | * UTF-8. This method tries to identify the character encoding and translate the title to UTF-8. | ||
88 | * @param $title | ||
89 | * @return string (may contain invalid UTF-8 characters) | ||
90 | */ | ||
91 | private function convertPdfEncodingToUTF8($title) { | ||
92 | // first try UTF-8 because it's easier to detect its presence/absence | ||
93 | foreach (array('UTF-8', 'UTF-16BE', 'WINDOWS-1252') as $encoding) { | ||
94 | if (mb_check_encoding($title, $encoding)) { | ||
95 | return mb_convert_encoding($title, 'UTF-8', $encoding); | ||
96 | } | ||
97 | } | ||
98 | return $title; | ||
99 | } | ||
100 | |||
101 | /** | ||
102 | * Remove invalid UTF-8 characters from the given string. | ||
103 | * @param String $rawText | ||
104 | * @return string | ||
105 | */ | ||
106 | private function sanitizeUTF8Text($rawText) { | ||
107 | if (mb_check_encoding($rawText, 'UTF-8')) { | ||
108 | return $rawText; | ||
109 | } | ||
110 | return iconv("UTF-8", "UTF-8//IGNORE", $rawText); | ||
111 | } | ||
112 | |||
113 | /** | ||
114 | * Use a Symfony validator to ensure the language is well formatted. | 73 | * Use a Symfony validator to ensure the language is well formatted. |
115 | * | 74 | * |
116 | * @param Entry $entry | 75 | * @param Entry $entry |
@@ -219,6 +178,59 @@ class ContentProxy | |||
219 | } | 178 | } |
220 | 179 | ||
221 | /** | 180 | /** |
181 | * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 characters. | ||
182 | * | ||
183 | * @param $title | ||
184 | * @param $contentType | ||
185 | * | ||
186 | * @return string | ||
187 | */ | ||
188 | private function sanitizeContentTitle($title, $contentType) | ||
189 | { | ||
190 | if ('application/pdf' === $contentType) { | ||
191 | $title = $this->convertPdfEncodingToUTF8($title); | ||
192 | } | ||
193 | |||
194 | return $this->sanitizeUTF8Text($title); | ||
195 | } | ||
196 | |||
197 | /** | ||
198 | * If the title from the fetched content comes from a PDF, then it's very possible that the character encoding is not | ||
199 | * UTF-8. This method tries to identify the character encoding and translate the title to UTF-8. | ||
200 | * | ||
201 | * @param $title | ||
202 | * | ||
203 | * @return string (may contain invalid UTF-8 characters) | ||
204 | */ | ||
205 | private function convertPdfEncodingToUTF8($title) | ||
206 | { | ||
207 | // first try UTF-8 because it's easier to detect its presence/absence | ||
208 | foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) { | ||
209 | if (mb_check_encoding($title, $encoding)) { | ||
210 | return mb_convert_encoding($title, 'UTF-8', $encoding); | ||
211 | } | ||
212 | } | ||
213 | |||
214 | return $title; | ||
215 | } | ||
216 | |||
217 | /** | ||
218 | * Remove invalid UTF-8 characters from the given string. | ||
219 | * | ||
220 | * @param string $rawText | ||
221 | * | ||
222 | * @return string | ||
223 | */ | ||
224 | private function sanitizeUTF8Text($rawText) | ||
225 | { | ||
226 | if (mb_check_encoding($rawText, 'UTF-8')) { | ||
227 | return $rawText; | ||
228 | } | ||
229 | |||
230 | return iconv('UTF-8', 'UTF-8//IGNORE', $rawText); | ||
231 | } | ||
232 | |||
233 | /** | ||
222 | * Stock entry with fetched or imported content. | 234 | * Stock entry with fetched or imported content. |
223 | * Will fall back to OpenGraph data if available. | 235 | * Will fall back to OpenGraph data if available. |
224 | * | 236 | * |