aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php94
1 files changed, 53 insertions, 41 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index ce82f6bc..d4ea608f 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -70,47 +70,6 @@ class ContentProxy
70 } 70 }
71 71
72 /** 72 /**
73 * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character.
74 * @param $title
75 * @param $contentType
76 * @return string
77 */
private function sanitizeContentTitle($title, $contentType)
{
    // Only PDF titles go through the encoding conversion step;
    // every other content type is passed on untouched.
    $isPdf = 'application/pdf' === $contentType;
    $candidateTitle = $isPdf ? $this->convertPdfEncodingToUTF8($title) : $title;

    // Whatever the content type, invalid UTF-8 bytes are removed last.
    return $this->sanitizeUTF8Text($candidateTitle);
}
84
85 /**
86 * If the title from the fetched content comes from a PDF, then it is quite possible that the character encoding is not
87 * UTF-8. This method tries to identify the character encoding and convert the title to UTF-8.
88 * @param $title
89 * @return string (maybe contains invalid UTF-8 character)
90 */
private function convertPdfEncodingToUTF8($title)
{
    // Candidate encodings, probed in this exact order. UTF-8 goes first
    // because its presence or absence is the easiest to detect.
    // NOTE(review): the UTF-16BE probe may also accept byte strings that are
    // really Windows-1252; confirm whether a stricter check is needed.
    $candidates = ['UTF-8', 'UTF-16BE', 'WINDOWS-1252'];
    $total = count($candidates);

    for ($i = 0; $i < $total; ++$i) {
        $candidate = $candidates[$i];

        if (!mb_check_encoding($title, $candidate)) {
            continue;
        }

        // First encoding the title validates against wins.
        return mb_convert_encoding($title, 'UTF-8', $candidate);
    }

    // No candidate matched: hand the title back unchanged.
    return $title;
}
100
101 /**
102 * Remove invalid UTF-8 characters from the given string.
103 * @param String $rawText
104 * @return string
105 */
private function sanitizeUTF8Text($rawText)
{
    // Text that is already valid UTF-8 is returned untouched.
    if (mb_check_encoding($rawText, 'UTF-8')) {
        return $rawText;
    }

    // iconv() with "//IGNORE" is platform dependent: on some builds it raises
    // a notice and returns false for malformed input instead of dropping the
    // offending bytes. mbstring is used instead so a string is always returned.
    // The substitute character is switched to "none" so invalid bytes are
    // removed rather than replaced, then restored because it is a global setting.
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character('none');

    $cleanText = mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');

    mb_substitute_character($previousSubstitute);

    return $cleanText;
}
112
113 /**
114 * Use a Symfony validator to ensure the language is well formatted. 73 * Use a Symfony validator to ensure the language is well formatted.
115 * 74 *
116 * @param Entry $entry 75 * @param Entry $entry
@@ -219,6 +178,59 @@ class ContentProxy
219 } 178 }
220 179
221 /** 180 /**
181 * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character.
182 *
183 * @param $title
184 * @param $contentType
185 *
186 * @return string
187 */
private function sanitizeContentTitle($title, $contentType)
{
    // Non-PDF content skips the encoding conversion and goes straight
    // to the UTF-8 clean-up.
    if ('application/pdf' !== $contentType) {
        return $this->sanitizeUTF8Text($title);
    }

    // PDF titles are first converted to UTF-8, then cleaned of
    // any bytes that are still invalid.
    $convertedTitle = $this->convertPdfEncodingToUTF8($title);

    return $this->sanitizeUTF8Text($convertedTitle);
}
196
197 /**
198 * If the title from the fetched content comes from a PDF, then it is quite possible that the character encoding is not
199 * UTF-8. This method tries to identify the character encoding and convert the title to UTF-8.
200 *
201 * @param $title
202 *
203 * @return string (maybe contains invalid UTF-8 character)
204 */
private function convertPdfEncodingToUTF8($title)
{
    // Encodings are probed in this order; UTF-8 comes first because its
    // presence or absence is the easiest to detect.
    // NOTE(review): the UTF-16BE probe may also accept byte strings that are
    // really Windows-1252; confirm whether a stricter check is needed.
    $candidateEncodings = ['UTF-8', 'UTF-16BE', 'WINDOWS-1252'];

    foreach ($candidateEncodings as $candidate) {
        $matches = mb_check_encoding($title, $candidate);

        if (!$matches) {
            continue;
        }

        // The first encoding the title validates against is used as the source.
        return mb_convert_encoding($title, 'UTF-8', $candidate);
    }

    // Nothing matched, so the title is returned as received.
    return $title;
}
216
217 /**
218 * Remove invalid UTF-8 characters from the given string.
219 *
220 * @param string $rawText
221 *
222 * @return string
223 */
private function sanitizeUTF8Text($rawText)
{
    // Nothing to do when the text is already valid UTF-8.
    if (mb_check_encoding($rawText, 'UTF-8')) {
        return $rawText;
    }

    // iconv() with "//IGNORE" is platform dependent: on some builds it raises
    // a notice and returns false for malformed input instead of dropping the
    // offending bytes. mbstring is used instead so a string is always returned.
    // With the substitute character set to "none", invalid bytes are removed
    // rather than replaced; the previous setting is restored afterwards
    // because it is global to the process.
    $previousSubstitute = mb_substitute_character();
    mb_substitute_character('none');

    $cleanText = mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');

    mb_substitute_character($previousSubstitute);

    return $cleanText;
}
232
233 /**
222 * Stock entry with fetched or imported content. 234 * Stock entry with fetched or imported content.
223 * Will fall back to OpenGraph data if available. 235 * Will fall back to OpenGraph data if available.
224 * 236 *