about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Tobi823 <Tobi823@users.noreply.github.com> 2018-09-19 12:30:26 +0200
committer: Tobi823 <Tobi823@users.noreply.github.com> 2018-09-21 13:15:00 +0200
commit: f80f16dfc858ec90da76daacd405b0cfdaa32f74 (patch)
tree: 6232e201550d85ac0a554245990c97f1ee37f4e8
parent: 8648f0c00534e8af83b2a5451269d79906db6c16 (diff)
download: wallabag-f80f16dfc858ec90da76daacd405b0cfdaa32f74.tar.gz
wallabag-f80f16dfc858ec90da76daacd405b0cfdaa32f74.tar.zst
wallabag-f80f16dfc858ec90da76daacd405b0cfdaa32f74.zip
Try to detect the character encoding in PDFs and try to translate
the title from the PDF to UTF-8
-rw-r--r-- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 46
1 files changed, 33 insertions, 13 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index 29259bbd..fab05268 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -53,7 +53,7 @@ class ContentProxy
53 53
54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { 54 if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
55 $fetchedContent = $this->graby->fetchContent($url); 55 $fetchedContent = $this->graby->fetchContent($url);
56 $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']); 56 $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);
57 57
58 // when content is imported, we have information in $content 58 // when content is imported, we have information in $content
59 // in case fetching content goes bad, we'll keep the imported information instead of overriding them 59 // in case fetching content goes bad, we'll keep the imported information instead of overriding them
@@ -70,24 +70,44 @@ class ContentProxy
70 } 70 }
71 71
72 /** 72 /**
73 * Remove invalid UTF-8 characters from the given string in following steps: 73 * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character.
74 * - try to interpret the given string as ISO-8859-1, convert it to UTF-8 and return it (if its valid) 74 * @param $title
75 * - simply remove every invalid UTF-8 character and return the result (https://stackoverflow.com/a/1433665) 75 * @param $contentType
76 * @param String $rawText
77 * @return string 76 * @return string
78 */ 77 */
79 private function sanitizeUTF8Text($rawText) { 78 private function sanitizeContentTitle($title, $contentType) {
80 if (mb_check_encoding($rawText, 'utf-8')) { 79 if ('application/pdf' === $contentType) {
81 return $rawText; // return because its valid utf-8 text 80 $convertedTitle = $this->convertPdfEncodingToUTF8($title);
81 return $this->sanitizeUTF8Text($convertedTitle);
82 } 82 }
83 return $this->sanitizeUTF8Text($title);
84 }
83 85
84 // we assume that $text is encoded in ISO-8859-1 (and not the similar Windows-1252 or other encoding) 86 /**
85 $convertedText = utf8_encode($rawText); 87 * If the title from the fetched content comes from a PDF, then its very possible that the character encoding is not
86 if (mb_check_encoding($convertedText, 'utf-8')) { 88 * UTF-8. This methods tries to identify the character encoding and translate the title to UTF-8.
87 return $convertedText; 89 * @param $title
90 * @return string (maybe contains invalid UTF-8 character)
91 */
92 private function convertPdfEncodingToUTF8($title) {
93 // first try UTF-16 (then UTF-8) because its easier to detect its present/absence
94 foreach (array('UTF-16BE', 'UTF-16LE', 'UTF-8', 'WINDOWS-1252') as $encoding) {
95 if (mb_check_encoding($title, $encoding)) {
96 return mb_convert_encoding($title, 'UTF-8', $encoding);
97 }
88 } 98 }
99 return $title;
100 }
89 101
90 // last resort: simply remove invalid UTF-8 character because $rawText can have some every exotic encoding 102 /**
103 * Remove invalid UTF-8 characters from the given string.
104 * @param String $rawText
105 * @return string
106 */
107 private function sanitizeUTF8Text($rawText) {
108 if (mb_check_encoding($rawText, 'UTF-8')) {
109 return $rawText;
110 }
91 return iconv("UTF-8", "UTF-8//IGNORE", $rawText); 111 return iconv("UTF-8", "UTF-8//IGNORE", $rawText);
92 } 112 }
93 113