diff options
author | Tobi823 <Tobi823@users.noreply.github.com> | 2018-09-19 12:30:26 +0200 |
---|---|---|
committer | Tobi823 <Tobi823@users.noreply.github.com> | 2018-09-21 13:15:00 +0200 |
commit | f80f16dfc858ec90da76daacd405b0cfdaa32f74 (patch) | |
tree | 6232e201550d85ac0a554245990c97f1ee37f4e8 | |
parent | 8648f0c00534e8af83b2a5451269d79906db6c16 (diff) | |
download | wallabag-f80f16dfc858ec90da76daacd405b0cfdaa32f74.tar.gz wallabag-f80f16dfc858ec90da76daacd405b0cfdaa32f74.tar.zst wallabag-f80f16dfc858ec90da76daacd405b0cfdaa32f74.zip |
Try to detect the character encoding in PDFs and try to translate
the title from the PDF to UTF-8
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 46 |
1 files changed, 33 insertions, 13 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 29259bbd..fab05268 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -53,7 +53,7 @@ class ContentProxy | |||
53 | 53 | ||
54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { | 54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { |
55 | $fetchedContent = $this->graby->fetchContent($url); | 55 | $fetchedContent = $this->graby->fetchContent($url); |
56 | $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']); | 56 | $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']); |
57 | 57 | ||
58 | // when content is imported, we have information in $content | 58 | // when content is imported, we have information in $content |
59 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them | 59 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them |
@@ -70,24 +70,44 @@ class ContentProxy | |||
70 | } | 70 | } |
71 | 71 | ||
72 | /** | 72 | /** |
73 | * Remove invalid UTF-8 characters from the given string in following steps: | 73 | * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character. |
74 | * - try to interpret the given string as ISO-8859-1, convert it to UTF-8 and return it (if its valid) | 74 | * @param $title |
75 | * - simply remove every invalid UTF-8 character and return the result (https://stackoverflow.com/a/1433665) | 75 | * @param $contentType |
76 | * @param String $rawText | ||
77 | * @return string | 76 | * @return string |
78 | */ | 77 | */ |
79 | private function sanitizeUTF8Text($rawText) { | 78 | private function sanitizeContentTitle($title, $contentType) { |
80 | if (mb_check_encoding($rawText, 'utf-8')) { | 79 | if ('application/pdf' === $contentType) { |
81 | return $rawText; // return because its valid utf-8 text | 80 | $convertedTitle = $this->convertPdfEncodingToUTF8($title); |
81 | return $this->sanitizeUTF8Text($convertedTitle); | ||
82 | } | 82 | } |
83 | return $this->sanitizeUTF8Text($title); | ||
84 | } | ||
83 | 85 | ||
84 | // we assume that $text is encoded in ISO-8859-1 (and not the similar Windows-1252 or other encoding) | 86 | /** |
85 | $convertedText = utf8_encode($rawText); | 87 | * If the title from the fetched content comes from a PDF, then its very possible that the character encoding is not |
86 | if (mb_check_encoding($convertedText, 'utf-8')) { | 88 | * UTF-8. This methods tries to identify the character encoding and translate the title to UTF-8. |
87 | return $convertedText; | 89 | * @param $title |
90 | * @return string (maybe contains invalid UTF-8 character) | ||
91 | */ | ||
92 | private function convertPdfEncodingToUTF8($title) { | ||
93 | // first try UTF-16 (then UTF-8) because its easier to detect its present/absence | ||
94 | foreach (array('UTF-16BE', 'UTF-16LE', 'UTF-8', 'WINDOWS-1252') as $encoding) { | ||
95 | if (mb_check_encoding($title, $encoding)) { | ||
96 | return mb_convert_encoding($title, 'UTF-8', $encoding); | ||
97 | } | ||
88 | } | 98 | } |
99 | return $title; | ||
100 | } | ||
89 | 101 | ||
90 | // last resort: simply remove invalid UTF-8 character because $rawText can have some every exotic encoding | 102 | /** |
103 | * Remove invalid UTF-8 characters from the given string. | ||
104 | * @param String $rawText | ||
105 | * @return string | ||
106 | */ | ||
107 | private function sanitizeUTF8Text($rawText) { | ||
108 | if (mb_check_encoding($rawText, 'UTF-8')) { | ||
109 | return $rawText; | ||
110 | } | ||
91 | return iconv("UTF-8", "UTF-8//IGNORE", $rawText); | 111 | return iconv("UTF-8", "UTF-8//IGNORE", $rawText); |
92 | } | 112 | } |
93 | 113 | ||