git.immae.eu Git - github/wallabag/wallabag.git/blobdiff - src/Wallabag/CoreBundle/Helper/ContentProxy.php
Merge pull request #3706 from shtrom/fix/gnu-make-bash
[github/wallabag/wallabag.git] / src / Wallabag / CoreBundle / Helper / ContentProxy.php
index 2628af190b3119a952394f382368c0ab914bfee9..d4ea608f3c724d6ae9f896199fb23ede3fc96435 100644 (file)
@@ -53,7 +53,7 @@ class ContentProxy
 
         if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
             $fetchedContent = $this->graby->fetchContent($url);
-            $fetchedContent['title'] = $this->sanitizeUTF8Text($fetchedContent['title']);
+            $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);
 
             // when content is imported, we have information in $content
             // in case fetching content goes bad, we'll keep the imported information instead of overriding them
@@ -69,28 +69,6 @@ class ContentProxy
         $this->stockEntry($entry, $content);
     }
 
-    /**
-     * Remove invalid UTF-8 characters from the given string in following steps:
-     * - try to interpret the given string as ISO-8859-1, convert it to UTF-8 and return it (if its valid)
-     * - simply remove every invalid UTF-8 character and return the result (https://stackoverflow.com/a/1433665)
-     * @param String $rawText
-     * @return string
-     */
-    private function sanitizeUTF8Text(String $rawText) {
-        if (mb_check_encoding($rawText, 'utf-8')) {
-            return $rawText; // return because its valid utf-8 text
-        }
-
-        // we assume that $text is encoded in ISO-8859-1 (and not the similar Windows-1252 or other encoding)
-        $convertedText = utf8_encode($rawText);
-        if (mb_check_encoding($convertedText, 'utf-8')) {
-            return $convertedText;
-        }
-
-        // last resort: simply remove invalid UTF-8 character because $rawText can have some every exotic encoding
-        return iconv("UTF-8", "UTF-8//IGNORE", $rawText);
-    }
-
     /**
      * Use a Symfony validator to ensure the language is well formatted.
      *
@@ -199,6 +177,59 @@ class ContentProxy
         $entry->setTitle($path);
     }
 
+    /**
+     * Sanitize the title of the fetched content: fix wrong character encodings and remove invalid UTF-8 characters.
+     *
+     * @param $title
+     * @param $contentType
+     *
+     * @return string
+     */
+    private function sanitizeContentTitle($title, $contentType)
+    {
+        if ('application/pdf' === $contentType) {
+            $title = $this->convertPdfEncodingToUTF8($title);
+        }
+
+        return $this->sanitizeUTF8Text($title);
+    }
+
+    /**
+     * If the title of the fetched content comes from a PDF, it is quite possible that its character encoding is not
+     * UTF-8. This method tries to identify the character encoding and convert the title to UTF-8.
+     *
+     * @param $title
+     *
+     * @return string (may still contain invalid UTF-8 characters)
+     */
+    private function convertPdfEncodingToUTF8($title)
+    {
+        // first try UTF-8 because it's easier to detect its presence/absence
+        foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) {
+            if (mb_check_encoding($title, $encoding)) {
+                return mb_convert_encoding($title, 'UTF-8', $encoding);
+            }
+        }
+
+        return $title;
+    }
+
+    /**
+     * Remove invalid UTF-8 characters from the given string.
+     *
+     * @param string $rawText
+     *
+     * @return string
+     */
+    private function sanitizeUTF8Text($rawText)
+    {
+        if (mb_check_encoding($rawText, 'UTF-8')) {
+            return $rawText;
+        }
+
+        return iconv('UTF-8', 'UTF-8//IGNORE', $rawText);
+    }
+
     /**
      * Stock entry with fetched or imported content.
      * Will fall back to OpenGraph data if available.