]> git.immae.eu Git - github/wallabag/wallabag.git/blobdiff - src/Wallabag/CoreBundle/Helper/ContentProxy.php
ContentProxy: swap entry url to origin_url and set new url according to graby content
[github/wallabag/wallabag.git] / src / Wallabag / CoreBundle / Helper / ContentProxy.php
index 3fe31c2c7d68b9115ab8bb412588270e87993873..da0ec5a31d4d87970125f5529964f3d74bd0dca4 100644 (file)
@@ -53,6 +53,7 @@ class ContentProxy
 
         if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
             $fetchedContent = $this->graby->fetchContent($url);
+            $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);
 
             // when content is imported, we have information in $content
             // in case fetching content goes bad, we'll keep the imported information instead of overriding them
@@ -65,6 +66,13 @@ class ContentProxy
         // so we'll be able to refetch it in the future
         $content['url'] = !empty($content['url']) ? $content['url'] : $url;
 
+        // In at least one case (seen in tests), the entry url is empty here,
+        // so we set it using the $url provided in the updateEntry call.
+        // It is unclear in which other cases this property can be empty.
+        if (empty($entry->getUrl()) && !empty($url)) {
+            $entry->setUrl($url);
+        }
+
         $this->stockEntry($entry, $content);
     }
 
@@ -176,6 +184,59 @@ class ContentProxy
         $entry->setTitle($path);
     }
 
+    /**
+     * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 characters.
+     *
+     * @param $title       title taken from the fetched content
+     * @param $contentType content type of the fetched content; 'application/pdf' triggers an encoding conversion first
+     *
+     * @return string
+     */
+    private function sanitizeContentTitle($title, $contentType)
+    {
+        if ('application/pdf' === $contentType) {
+            $title = $this->convertPdfEncodingToUTF8($title);
+        }
+
+        return $this->sanitizeUTF8Text($title);
+    }
+
+    /**
+     * If the title from the fetched content comes from a PDF, then it's very possible that the character encoding is not
+     * UTF-8. This method tries to identify the character encoding and translate the title to UTF-8.
+     *
+     * @param $title title whose encoding is guessed among UTF-8, UTF-16BE and WINDOWS-1252, in that order
+     *
+     * @return string the converted title, or the title unchanged when no encoding matches (may contain invalid UTF-8 characters)
+     */
+    private function convertPdfEncodingToUTF8($title)
+    {
+        // try UTF-8 first because its presence/absence is easier to detect
+        foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) {
+            if (mb_check_encoding($title, $encoding)) {
+                return mb_convert_encoding($title, 'UTF-8', $encoding);
+            }
+        }
+
+        return $title;
+    }
+
+    /**
+     * Remove invalid UTF-8 characters from the given string.
+     *
+     * @param string $rawText text to clean; returned unchanged when it is already valid UTF-8
+     *
+     * @return string NOTE(review): iconv() can return false on failure - confirm callers tolerate that
+     */
+    private function sanitizeUTF8Text($rawText)
+    {
+        if (mb_check_encoding($rawText, 'UTF-8')) {
+            return $rawText;
+        }
+
+        return iconv('UTF-8', 'UTF-8//IGNORE', $rawText);
+    }
+
     /**
      * Stock entry with fetched or imported content.
      * Will fall back to OpenGraph data if available.
@@ -185,7 +246,15 @@ class ContentProxy
      */
     private function stockEntry(Entry $entry, array $content)
     {
-        $entry->setUrl($content['url']);
+        // When a redirection occurs while fetching an entry,
+        // we move the original url into the origin_url property (if it is still empty)
+        // and set the entry url to the final value.
+        if (!empty($content['url']) && $entry->getUrl() !== $content['url']) {
+            if (empty($entry->getOriginUrl())) {
+                $entry->setOriginUrl($entry->getUrl());
+            }
+            $entry->setUrl($content['url']);
+        }
 
         $this->setEntryDomainName($entry);