]> git.immae.eu Git - github/wallabag/wallabag.git/commitdiff
Add tests for logic
author Tobi823 <Tobi823@users.noreply.github.com>
Wed, 19 Sep 2018 11:59:07 +0000 (13:59 +0200)
committer Tobi823 <Tobi823@users.noreply.github.com>
Fri, 21 Sep 2018 11:15:00 +0000 (13:15 +0200)
Try to translate the title of a PDF from UTF-8 (then UTF-16BE, then WINDOWS-1252) to UTF-8

src/Wallabag/CoreBundle/Helper/ContentProxy.php
tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php

index fab052685ccef748c6aba14591e05fc7866780cb..500901005c3df6b6906baa2d675aade2d0a5b119 100644 (file)
@@ -90,8 +90,8 @@ class ContentProxy
      * @return string (maybe contains invalid UTF-8 character)
      */
     private function convertPdfEncodingToUTF8($title) {
-        // first try UTF-16 (then UTF-8) because its easier to detect its present/absence
-        foreach (array('UTF-16BE', 'UTF-16LE', 'UTF-8', 'WINDOWS-1252') as $encoding) {
+        // first try UTF-8 because it's easier to detect its presence/absence
+        foreach (array('UTF-8', 'UTF-16BE', 'WINDOWS-1252') as $encoding) {
             if (mb_check_encoding($title, $encoding)) {
                 return mb_convert_encoding($title, 'UTF-8', $encoding);
             }
index 51df8de1bd8ccd94283aef19c2782ff15996b7ce..9d8098efdff15fdd44397d9915b19c3784aea68d 100644 (file)
@@ -531,6 +531,242 @@ class ContentProxyTest extends TestCase
         $this->assertSame('1.1.1.1', $entry->getDomainName());
     }
 
+    public function testWebsiteWithValidUTF8Title_doNothing()
+    {
+        // A text/html response whose title is already valid UTF-8 must keep its exact bytes after updateEntry().
+        // Hex <=> UTF-8 text converter: https://www.online-toolz.com/tools/text-hex-convertor.php - character details: http://graphemica.com
+        // Title bytes: '😻ℤz' encoded as UTF-8 (U+1F63B = F09F98BB; U+2124 = E284A4; U+007A = 7A)
+        $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '7A');
+
+        $tagger = $this->getTaggerMock();
+        $tagger->expects($this->once())
+            ->method('tag');
+
+        $graby = $this->getMockBuilder('Graby\Graby')
+            ->setMethods(['fetchContent'])
+            ->disableOriginalConstructor()
+            ->getMock();
+
+        $graby->expects($this->any())
+            ->method('fetchContent')
+            ->willReturn([
+                'html' => false,
+                'title' => $actualTitle,
+                'url' => '',
+                'content_type' => 'text/html',
+                'language' => '',
+            ]);
+
+        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
+        $entry = new Entry(new User());
+        $proxy->updateEntry($entry, 'http://0.0.0.0');
+
+        // Expected: the same UTF-8 bytes of '😻ℤz', unchanged (F09F98BB; E284A4; 7A)
+        $expectedTitle = 'F09F98BB' . 'E284A4' . '7A';
+        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
+    }
+
+    public function testWebsiteWithInvalidUTF8Title_removeInvalidCharacter()
+    {
+        // A text/html title containing a byte that is invalid in UTF-8 must come back with that byte removed. Character details: http://graphemica.com
+        // Title bytes 61;80;62 would read 'a€b' in WINDOWS-1252 - but a lone 80 is an invalid UTF-8 byte.
+        // (The correct UTF-8 encoding of '€', U+20AC, is E282AC.)
+        $actualTitle = $this->hexToStr('61' . '80' . '62');
+
+        $tagger = $this->getTaggerMock();
+        $tagger->expects($this->once())
+            ->method('tag');
+
+        $graby = $this->getMockBuilder('Graby\Graby')
+            ->setMethods(['fetchContent'])
+            ->disableOriginalConstructor()
+            ->getMock();
+
+        $graby->expects($this->any())
+            ->method('fetchContent')
+            ->willReturn([
+                'html' => false,
+                'title' => $actualTitle,
+                'url' => '',
+                'content_type' => 'text/html',
+                'language' => '',
+            ]);
+
+        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
+        $entry = new Entry(new User());
+        $proxy->updateEntry($entry, 'http://0.0.0.0');
+
+        // Expected: 'ab' (61;62), because all invalid UTF-8 bytes (like 80) are removed
+        $expectedTitle = '61' . '62';
+        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
+    }
+
+    public function testPdfWithUTF16BETitle_convertToUTF8()
+    {
+        // An application/pdf title encoded as UTF-16BE must be converted to UTF-8. Character details: http://graphemica.com
+        // Title bytes: '😻' (U+1F63B) encoded as UTF-16BE, i.e. the surrogate pair D83DDE3B
+        $actualTitle = $this->hexToStr('D83DDE3B');
+
+        $tagger = $this->getTaggerMock();
+        $tagger->expects($this->once())
+            ->method('tag');
+
+        $graby = $this->getMockBuilder('Graby\Graby')
+            ->setMethods(['fetchContent'])
+            ->disableOriginalConstructor()
+            ->getMock();
+
+        $graby->expects($this->any())
+            ->method('fetchContent')
+            ->willReturn([
+                'html' => false,
+                'title' => $actualTitle,
+                'url' => '',
+                'content_type' => 'application/pdf',
+                'language' => '',
+            ]);
+
+        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
+        $entry = new Entry(new User());
+        $proxy->updateEntry($entry, 'http://0.0.0.0');
+
+        // Expected: '😻' (U+1F63B) encoded as UTF-8, i.e. F09F98BB
+        $expectedTitle = 'F09F98BB';
+        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
+    }
+
+    public function testPdfWithUTF8Title_doNothing()
+    {
+        // An application/pdf title that is already valid UTF-8 must keep its exact bytes. Character details: http://graphemica.com
+        // Title bytes: '😻' (U+1F63B) encoded as UTF-8, i.e. F09F98BB (not the UTF-16 surrogate pair D83DDE3B)
+        $actualTitle = $this->hexToStr('F09F98BB');
+
+        $tagger = $this->getTaggerMock();
+        $tagger->expects($this->once())
+            ->method('tag');
+
+        $graby = $this->getMockBuilder('Graby\Graby')
+            ->setMethods(['fetchContent'])
+            ->disableOriginalConstructor()
+            ->getMock();
+
+        $graby->expects($this->any())
+            ->method('fetchContent')
+            ->willReturn([
+                'html' => false,
+                'title' => $actualTitle,
+                'url' => '',
+                'content_type' => 'application/pdf',
+                'language' => '',
+            ]);
+
+        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
+        $entry = new Entry(new User());
+        $proxy->updateEntry($entry, 'http://0.0.0.0');
+
+        // Expected: the same UTF-8 bytes of '😻' (U+1F63B), unchanged, i.e. F09F98BB
+        $expectedTitle = 'F09F98BB';
+        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
+    }
+
+    public function testPdfWithWINDOWS1252Title_convertToUTF8()
+    {
+        // An application/pdf title encoded as WINDOWS-1252 must be converted to UTF-8. Character details: http://graphemica.com
+        // Title bytes: '€' encoded as WINDOWS-1252, i.e. the single byte 80
+        $actualTitle = $this->hexToStr('80');
+
+        $tagger = $this->getTaggerMock();
+        $tagger->expects($this->once())
+            ->method('tag');
+
+        $graby = $this->getMockBuilder('Graby\Graby')
+            ->setMethods(['fetchContent'])
+            ->disableOriginalConstructor()
+            ->getMock();
+
+        $graby->expects($this->any())
+            ->method('fetchContent')
+            ->willReturn([
+                'html' => false,
+                'title' => $actualTitle,
+                'url' => '',
+                'content_type' => 'application/pdf',
+                'language' => '',
+            ]);
+
+        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
+        $entry = new Entry(new User());
+        $proxy->updateEntry($entry, 'http://0.0.0.0');
+
+        // Expected: '€' (U+20AC) encoded as UTF-8, i.e. E282AC
+        $expectedTitle = 'E282AC';
+        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
+    }
+
+    public function testPdfWithInvalidCharacterInTitle_removeInvalidCharacter()
+    {
+        // An application/pdf title that is UTF-8 apart from one stray byte must come back with that byte removed. Character details: http://graphemica.com
+        // Title bytes: '😻ℤ�z' (U+1F63B = F09F98BB; U+2124 = E284A4; invalid byte 81; U+007A = 7A), otherwise UTF-8
+        // 0x81 is not a valid character in UTF-16, UTF-8 or WINDOWS-1252
+        $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A');
+
+        $tagger = $this->getTaggerMock();
+        $tagger->expects($this->once())
+            ->method('tag');
+
+        $graby = $this->getMockBuilder('Graby\Graby')
+            ->setMethods(['fetchContent'])
+            ->disableOriginalConstructor()
+            ->getMock();
+
+        $graby->expects($this->any())
+            ->method('fetchContent')
+            ->willReturn([
+                'html' => false,
+                'title' => $actualTitle,
+                'url' => '',
+                'content_type' => 'application/pdf',
+                'language' => '',
+            ]);
+
+        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
+        $entry = new Entry(new User());
+        $proxy->updateEntry($entry, 'http://0.0.0.0');
+
+        // Expected: '😻ℤz' encoded as UTF-8 (U+1F63B = F09F98BB; U+2124 = E284A4; U+007A = 7A)
+        // The 0x81 byte (shown as '�' above) is invalid in UTF-16, UTF-8 and WINDOWS-1252 and is removed
+        $expectedTitle = 'F09F98BB' . 'E284A4' . '7A';
+        $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle()));
+    }
+
+    /**
+     * Encode a binary string as uppercase hexadecimal, two digits per byte.
+     *
+     * The title tests use it to compare raw bytes instead of rendered glyphs.
+     *
+     * @param string $string raw bytes to encode
+     *
+     * @return string uppercase hex digits, e.g. "\xE2\x82\xAC" gives 'E282AC'
+     */
+    function strToHex($string)
+    {
+        // bin2hex() emits lowercase digits; the expected values in the tests are written in uppercase.
+        return strtoupper(bin2hex($string));
+    }
+
+    /**
+     * Decode a hexadecimal string (two digits per byte, either case) into the raw bytes it describes.
+     *
+     * @param string $hex hex digits, e.g. 'E282AC'
+     *
+     * @throws \InvalidArgumentException when $hex has an odd length or contains a non-hex character
+     *
+     * @return string decoded bytes
+     */
+    function hexToStr($hex)
+    {
+        // Reject malformed fixtures up front: hex2bin() would only raise a warning and return false,
+        // and a silently shortened title would make a test exercise the wrong bytes.
+        if (1 !== preg_match('/\A(?:[0-9A-Fa-f]{2})*\z/', $hex)) {
+            throw new \InvalidArgumentException(sprintf('Not a valid hexadecimal string: "%s"', $hex));
+        }
+
+        return hex2bin($hex);
+    }
+
     private function getTaggerMock()
     {
         return $this->getMockBuilder(RuleBasedTagger::class)