From c01d9532920ec5a298bb347dbb83a078d36d4841 Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Wed, 19 Sep 2018 13:59:07 +0200 Subject: Add tests for logic Try to translate the title of a PDF from UTF-8 (then UTF-16BE, then WINDOWS-1252) to UTF-8 --- .../CoreBundle/Helper/ContentProxyTest.php | 236 +++++++++++++++++++++ 1 file changed, 236 insertions(+) (limited to 'tests/Wallabag') diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 51df8de1..9d8098ef 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php @@ -531,6 +531,242 @@ class ContentProxyTest extends TestCase $this->assertSame('1.1.1.1', $entry->getDomainName()); } + public function testWebsiteWithValidUTF8Title_doNothing() + { + // You can use https://www.online-toolz.com/tools/text-hex-convertor.php to convert UTF-8 text <=> hex + // See http://graphemica.com for more info about the characters + // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 + $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '7A'); + + $tagger = $this->getTaggerMock(); + $tagger->expects($this->once()) + ->method('tag'); + + $graby = $this->getMockBuilder('Graby\Graby') + ->setMethods(['fetchContent']) + ->disableOriginalConstructor() + ->getMock(); + + $graby->expects($this->any()) + ->method('fetchContent') + ->willReturn([ + 'html' => false, + 'title' => $actualTitle, + 'url' => '', + 'content_type' => 'text/html', + 'language' => '', + ]); + + $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); + $entry = new Entry(new User()); + $proxy->updateEntry($entry, 'http://0.0.0.0'); + + // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 + $expectedTitle = 'F09F98BB' . 'E284A4' . '7A'; + $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); + } + + public function testWebsiteWithInvalidUTF8Title_removeInvalidCharacter() + { + // See http://graphemica.com for more info about the characters + // 'a€b' (61;80;62) in hexadecimal and WINDOWS-1252 - but 80 is a invalid UTF-8 character. + // The correct UTF-8 € character (U+20AC) is E282AC + $actualTitle = $this->hexToStr('61' . '80' . '62'); + + $tagger = $this->getTaggerMock(); + $tagger->expects($this->once()) + ->method('tag'); + + $graby = $this->getMockBuilder('Graby\Graby') + ->setMethods(['fetchContent']) + ->disableOriginalConstructor() + ->getMock(); + + $graby->expects($this->any()) + ->method('fetchContent') + ->willReturn([ + 'html' => false, + 'title' => $actualTitle, + 'url' => '', + 'content_type' => 'text/html', + 'language' => '', + ]); + + $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); + $entry = new Entry(new User()); + $proxy->updateEntry($entry, 'http://0.0.0.0'); + + // 'ab' (61;62) because all invalid UTF-8 character (like 80) are removed + $expectedTitle = '61' . '62'; + $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); + } + + public function testPdfWithUTF16BETitle_convertToUTF8() + { + // See http://graphemica.com for more info about the characters + // '😻' (U+1F63B;D83DDE3B) in hexadecimal and as UTF16BE + $actualTitle = $this->hexToStr('D83DDE3B'); + + $tagger = $this->getTaggerMock(); + $tagger->expects($this->once()) + ->method('tag'); + + $graby = $this->getMockBuilder('Graby\Graby') + ->setMethods(['fetchContent']) + ->disableOriginalConstructor() + ->getMock(); + + $graby->expects($this->any()) + ->method('fetchContent') + ->willReturn([ + 'html' => false, + 'title' => $actualTitle, + 'url' => '', + 'content_type' => 'application/pdf', + 'language' => '', + ]); + + $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); + $entry = new Entry(new User()); + $proxy->updateEntry($entry, 'http://0.0.0.0'); + + // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8 + $expectedTitle = 'F09F98BB'; + $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); + } + + public function testPdfWithUTF8Title_doNothing() + { + // See http://graphemica.com for more info about the characters + // '😻' (U+1F63B;D83DDE3B) in hexadecimal and as UTF8 + $actualTitle = $this->hexToStr('F09F98BB'); + + $tagger = $this->getTaggerMock(); + $tagger->expects($this->once()) + ->method('tag'); + + $graby = $this->getMockBuilder('Graby\Graby') + ->setMethods(['fetchContent']) + ->disableOriginalConstructor() + ->getMock(); + + $graby->expects($this->any()) + ->method('fetchContent') + ->willReturn([ + 'html' => false, + 'title' => $actualTitle, + 'url' => '', + 'content_type' => 'application/pdf', + 'language' => '', + ]); + + $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); + $entry = new Entry(new User()); + $proxy->updateEntry($entry, 'http://0.0.0.0'); + + // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8 + $expectedTitle = 'F09F98BB'; + $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); + } + + public function testPdfWithWINDOWS1252Title_convertToUTF8() + { + // See http://graphemica.com for more info about the characters + // '€' (80) in hexadecimal and WINDOWS-1252 + $actualTitle = $this->hexToStr('80'); + + $tagger = $this->getTaggerMock(); + $tagger->expects($this->once()) + ->method('tag'); + + $graby = $this->getMockBuilder('Graby\Graby') + ->setMethods(['fetchContent']) + ->disableOriginalConstructor() + ->getMock(); + + $graby->expects($this->any()) + ->method('fetchContent') + ->willReturn([ + 'html' => false, + 'title' => $actualTitle, + 'url' => '', + 'content_type' => 'application/pdf', + 'language' => '', + ]); + + $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); + $entry = new Entry(new User()); + $proxy->updateEntry($entry, 'http://0.0.0.0'); + + // '€' (U+20AC or E282AC) in hexadecimal and UTF-8 + $expectedTitle = 'E282AC'; + $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); + } + + public function testPdfWithInvalidCharacterInTitle_removeInvalidCharacter() + { + // See http://graphemica.com for more info about the characters + // '😻ℤ�z' (U+1F63B or F09F98BB; U+2124 or E284A4; invalid character 81; U+007A or 7A) in hexadecimal and UTF-8 + // 0x81 is not a valid character for UTF16, UTF8 and WINDOWS-1252 + $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A'); + + $tagger = $this->getTaggerMock(); + $tagger->expects($this->once()) + ->method('tag'); + + $graby = $this->getMockBuilder('Graby\Graby') + ->setMethods(['fetchContent']) + ->disableOriginalConstructor() + ->getMock(); + + $graby->expects($this->any()) + ->method('fetchContent') + ->willReturn([ + 'html' => false, + 'title' => $actualTitle, + 'url' => '', + 'content_type' => 'application/pdf', + 'language' => '', + ]); + + $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); + $entry = new Entry(new User()); + $proxy->updateEntry($entry, 'http://0.0.0.0'); + + // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 + // the 0x81 (represented by �) is invalid for UTF16, UTF8 and WINDOWS-1252 and is removed + $expectedTitle = 'F09F98BB' . 'E284A4' . '7A'; + $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); + } + + /** + * https://stackoverflow.com/a/18506801 + * @param $string + * @return string + */ + function strToHex($string){ + $hex = ''; + for ($i=0; $igetMockBuilder(RuleBasedTagger::class) -- cgit v1.2.3 From d64139d8123ac88c8ba1b427c3ee3637b6ea1c96 Mon Sep 17 00:00:00 2001 From: Tobi823 Date: Fri, 21 Sep 2018 13:31:28 +0200 Subject: Make helper methods strToHex and hexToStr in ContentProxyTest.php private to prevent misusage (from outside this class) --- tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tests/Wallabag') diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 9d8098ef..5f10f482 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php @@ -744,7 +744,7 @@ class ContentProxyTest extends TestCase * @param $string * @return string */ - function strToHex($string){ + private function strToHex($string){ $hex = ''; for ($i=0; $i Date: Sun, 23 Sep 2018 23:42:05 +0200 Subject: Run php-cs-fixer for fixing coding standard issues (on ContentProxyTest) --- .../CoreBundle/Helper/ContentProxyTest.php | 30 ++++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'tests/Wallabag') diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 5f10f482..3f3c60d0 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php @@ -740,30 +740,38 @@ class ContentProxyTest extends TestCase } /** - * https://stackoverflow.com/a/18506801 + * https://stackoverflow.com/a/18506801. + * * @param $string + * * @return string */ - private function strToHex($string){ + private function strToHex($string) + { $hex = ''; - for ($i=0; $i