diff options
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 4 | ||||
-rw-r--r-- | tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php | 236 |
2 files changed, 238 insertions, 2 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index fab05268..50090100 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -90,8 +90,8 @@ class ContentProxy | |||
90 | * @return string (may contain invalid UTF-8 characters) | 90 | * @return string (may contain invalid UTF-8 characters) |
91 | */ | 91 | */ |
92 | private function convertPdfEncodingToUTF8($title) { | 92 | private function convertPdfEncodingToUTF8($title) { |
93 | // first try UTF-16 (then UTF-8) because it's easier to detect its presence/absence | 93 | // first try UTF-8 because it's easier to detect its presence/absence |
94 | foreach (array('UTF-16BE', 'UTF-16LE', 'UTF-8', 'WINDOWS-1252') as $encoding) { | 94 | foreach (array('UTF-8', 'UTF-16BE', 'WINDOWS-1252') as $encoding) { |
95 | if (mb_check_encoding($title, $encoding)) { | 95 | if (mb_check_encoding($title, $encoding)) { |
96 | return mb_convert_encoding($title, 'UTF-8', $encoding); | 96 | return mb_convert_encoding($title, 'UTF-8', $encoding); |
97 | } | 97 | } |
diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 51df8de1..9d8098ef 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php | |||
@@ -531,6 +531,242 @@ class ContentProxyTest extends TestCase | |||
531 | $this->assertSame('1.1.1.1', $entry->getDomainName()); | 531 | $this->assertSame('1.1.1.1', $entry->getDomainName()); |
532 | } | 532 | } |
533 | 533 | ||
534 | public function testWebsiteWithValidUTF8Title_doNothing() | ||
535 | { | ||
536 | // You can use https://www.online-toolz.com/tools/text-hex-convertor.php to convert UTF-8 text <=> hex | ||
537 | // See http://graphemica.com for more info about the characters | ||
538 | // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 | ||
539 | $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '7A'); | ||
540 | |||
541 | $tagger = $this->getTaggerMock(); | ||
542 | $tagger->expects($this->once()) | ||
543 | ->method('tag'); | ||
544 | |||
545 | $graby = $this->getMockBuilder('Graby\Graby') | ||
546 | ->setMethods(['fetchContent']) | ||
547 | ->disableOriginalConstructor() | ||
548 | ->getMock(); | ||
549 | |||
550 | $graby->expects($this->any()) | ||
551 | ->method('fetchContent') | ||
552 | ->willReturn([ | ||
553 | 'html' => false, | ||
554 | 'title' => $actualTitle, | ||
555 | 'url' => '', | ||
556 | 'content_type' => 'text/html', | ||
557 | 'language' => '', | ||
558 | ]); | ||
559 | |||
560 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
561 | $entry = new Entry(new User()); | ||
562 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
563 | |||
564 | // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 | ||
565 | $expectedTitle = 'F09F98BB' . 'E284A4' . '7A'; | ||
566 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
567 | } | ||
568 | |||
569 | public function testWebsiteWithInvalidUTF8Title_removeInvalidCharacter() | ||
570 | { | ||
571 | // See http://graphemica.com for more info about the characters | ||
572 | // 'a€b' (61;80;62) in hexadecimal and WINDOWS-1252 - but 80 is an invalid UTF-8 character. | ||
573 | // The correct UTF-8 € character (U+20AC) is E282AC | ||
574 | $actualTitle = $this->hexToStr('61' . '80' . '62'); | ||
575 | |||
576 | $tagger = $this->getTaggerMock(); | ||
577 | $tagger->expects($this->once()) | ||
578 | ->method('tag'); | ||
579 | |||
580 | $graby = $this->getMockBuilder('Graby\Graby') | ||
581 | ->setMethods(['fetchContent']) | ||
582 | ->disableOriginalConstructor() | ||
583 | ->getMock(); | ||
584 | |||
585 | $graby->expects($this->any()) | ||
586 | ->method('fetchContent') | ||
587 | ->willReturn([ | ||
588 | 'html' => false, | ||
589 | 'title' => $actualTitle, | ||
590 | 'url' => '', | ||
591 | 'content_type' => 'text/html', | ||
592 | 'language' => '', | ||
593 | ]); | ||
594 | |||
595 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
596 | $entry = new Entry(new User()); | ||
597 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
598 | |||
599 | // 'ab' (61;62) because all invalid UTF-8 characters (like 80) are removed | ||
600 | $expectedTitle = '61' . '62'; | ||
601 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
602 | } | ||
603 | |||
604 | public function testPdfWithUTF16BETitle_convertToUTF8() | ||
605 | { | ||
606 | // See http://graphemica.com for more info about the characters | ||
607 | // '😻' (U+1F63B;D83DDE3B) in hexadecimal and as UTF16BE | ||
608 | $actualTitle = $this->hexToStr('D83DDE3B'); | ||
609 | |||
610 | $tagger = $this->getTaggerMock(); | ||
611 | $tagger->expects($this->once()) | ||
612 | ->method('tag'); | ||
613 | |||
614 | $graby = $this->getMockBuilder('Graby\Graby') | ||
615 | ->setMethods(['fetchContent']) | ||
616 | ->disableOriginalConstructor() | ||
617 | ->getMock(); | ||
618 | |||
619 | $graby->expects($this->any()) | ||
620 | ->method('fetchContent') | ||
621 | ->willReturn([ | ||
622 | 'html' => false, | ||
623 | 'title' => $actualTitle, | ||
624 | 'url' => '', | ||
625 | 'content_type' => 'application/pdf', | ||
626 | 'language' => '', | ||
627 | ]); | ||
628 | |||
629 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
630 | $entry = new Entry(new User()); | ||
631 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
632 | |||
633 | // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8 | ||
634 | $expectedTitle = 'F09F98BB'; | ||
635 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
636 | } | ||
637 | |||
638 | public function testPdfWithUTF8Title_doNothing() | ||
639 | { | ||
640 | // See http://graphemica.com for more info about the characters | ||
641 | // '😻' (U+1F63B or F09F98BB) in hexadecimal and as UTF-8 | ||
642 | $actualTitle = $this->hexToStr('F09F98BB'); | ||
643 | |||
644 | $tagger = $this->getTaggerMock(); | ||
645 | $tagger->expects($this->once()) | ||
646 | ->method('tag'); | ||
647 | |||
648 | $graby = $this->getMockBuilder('Graby\Graby') | ||
649 | ->setMethods(['fetchContent']) | ||
650 | ->disableOriginalConstructor() | ||
651 | ->getMock(); | ||
652 | |||
653 | $graby->expects($this->any()) | ||
654 | ->method('fetchContent') | ||
655 | ->willReturn([ | ||
656 | 'html' => false, | ||
657 | 'title' => $actualTitle, | ||
658 | 'url' => '', | ||
659 | 'content_type' => 'application/pdf', | ||
660 | 'language' => '', | ||
661 | ]); | ||
662 | |||
663 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
664 | $entry = new Entry(new User()); | ||
665 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
666 | |||
667 | // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8 | ||
668 | $expectedTitle = 'F09F98BB'; | ||
669 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
670 | } | ||
671 | |||
672 | public function testPdfWithWINDOWS1252Title_convertToUTF8() | ||
673 | { | ||
674 | // See http://graphemica.com for more info about the characters | ||
675 | // '€' (80) in hexadecimal and WINDOWS-1252 | ||
676 | $actualTitle = $this->hexToStr('80'); | ||
677 | |||
678 | $tagger = $this->getTaggerMock(); | ||
679 | $tagger->expects($this->once()) | ||
680 | ->method('tag'); | ||
681 | |||
682 | $graby = $this->getMockBuilder('Graby\Graby') | ||
683 | ->setMethods(['fetchContent']) | ||
684 | ->disableOriginalConstructor() | ||
685 | ->getMock(); | ||
686 | |||
687 | $graby->expects($this->any()) | ||
688 | ->method('fetchContent') | ||
689 | ->willReturn([ | ||
690 | 'html' => false, | ||
691 | 'title' => $actualTitle, | ||
692 | 'url' => '', | ||
693 | 'content_type' => 'application/pdf', | ||
694 | 'language' => '', | ||
695 | ]); | ||
696 | |||
697 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
698 | $entry = new Entry(new User()); | ||
699 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
700 | |||
701 | // '€' (U+20AC or E282AC) in hexadecimal and UTF-8 | ||
702 | $expectedTitle = 'E282AC'; | ||
703 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
704 | } | ||
705 | |||
706 | public function testPdfWithInvalidCharacterInTitle_removeInvalidCharacter() | ||
707 | { | ||
708 | // See http://graphemica.com for more info about the characters | ||
709 | // '😻ℤ�z' (U+1F63B or F09F98BB; U+2124 or E284A4; invalid character 81; U+007A or 7A) in hexadecimal and UTF-8 | ||
710 | // 0x81 is not a valid character for UTF16, UTF8 and WINDOWS-1252 | ||
711 | $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A'); | ||
712 | |||
713 | $tagger = $this->getTaggerMock(); | ||
714 | $tagger->expects($this->once()) | ||
715 | ->method('tag'); | ||
716 | |||
717 | $graby = $this->getMockBuilder('Graby\Graby') | ||
718 | ->setMethods(['fetchContent']) | ||
719 | ->disableOriginalConstructor() | ||
720 | ->getMock(); | ||
721 | |||
722 | $graby->expects($this->any()) | ||
723 | ->method('fetchContent') | ||
724 | ->willReturn([ | ||
725 | 'html' => false, | ||
726 | 'title' => $actualTitle, | ||
727 | 'url' => '', | ||
728 | 'content_type' => 'application/pdf', | ||
729 | 'language' => '', | ||
730 | ]); | ||
731 | |||
732 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
733 | $entry = new Entry(new User()); | ||
734 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
735 | |||
736 | // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 | ||
737 | // the 0x81 (represented by �) is invalid for UTF16, UTF8 and WINDOWS-1252 and is removed | ||
738 | $expectedTitle = 'F09F98BB' . 'E284A4' . '7A'; | ||
739 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
740 | } | ||
741 | |||
742 | /** | ||
743 | * https://stackoverflow.com/a/18506801 | ||
744 | * @param $string | ||
745 | * @return string | ||
746 | */ | ||
747 | function strToHex($string){ | ||
748 | $hex = ''; | ||
749 | for ($i=0; $i<strlen($string); $i++){ | ||
750 | $ord = ord($string[$i]); | ||
751 | $hexCode = dechex($ord); | ||
752 | $hex .= substr('0'.$hexCode, -2); | ||
753 | } | ||
754 | return strToUpper($hex); | ||
755 | } | ||
756 | |||
757 | /** | ||
758 | * https://stackoverflow.com/a/18506801 | ||
759 | * @param $hex | ||
760 | * @return string | ||
761 | */ | ||
762 | function hexToStr($hex){ | ||
763 | $string=''; | ||
764 | for ($i=0; $i < strlen($hex)-1; $i+=2){ | ||
765 | $string .= chr(hexdec($hex[$i].$hex[$i+1])); | ||
766 | } | ||
767 | return $string; | ||
768 | } | ||
769 | |||
534 | private function getTaggerMock() | 770 | private function getTaggerMock() |
535 | { | 771 | { |
536 | return $this->getMockBuilder(RuleBasedTagger::class) | 772 | return $this->getMockBuilder(RuleBasedTagger::class) |