diff options
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 54 | ||||
-rw-r--r-- | tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php | 244 |
2 files changed, 298 insertions, 0 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 3fe31c2c..d4ea608f 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -53,6 +53,7 @@ class ContentProxy | |||
53 | 53 | ||
54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { | 54 | if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) { |
55 | $fetchedContent = $this->graby->fetchContent($url); | 55 | $fetchedContent = $this->graby->fetchContent($url); |
56 | $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']); | ||
56 | 57 | ||
57 | // when content is imported, we have information in $content | 58 | // when content is imported, we have information in $content |
58 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them | 59 | // in case fetching content goes bad, we'll keep the imported information instead of overriding them |
@@ -177,6 +178,59 @@ class ContentProxy | |||
177 | } | 178 | } |
178 | 179 | ||
179 | /** | 180 | /** |
181 | * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character. | ||
182 | * | ||
183 | * @param $title | ||
184 | * @param $contentType | ||
185 | * | ||
186 | * @return string | ||
187 | */ | ||
188 | private function sanitizeContentTitle($title, $contentType) | ||
189 | { | ||
190 | if ('application/pdf' === $contentType) { | ||
191 | $title = $this->convertPdfEncodingToUTF8($title); | ||
192 | } | ||
193 | |||
194 | return $this->sanitizeUTF8Text($title); | ||
195 | } | ||
196 | |||
197 | /** | ||
198 | * If the title from the fetched content comes from a PDF, then its very possible that the character encoding is not | ||
199 | * UTF-8. This methods tries to identify the character encoding and translate the title to UTF-8. | ||
200 | * | ||
201 | * @param $title | ||
202 | * | ||
203 | * @return string (maybe contains invalid UTF-8 character) | ||
204 | */ | ||
205 | private function convertPdfEncodingToUTF8($title) | ||
206 | { | ||
207 | // first try UTF-8 because its easier to detect its present/absence | ||
208 | foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) { | ||
209 | if (mb_check_encoding($title, $encoding)) { | ||
210 | return mb_convert_encoding($title, 'UTF-8', $encoding); | ||
211 | } | ||
212 | } | ||
213 | |||
214 | return $title; | ||
215 | } | ||
216 | |||
217 | /** | ||
218 | * Remove invalid UTF-8 characters from the given string. | ||
219 | * | ||
220 | * @param string $rawText | ||
221 | * | ||
222 | * @return string | ||
223 | */ | ||
224 | private function sanitizeUTF8Text($rawText) | ||
225 | { | ||
226 | if (mb_check_encoding($rawText, 'UTF-8')) { | ||
227 | return $rawText; | ||
228 | } | ||
229 | |||
230 | return iconv('UTF-8', 'UTF-8//IGNORE', $rawText); | ||
231 | } | ||
232 | |||
233 | /** | ||
180 | * Stock entry with fetched or imported content. | 234 | * Stock entry with fetched or imported content. |
181 | * Will fall back to OpenGraph data if available. | 235 | * Will fall back to OpenGraph data if available. |
182 | * | 236 | * |
diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 51df8de1..3f3c60d0 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php | |||
@@ -531,6 +531,250 @@ class ContentProxyTest extends TestCase | |||
531 | $this->assertSame('1.1.1.1', $entry->getDomainName()); | 531 | $this->assertSame('1.1.1.1', $entry->getDomainName()); |
532 | } | 532 | } |
533 | 533 | ||
534 | public function testWebsiteWithValidUTF8Title_doNothing() | ||
535 | { | ||
536 | // You can use https://www.online-toolz.com/tools/text-hex-convertor.php to convert UTF-8 text <=> hex | ||
537 | // See http://graphemica.com for more info about the characters | ||
538 | // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 | ||
539 | $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '7A'); | ||
540 | |||
541 | $tagger = $this->getTaggerMock(); | ||
542 | $tagger->expects($this->once()) | ||
543 | ->method('tag'); | ||
544 | |||
545 | $graby = $this->getMockBuilder('Graby\Graby') | ||
546 | ->setMethods(['fetchContent']) | ||
547 | ->disableOriginalConstructor() | ||
548 | ->getMock(); | ||
549 | |||
550 | $graby->expects($this->any()) | ||
551 | ->method('fetchContent') | ||
552 | ->willReturn([ | ||
553 | 'html' => false, | ||
554 | 'title' => $actualTitle, | ||
555 | 'url' => '', | ||
556 | 'content_type' => 'text/html', | ||
557 | 'language' => '', | ||
558 | ]); | ||
559 | |||
560 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
561 | $entry = new Entry(new User()); | ||
562 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
563 | |||
564 | // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 | ||
565 | $expectedTitle = 'F09F98BB' . 'E284A4' . '7A'; | ||
566 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
567 | } | ||
568 | |||
569 | public function testWebsiteWithInvalidUTF8Title_removeInvalidCharacter() | ||
570 | { | ||
571 | // See http://graphemica.com for more info about the characters | ||
572 | // 'a€b' (61;80;62) in hexadecimal and WINDOWS-1252 - but 80 is a invalid UTF-8 character. | ||
573 | // The correct UTF-8 € character (U+20AC) is E282AC | ||
574 | $actualTitle = $this->hexToStr('61' . '80' . '62'); | ||
575 | |||
576 | $tagger = $this->getTaggerMock(); | ||
577 | $tagger->expects($this->once()) | ||
578 | ->method('tag'); | ||
579 | |||
580 | $graby = $this->getMockBuilder('Graby\Graby') | ||
581 | ->setMethods(['fetchContent']) | ||
582 | ->disableOriginalConstructor() | ||
583 | ->getMock(); | ||
584 | |||
585 | $graby->expects($this->any()) | ||
586 | ->method('fetchContent') | ||
587 | ->willReturn([ | ||
588 | 'html' => false, | ||
589 | 'title' => $actualTitle, | ||
590 | 'url' => '', | ||
591 | 'content_type' => 'text/html', | ||
592 | 'language' => '', | ||
593 | ]); | ||
594 | |||
595 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
596 | $entry = new Entry(new User()); | ||
597 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
598 | |||
599 | // 'ab' (61;62) because all invalid UTF-8 character (like 80) are removed | ||
600 | $expectedTitle = '61' . '62'; | ||
601 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
602 | } | ||
603 | |||
604 | public function testPdfWithUTF16BETitle_convertToUTF8() | ||
605 | { | ||
606 | // See http://graphemica.com for more info about the characters | ||
607 | // '😻' (U+1F63B;D83DDE3B) in hexadecimal and as UTF16BE | ||
608 | $actualTitle = $this->hexToStr('D83DDE3B'); | ||
609 | |||
610 | $tagger = $this->getTaggerMock(); | ||
611 | $tagger->expects($this->once()) | ||
612 | ->method('tag'); | ||
613 | |||
614 | $graby = $this->getMockBuilder('Graby\Graby') | ||
615 | ->setMethods(['fetchContent']) | ||
616 | ->disableOriginalConstructor() | ||
617 | ->getMock(); | ||
618 | |||
619 | $graby->expects($this->any()) | ||
620 | ->method('fetchContent') | ||
621 | ->willReturn([ | ||
622 | 'html' => false, | ||
623 | 'title' => $actualTitle, | ||
624 | 'url' => '', | ||
625 | 'content_type' => 'application/pdf', | ||
626 | 'language' => '', | ||
627 | ]); | ||
628 | |||
629 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
630 | $entry = new Entry(new User()); | ||
631 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
632 | |||
633 | // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8 | ||
634 | $expectedTitle = 'F09F98BB'; | ||
635 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
636 | } | ||
637 | |||
638 | public function testPdfWithUTF8Title_doNothing() | ||
639 | { | ||
640 | // See http://graphemica.com for more info about the characters | ||
641 | // '😻' (U+1F63B;D83DDE3B) in hexadecimal and as UTF8 | ||
642 | $actualTitle = $this->hexToStr('F09F98BB'); | ||
643 | |||
644 | $tagger = $this->getTaggerMock(); | ||
645 | $tagger->expects($this->once()) | ||
646 | ->method('tag'); | ||
647 | |||
648 | $graby = $this->getMockBuilder('Graby\Graby') | ||
649 | ->setMethods(['fetchContent']) | ||
650 | ->disableOriginalConstructor() | ||
651 | ->getMock(); | ||
652 | |||
653 | $graby->expects($this->any()) | ||
654 | ->method('fetchContent') | ||
655 | ->willReturn([ | ||
656 | 'html' => false, | ||
657 | 'title' => $actualTitle, | ||
658 | 'url' => '', | ||
659 | 'content_type' => 'application/pdf', | ||
660 | 'language' => '', | ||
661 | ]); | ||
662 | |||
663 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
664 | $entry = new Entry(new User()); | ||
665 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
666 | |||
667 | // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8 | ||
668 | $expectedTitle = 'F09F98BB'; | ||
669 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
670 | } | ||
671 | |||
672 | public function testPdfWithWINDOWS1252Title_convertToUTF8() | ||
673 | { | ||
674 | // See http://graphemica.com for more info about the characters | ||
675 | // '€' (80) in hexadecimal and WINDOWS-1252 | ||
676 | $actualTitle = $this->hexToStr('80'); | ||
677 | |||
678 | $tagger = $this->getTaggerMock(); | ||
679 | $tagger->expects($this->once()) | ||
680 | ->method('tag'); | ||
681 | |||
682 | $graby = $this->getMockBuilder('Graby\Graby') | ||
683 | ->setMethods(['fetchContent']) | ||
684 | ->disableOriginalConstructor() | ||
685 | ->getMock(); | ||
686 | |||
687 | $graby->expects($this->any()) | ||
688 | ->method('fetchContent') | ||
689 | ->willReturn([ | ||
690 | 'html' => false, | ||
691 | 'title' => $actualTitle, | ||
692 | 'url' => '', | ||
693 | 'content_type' => 'application/pdf', | ||
694 | 'language' => '', | ||
695 | ]); | ||
696 | |||
697 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
698 | $entry = new Entry(new User()); | ||
699 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
700 | |||
701 | // '€' (U+20AC or E282AC) in hexadecimal and UTF-8 | ||
702 | $expectedTitle = 'E282AC'; | ||
703 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
704 | } | ||
705 | |||
706 | public function testPdfWithInvalidCharacterInTitle_removeInvalidCharacter() | ||
707 | { | ||
708 | // See http://graphemica.com for more info about the characters | ||
709 | // '😻ℤ�z' (U+1F63B or F09F98BB; U+2124 or E284A4; invalid character 81; U+007A or 7A) in hexadecimal and UTF-8 | ||
710 | // 0x81 is not a valid character for UTF16, UTF8 and WINDOWS-1252 | ||
711 | $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A'); | ||
712 | |||
713 | $tagger = $this->getTaggerMock(); | ||
714 | $tagger->expects($this->once()) | ||
715 | ->method('tag'); | ||
716 | |||
717 | $graby = $this->getMockBuilder('Graby\Graby') | ||
718 | ->setMethods(['fetchContent']) | ||
719 | ->disableOriginalConstructor() | ||
720 | ->getMock(); | ||
721 | |||
722 | $graby->expects($this->any()) | ||
723 | ->method('fetchContent') | ||
724 | ->willReturn([ | ||
725 | 'html' => false, | ||
726 | 'title' => $actualTitle, | ||
727 | 'url' => '', | ||
728 | 'content_type' => 'application/pdf', | ||
729 | 'language' => '', | ||
730 | ]); | ||
731 | |||
732 | $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage); | ||
733 | $entry = new Entry(new User()); | ||
734 | $proxy->updateEntry($entry, 'http://0.0.0.0'); | ||
735 | |||
736 | // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8 | ||
737 | // the 0x81 (represented by �) is invalid for UTF16, UTF8 and WINDOWS-1252 and is removed | ||
738 | $expectedTitle = 'F09F98BB' . 'E284A4' . '7A'; | ||
739 | $this->assertSame($expectedTitle, $this->strToHex($entry->getTitle())); | ||
740 | } | ||
741 | |||
742 | /** | ||
743 | * https://stackoverflow.com/a/18506801. | ||
744 | * | ||
745 | * @param $string | ||
746 | * | ||
747 | * @return string | ||
748 | */ | ||
749 | private function strToHex($string) | ||
750 | { | ||
751 | $hex = ''; | ||
752 | for ($i = 0; $i < \strlen($string); ++$i) { | ||
753 | $ord = \ord($string[$i]); | ||
754 | $hexCode = dechex($ord); | ||
755 | $hex .= substr('0' . $hexCode, -2); | ||
756 | } | ||
757 | |||
758 | return strtoupper($hex); | ||
759 | } | ||
760 | |||
761 | /** | ||
762 | * https://stackoverflow.com/a/18506801. | ||
763 | * | ||
764 | * @param $hex | ||
765 | * | ||
766 | * @return string | ||
767 | */ | ||
768 | private function hexToStr($hex) | ||
769 | { | ||
770 | $string = ''; | ||
771 | for ($i = 0; $i < \strlen($hex) - 1; $i += 2) { | ||
772 | $string .= \chr(hexdec($hex[$i] . $hex[$i + 1])); | ||
773 | } | ||
774 | |||
775 | return $string; | ||
776 | } | ||
777 | |||
534 | private function getTaggerMock() | 778 | private function getTaggerMock() |
535 | { | 779 | { |
536 | return $this->getMockBuilder(RuleBasedTagger::class) | 780 | return $this->getMockBuilder(RuleBasedTagger::class) |