git.immae.eu Git - github/wallabag/wallabag.git/commitdiff
Use graby ContentExtractor to clean html
author Jeremy Benoist <jeremy.benoist@gmail.com>
Fri, 12 May 2017 05:53:21 +0000 (07:53 +0200)
committer Jeremy Benoist <jeremy.benoist@gmail.com>
Wed, 31 May 2017 12:00:15 +0000 (14:00 +0200)
It might be better to reuse some of graby's existing functionality to clean HTML instead of building a new system.

composer.json
src/Wallabag/ApiBundle/Controller/EntryRestController.php
src/Wallabag/CoreBundle/Helper/ContentProxy.php
tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php

index d8c58de28d9800174e7c72dfe29ce8a69d4358cc..31cfb6a140f7084e65dfd1d92480384d05bf26b1 100644 (file)
@@ -64,7 +64,7 @@
         "htmlawed/htmlawed": "~1.1.19",
         "liip/theme-bundle": "~1.1",
         "lexik/form-filter-bundle": "~5.0",
-        "j0k3r/graby": "~1.0",
+        "j0k3r/graby": "dev-extractor",
         "friendsofsymfony/user-bundle": "^2.0",
         "friendsofsymfony/oauth-server-bundle": "^1.5",
         "stof/doctrine-extensions-bundle": "^1.2",
index e6bbe5528b9a2f994968dbc3c70806d55db0527b..0930c1097ce0927bd3005a904a63217f2aafd932 100644 (file)
@@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController
             $entry->setUrl($url);
         }
 
-
         if (!empty($tags)) {
             $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
         }
index e06ad3d6602ca19cac904d849d7dff0470b4aaf3..a1df16d8cc4184534c856e5c8124b5a5228368a6 100644 (file)
@@ -47,6 +47,16 @@ class ContentProxy
     {
         // ensure content is a bit cleaned up
         if (!empty($content['html'])) {
+            $extractor = $this->graby->getExtractor();
+            $contentExtracted = $extractor->process($content['html'], $url);
+
+            if ($contentExtracted) {
+                $contentBlock = $extractor->getContent();
+                $contentBlock->normalize();
+
+                $content['html'] = trim($contentBlock->innerHTML);
+            }
+
             $content['html'] = htmLawed($content['html'], [
                 'safe' => 1,
                 // which means: do not remove iframe elements
index 44fca0737280f8c3507e0cbec4270753e936cc63..7a50b3737284f002d1a2b81d2767f0798ff1ae1b 100644 (file)
@@ -8,6 +8,7 @@ use Wallabag\CoreBundle\Entity\Entry;
 use Wallabag\CoreBundle\Entity\Tag;
 use Wallabag\UserBundle\Entity\User;
 use Wallabag\CoreBundle\Helper\RuleBasedTagger;
+use Graby\Graby;
 
 class ContentProxyTest extends \PHPUnit_Framework_TestCase
 {
@@ -253,6 +254,60 @@ class ContentProxyTest extends \PHPUnit_Framework_TestCase
         $this->assertCount(0, $entry->getTags());
     }
 
+    public function dataForCrazyHtml()
+    {
+        return [
+            'script and comment' => [
+                '<strong>Script inside:</strong> <!--[if gte IE 4]><script>alert(\'lol\');</script><![endif]--><br />',
+                'lol'
+            ],
+            'script' => [
+                '<strong>Script inside:</strong><script>alert(\'lol\');</script>',
+                'script'
+            ],
+        ];
+    }
+
+    /**
+     * @dataProvider dataForCrazyHtml
+     */
+    public function testWithCrazyHtmlContent($html, $escapedString)
+    {
+        $tagger = $this->getTaggerMock();
+        $tagger->expects($this->once())
+            ->method('tag');
+
+        $graby = new Graby();
+
+        $proxy = new ContentProxy($graby, $tagger, $this->getTagRepositoryMock(), $this->getLogger(), $this->fetchingErrorMessage);
+        $entry = $proxy->updateEntry(
+            new Entry(new User()),
+            'http://1.1.1.1',
+            [
+                'html' => $html,
+                'title' => 'this is my title',
+                'url' => 'http://1.1.1.1',
+                'content_type' => 'text/html',
+                'language' => 'fr',
+                'status' => '200',
+                'open_graph' => [
+                    'og_title' => 'my OG title',
+                    'og_description' => 'OG desc',
+                    'og_image' => 'http://3.3.3.3/cover.jpg',
+                ],
+            ]
+        );
+
+        $this->assertEquals('http://1.1.1.1', $entry->getUrl());
+        $this->assertEquals('this is my title', $entry->getTitle());
+        $this->assertNotContains($escapedString, $entry->getContent());
+        $this->assertEquals('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
+        $this->assertEquals('text/html', $entry->getMimetype());
+        $this->assertEquals('fr', $entry->getLanguage());
+        $this->assertEquals('200', $entry->getHttpStatus());
+        $this->assertEquals('1.1.1.1', $entry->getDomainName());
+    }
+
     private function getTaggerMock()
     {
         return $this->getMockBuilder(RuleBasedTagger::class)