diff options
author | Jeremy Benoist <jeremy.benoist@gmail.com> | 2017-05-12 07:53:21 +0200 |
---|---|---|
committer | Jeremy Benoist <jeremy.benoist@gmail.com> | 2017-05-31 14:00:15 +0200 |
commit | 74a75f7d430eb7a69cd377194e52012db34d39b4 (patch) | |
tree | bb85741afe742e24351167699c434a955ab4a9fa | |
parent | fb436e8ca0c7468b9698050df0b78447e2d0854f (diff) | |
download | wallabag-74a75f7d430eb7a69cd377194e52012db34d39b4.tar.gz wallabag-74a75f7d430eb7a69cd377194e52012db34d39b4.tar.zst wallabag-74a75f7d430eb7a69cd377194e52012db34d39b4.zip |
Use graby ContentExtractor to clean html
It might be better to re-use some graby functionalities to clean html instead of building a new system.
-rw-r--r-- | composer.json | 2 | ||||
-rw-r--r-- | src/Wallabag/ApiBundle/Controller/EntryRestController.php | 1 | ||||
-rw-r--r-- | src/Wallabag/CoreBundle/Helper/ContentProxy.php | 10 | ||||
-rw-r--r-- | tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php | 55 |
4 files changed, 66 insertions, 2 deletions
diff --git a/composer.json b/composer.json index d8c58de2..31cfb6a1 100644 --- a/composer.json +++ b/composer.json | |||
@@ -64,7 +64,7 @@ | |||
64 | "htmlawed/htmlawed": "~1.1.19", | 64 | "htmlawed/htmlawed": "~1.1.19", |
65 | "liip/theme-bundle": "~1.1", | 65 | "liip/theme-bundle": "~1.1", |
66 | "lexik/form-filter-bundle": "~5.0", | 66 | "lexik/form-filter-bundle": "~5.0", |
67 | "j0k3r/graby": "~1.0", | 67 | "j0k3r/graby": "dev-extractor", |
68 | "friendsofsymfony/user-bundle": "^2.0", | 68 | "friendsofsymfony/user-bundle": "^2.0", |
69 | "friendsofsymfony/oauth-server-bundle": "^1.5", | 69 | "friendsofsymfony/oauth-server-bundle": "^1.5", |
70 | "stof/doctrine-extensions-bundle": "^1.2", | 70 | "stof/doctrine-extensions-bundle": "^1.2", |
diff --git a/src/Wallabag/ApiBundle/Controller/EntryRestController.php b/src/Wallabag/ApiBundle/Controller/EntryRestController.php index e6bbe552..0930c109 100644 --- a/src/Wallabag/ApiBundle/Controller/EntryRestController.php +++ b/src/Wallabag/ApiBundle/Controller/EntryRestController.php | |||
@@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController | |||
336 | $entry->setUrl($url); | 336 | $entry->setUrl($url); |
337 | } | 337 | } |
338 | 338 | ||
339 | |||
340 | if (!empty($tags)) { | 339 | if (!empty($tags)) { |
341 | $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags); | 340 | $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags); |
342 | } | 341 | } |
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index e06ad3d6..a1df16d8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php | |||
@@ -47,6 +47,16 @@ class ContentProxy | |||
47 | { | 47 | { |
48 | // ensure content is a bit cleaned up | 48 | // ensure content is a bit cleaned up |
49 | if (!empty($content['html'])) { | 49 | if (!empty($content['html'])) { |
50 | $extractor = $this->graby->getExtractor(); | ||
51 | $contentExtracted = $extractor->process($content['html'], $url); | ||
52 | |||
53 | if ($contentExtracted) { | ||
54 | $contentBlock = $extractor->getContent(); | ||
55 | $contentBlock->normalize(); | ||
56 | |||
57 | $content['html'] = trim($contentBlock->innerHTML); | ||
58 | } | ||
59 | |||
50 | $content['html'] = htmLawed($content['html'], [ | 60 | $content['html'] = htmLawed($content['html'], [ |
51 | 'safe' => 1, | 61 | 'safe' => 1, |
52 | // which means: do not remove iframe elements | 62 | // which means: do not remove iframe elements |
diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php index 44fca073..7a50b373 100644 --- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php +++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php | |||
@@ -8,6 +8,7 @@ use Wallabag\CoreBundle\Entity\Entry; | |||
8 | use Wallabag\CoreBundle\Entity\Tag; | 8 | use Wallabag\CoreBundle\Entity\Tag; |
9 | use Wallabag\UserBundle\Entity\User; | 9 | use Wallabag\UserBundle\Entity\User; |
10 | use Wallabag\CoreBundle\Helper\RuleBasedTagger; | 10 | use Wallabag\CoreBundle\Helper\RuleBasedTagger; |
11 | use Graby\Graby; | ||
11 | 12 | ||
12 | class ContentProxyTest extends \PHPUnit_Framework_TestCase | 13 | class ContentProxyTest extends \PHPUnit_Framework_TestCase |
13 | { | 14 | { |
@@ -253,6 +254,60 @@ class ContentProxyTest extends \PHPUnit_Framework_TestCase | |||
253 | $this->assertCount(0, $entry->getTags()); | 254 | $this->assertCount(0, $entry->getTags()); |
254 | } | 255 | } |
255 | 256 | ||
257 | public function dataForCrazyHtml() | ||
258 | { | ||
259 | return [ | ||
260 | 'script and comment' => [ | ||
261 | '<strong>Script inside:</strong> <!--[if gte IE 4]><script>alert(\'lol\');</script><![endif]--><br />', | ||
262 | 'lol' | ||
263 | ], | ||
264 | 'script' => [ | ||
265 | '<strong>Script inside:</strong><script>alert(\'lol\');</script>', | ||
266 | 'script' | ||
267 | ], | ||
268 | ]; | ||
269 | } | ||
270 | |||
271 | /** | ||
272 | * @dataProvider dataForCrazyHtml | ||
273 | */ | ||
274 | public function testWithCrazyHtmlContent($html, $escapedString) | ||
275 | { | ||
276 | $tagger = $this->getTaggerMock(); | ||
277 | $tagger->expects($this->once()) | ||
278 | ->method('tag'); | ||
279 | |||
280 | $graby = new Graby(); | ||
281 | |||
282 | $proxy = new ContentProxy($graby, $tagger, $this->getTagRepositoryMock(), $this->getLogger(), $this->fetchingErrorMessage); | ||
283 | $entry = $proxy->updateEntry( | ||
284 | new Entry(new User()), | ||
285 | 'http://1.1.1.1', | ||
286 | [ | ||
287 | 'html' => $html, | ||
288 | 'title' => 'this is my title', | ||
289 | 'url' => 'http://1.1.1.1', | ||
290 | 'content_type' => 'text/html', | ||
291 | 'language' => 'fr', | ||
292 | 'status' => '200', | ||
293 | 'open_graph' => [ | ||
294 | 'og_title' => 'my OG title', | ||
295 | 'og_description' => 'OG desc', | ||
296 | 'og_image' => 'http://3.3.3.3/cover.jpg', | ||
297 | ], | ||
298 | ] | ||
299 | ); | ||
300 | |||
301 | $this->assertEquals('http://1.1.1.1', $entry->getUrl()); | ||
302 | $this->assertEquals('this is my title', $entry->getTitle()); | ||
303 | $this->assertNotContains($escapedString, $entry->getContent()); | ||
304 | $this->assertEquals('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture()); | ||
305 | $this->assertEquals('text/html', $entry->getMimetype()); | ||
306 | $this->assertEquals('fr', $entry->getLanguage()); | ||
307 | $this->assertEquals('200', $entry->getHttpStatus()); | ||
308 | $this->assertEquals('1.1.1.1', $entry->getDomainName()); | ||
309 | } | ||
310 | |||
256 | private function getTaggerMock() | 311 | private function getTaggerMock() |
257 | { | 312 | { |
258 | return $this->getMockBuilder(RuleBasedTagger::class) | 313 | return $this->getMockBuilder(RuleBasedTagger::class) |