aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJeremy Benoist <jeremy.benoist@gmail.com>2017-05-12 07:53:21 +0200
committerJeremy Benoist <jeremy.benoist@gmail.com>2017-05-31 14:00:15 +0200
commit74a75f7d430eb7a69cd377194e52012db34d39b4 (patch)
treebb85741afe742e24351167699c434a955ab4a9fa
parentfb436e8ca0c7468b9698050df0b78447e2d0854f (diff)
downloadwallabag-74a75f7d430eb7a69cd377194e52012db34d39b4.tar.gz
wallabag-74a75f7d430eb7a69cd377194e52012db34d39b4.tar.zst
wallabag-74a75f7d430eb7a69cd377194e52012db34d39b4.zip
Use graby ContentExtractor to clean html
It might be better to re-use some graby functionalities to clean html instead of building a new system.
-rw-r--r--composer.json2
-rw-r--r--src/Wallabag/ApiBundle/Controller/EntryRestController.php1
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php10
-rw-r--r--tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php55
4 files changed, 66 insertions, 2 deletions
diff --git a/composer.json b/composer.json
index d8c58de2..31cfb6a1 100644
--- a/composer.json
+++ b/composer.json
@@ -64,7 +64,7 @@
64 "htmlawed/htmlawed": "~1.1.19", 64 "htmlawed/htmlawed": "~1.1.19",
65 "liip/theme-bundle": "~1.1", 65 "liip/theme-bundle": "~1.1",
66 "lexik/form-filter-bundle": "~5.0", 66 "lexik/form-filter-bundle": "~5.0",
67 "j0k3r/graby": "~1.0", 67 "j0k3r/graby": "dev-extractor",
68 "friendsofsymfony/user-bundle": "^2.0", 68 "friendsofsymfony/user-bundle": "^2.0",
69 "friendsofsymfony/oauth-server-bundle": "^1.5", 69 "friendsofsymfony/oauth-server-bundle": "^1.5",
70 "stof/doctrine-extensions-bundle": "^1.2", 70 "stof/doctrine-extensions-bundle": "^1.2",
diff --git a/src/Wallabag/ApiBundle/Controller/EntryRestController.php b/src/Wallabag/ApiBundle/Controller/EntryRestController.php
index e6bbe552..0930c109 100644
--- a/src/Wallabag/ApiBundle/Controller/EntryRestController.php
+++ b/src/Wallabag/ApiBundle/Controller/EntryRestController.php
@@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController
336 $entry->setUrl($url); 336 $entry->setUrl($url);
337 } 337 }
338 338
339
340 if (!empty($tags)) { 339 if (!empty($tags)) {
341 $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags); 340 $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags);
342 } 341 }
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index e06ad3d6..a1df16d8 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -47,6 +47,16 @@ class ContentProxy
47 { 47 {
48 // ensure content is a bit cleaned up 48 // ensure content is a bit cleaned up
49 if (!empty($content['html'])) { 49 if (!empty($content['html'])) {
50 $extractor = $this->graby->getExtractor();
51 $contentExtracted = $extractor->process($content['html'], $url);
52
53 if ($contentExtracted) {
54 $contentBlock = $extractor->getContent();
55 $contentBlock->normalize();
56
57 $content['html'] = trim($contentBlock->innerHTML);
58 }
59
50 $content['html'] = htmLawed($content['html'], [ 60 $content['html'] = htmLawed($content['html'], [
51 'safe' => 1, 61 'safe' => 1,
52 // which means: do not remove iframe elements 62 // which means: do not remove iframe elements
diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
index 44fca073..7a50b373 100644
--- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
+++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
@@ -8,6 +8,7 @@ use Wallabag\CoreBundle\Entity\Entry;
8use Wallabag\CoreBundle\Entity\Tag; 8use Wallabag\CoreBundle\Entity\Tag;
9use Wallabag\UserBundle\Entity\User; 9use Wallabag\UserBundle\Entity\User;
10use Wallabag\CoreBundle\Helper\RuleBasedTagger; 10use Wallabag\CoreBundle\Helper\RuleBasedTagger;
11use Graby\Graby;
11 12
12class ContentProxyTest extends \PHPUnit_Framework_TestCase 13class ContentProxyTest extends \PHPUnit_Framework_TestCase
13{ 14{
@@ -253,6 +254,60 @@ class ContentProxyTest extends \PHPUnit_Framework_TestCase
253 $this->assertCount(0, $entry->getTags()); 254 $this->assertCount(0, $entry->getTags());
254 } 255 }
255 256
257 public function dataForCrazyHtml()
258 {
259 return [
260 'script and comment' => [
261 '<strong>Script inside:</strong> <!--[if gte IE 4]><script>alert(\'lol\');</script><![endif]--><br />',
262 'lol'
263 ],
264 'script' => [
265 '<strong>Script inside:</strong><script>alert(\'lol\');</script>',
266 'script'
267 ],
268 ];
269 }
270
271 /**
272 * @dataProvider dataForCrazyHtml
273 */
274 public function testWithCrazyHtmlContent($html, $escapedString)
275 {
276 $tagger = $this->getTaggerMock();
277 $tagger->expects($this->once())
278 ->method('tag');
279
280 $graby = new Graby();
281
282 $proxy = new ContentProxy($graby, $tagger, $this->getTagRepositoryMock(), $this->getLogger(), $this->fetchingErrorMessage);
283 $entry = $proxy->updateEntry(
284 new Entry(new User()),
285 'http://1.1.1.1',
286 [
287 'html' => $html,
288 'title' => 'this is my title',
289 'url' => 'http://1.1.1.1',
290 'content_type' => 'text/html',
291 'language' => 'fr',
292 'status' => '200',
293 'open_graph' => [
294 'og_title' => 'my OG title',
295 'og_description' => 'OG desc',
296 'og_image' => 'http://3.3.3.3/cover.jpg',
297 ],
298 ]
299 );
300
301 $this->assertEquals('http://1.1.1.1', $entry->getUrl());
302 $this->assertEquals('this is my title', $entry->getTitle());
303 $this->assertNotContains($escapedString, $entry->getContent());
304 $this->assertEquals('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
305 $this->assertEquals('text/html', $entry->getMimetype());
306 $this->assertEquals('fr', $entry->getLanguage());
307 $this->assertEquals('200', $entry->getHttpStatus());
308 $this->assertEquals('1.1.1.1', $entry->getDomainName());
309 }
310
256 private function getTaggerMock() 311 private function getTaggerMock()
257 { 312 {
258 return $this->getMockBuilder(RuleBasedTagger::class) 313 return $this->getMockBuilder(RuleBasedTagger::class)