From 74a75f7d430eb7a69cd377194e52012db34d39b4 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Fri, 12 May 2017 07:53:21 +0200 Subject: Use graby ContentExtractor to clean html It might be better to re-use some graby functionalities to clean html instead of building a new system. --- src/Wallabag/ApiBundle/Controller/EntryRestController.php | 1 - src/Wallabag/CoreBundle/Helper/ContentProxy.php | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'src/Wallabag') diff --git a/src/Wallabag/ApiBundle/Controller/EntryRestController.php b/src/Wallabag/ApiBundle/Controller/EntryRestController.php index e6bbe552..0930c109 100644 --- a/src/Wallabag/ApiBundle/Controller/EntryRestController.php +++ b/src/Wallabag/ApiBundle/Controller/EntryRestController.php @@ -336,7 +336,6 @@ class EntryRestController extends WallabagRestController $entry->setUrl($url); } - if (!empty($tags)) { $this->get('wallabag_core.tags_assigner')->assignTagsToEntry($entry, $tags); } diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index e06ad3d6..a1df16d8 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -47,6 +47,16 @@ class ContentProxy { // ensure content is a bit cleaned up if (!empty($content['html'])) { + $extractor = $this->graby->getExtractor(); + $contentExtracted = $extractor->process($content['html'], $url); + + if ($contentExtracted) { + $contentBlock = $extractor->getContent(); + $contentBlock->normalize(); + + $content['html'] = trim($contentBlock->innerHTML); + } + $content['html'] = htmLawed($content['html'], [ 'safe' => 1, // which means: do not remove iframe elements -- cgit v1.2.3