git.immae.eu Git - github/wallabag/wallabag.git/blobdiff - src/Wallabag/CoreBundle/Helper/ContentProxy.php
Use graby ContentExtractor to clean html
[github/wallabag/wallabag.git] / src / Wallabag / CoreBundle / Helper / ContentProxy.php
index e06ad3d6602ca19cac904d849d7dff0470b4aaf3..a1df16d8cc4184534c856e5c8124b5a5228368a6 100644 (file)
@@ -47,6 +47,16 @@ class ContentProxy
     {
         // ensure content is a bit cleaned up
         if (!empty($content['html'])) {
+            $extractor = $this->graby->getExtractor();
+            $contentExtracted = $extractor->process($content['html'], $url);
+
+            if ($contentExtracted) {
+                $contentBlock = $extractor->getContent();
+                $contentBlock->normalize();
+
+                $content['html'] = trim($contentBlock->innerHTML);
+            }
+
             $content['html'] = htmLawed($content['html'], [
                 'safe' => 1,
                 // which means: do not remove iframe elements