From f39c5a2a702036750b4d7c32d02e7f92955a4eed Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sun, 11 Aug 2019 23:51:55 +0200 Subject: Add new Helper to process Ignore Origin rules and RulerZ operator This commit adds a new helper like RuleBasedTagger for processing ignore origin rules. It also adds a new custom RulerZ operator for the '~' pattern matching rule. Renames 'pattern' to '_all' in IgnoreOriginRule entity. Signed-off-by: Kevin Decherf --- .../Helper/RuleBasedIgnoreOriginProcessor.php | 50 ++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/Wallabag/CoreBundle/Helper/RuleBasedIgnoreOriginProcessor.php (limited to 'src/Wallabag/CoreBundle/Helper') diff --git a/src/Wallabag/CoreBundle/Helper/RuleBasedIgnoreOriginProcessor.php b/src/Wallabag/CoreBundle/Helper/RuleBasedIgnoreOriginProcessor.php new file mode 100644 index 00000000..333e5b0a --- /dev/null +++ b/src/Wallabag/CoreBundle/Helper/RuleBasedIgnoreOriginProcessor.php @@ -0,0 +1,50 @@ +rulerz = $rulerz; + $this->logger = $logger; + $this->ignoreOriginInstanceRuleRepository = $ignoreOriginInstanceRuleRepository; + } + + /** + * @param Entry $entry Entry to process + * + * @return bool + */ + public function process(Entry $entry) + { + $url = $entry->getUrl(); + $userRules = $entry->getUser()->getConfig()->getIgnoreOriginRules()->toArray(); + $rules = array_merge($this->ignoreOriginInstanceRuleRepository->findAll(), $userRules); + + $parsed_url = parse_url($url); + // We add the former url as a new key _all for pattern matching + $parsed_url['_all'] = $url; + + foreach ($rules as $rule) { + if ($this->rulerz->satisfies($parsed_url, $rule->getRule())) { + $this->logger->info('Origin url matching ignore rule.', [ + 'rule' => $rule->getRule(), + ]); + + return true; + } + } + + return false; + } +} -- cgit v1.2.3 From b22eb276232b5c15a6fbadc9dd10144e709faec3 Mon Sep 17 00:00:00 2001 From: Kevin Decherf Date: Sun, 11 Aug 2019 23:55:52 +0200 Subject: ContentProxy: replace 
ignoreUrl with new RuleBasedIgnoreOriginProcessor Signed-off-by: Kevin Decherf --- src/Wallabag/CoreBundle/Helper/ContentProxy.php | 41 +++---------------------- 1 file changed, 4 insertions(+), 37 deletions(-) (limited to 'src/Wallabag/CoreBundle/Helper') diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php index 9c6fa8db..7e93249d 100644 --- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php +++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php @@ -19,6 +19,7 @@ class ContentProxy { protected $graby; protected $tagger; + protected $ignoreOriginProcessor; protected $validator; protected $logger; protected $mimeGuesser; @@ -26,10 +27,11 @@ class ContentProxy protected $eventDispatcher; protected $storeArticleHeaders; - public function __construct(Graby $graby, RuleBasedTagger $tagger, ValidatorInterface $validator, LoggerInterface $logger, $fetchingErrorMessage, $storeArticleHeaders = false) + public function __construct(Graby $graby, RuleBasedTagger $tagger, RuleBasedIgnoreOriginProcessor $ignoreOriginProcessor, ValidatorInterface $validator, LoggerInterface $logger, $fetchingErrorMessage, $storeArticleHeaders = false) { $this->graby = $graby; $this->tagger = $tagger; + $this->ignoreOriginProcessor = $ignoreOriginProcessor; $this->validator = $validator; $this->logger = $logger; $this->mimeGuesser = new MimeTypeExtensionGuesser(); @@ -356,7 +358,7 @@ class ContentProxy $diff_keys = array_keys($diff); sort($diff_keys); - if ($this->ignoreUrl($entry->getUrl())) { + if ($this->ignoreOriginProcessor->process($entry)) { $entry->setUrl($url); return false; @@ -395,41 +397,6 @@ class ContentProxy } } - /** - * Check entry url against an ignore list to replace with content url. 
- * - * XXX: move the ignore list in the database to let users handle it - * - * @param string $url url to test - * - * @return bool true if url matches ignore list otherwise false - */ - private function ignoreUrl($url) - { - $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com']; - $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*']; - - $parsed_url = parse_url($url); - - $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) { - return $var === $parsed_url['host']; - }); - - if ([] !== $filtered) { - return true; - } - - $filtered = array_filter($ignored_patterns, function ($var) use ($url) { - return preg_match("`$var`i", $url); - }); - - if ([] !== $filtered) { - return true; - } - - return false; - } - /** * Validate that the given content has at least a title, an html and a url. * -- cgit v1.2.3