From 2cd0509b503332b1989f06da45d569d4d2929be5 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 3 Sep 2020 17:46:26 +0200 Subject: Improve regex to extract HTML metadata (title, description, etc.) Also added a bunch of tests to cover more use cases. Fixes #1375 --- application/bookmark/LinkUtils.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'application/bookmark') diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 68914fca..03e1b82a 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -66,11 +66,13 @@ function html_extract_tag($tag, $html) { $propertiesKey = ['property', 'name', 'itemprop']; $properties = implode('|', $propertiesKey); + // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' + $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; // Try to retrieve OpenGraph image. - $ogRegex = '#]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; + $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; + $ogRegexReverse = '#]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; if (preg_match($ogRegex, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0 -- cgit v1.2.3