From 2cd0509b503332b1989f06da45d569d4d2929be5 Mon Sep 17 00:00:00 2001
From: ArthurHoaro <arthur@hoa.ro>
Date: Thu, 3 Sep 2020 17:46:26 +0200
Subject: Improve regex to extract HTML metadata (title, description, etc.)

Also added a bunch of tests to cover more use cases.

Fixes #1375
---
 application/bookmark/LinkUtils.php | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'application/bookmark')

diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 68914fca..03e1b82a 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -66,11 +66,13 @@ function html_extract_tag($tag, $html)
 {
     $propertiesKey = ['property', 'name', 'itemprop'];
     $properties = implode('|', $propertiesKey);
+    // We need an OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
+    $orCondition  = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
     // Try to retrieve OpenGraph image.
-    $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
+    $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
     // If the attributes are not in the order property => content (e.g. Github)
     // New regex to keep this readable... more or less.
-    $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
+    $ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
 
     if (preg_match($ogRegex, $html, $matches) > 0
         || preg_match($ogRegexReverse, $html, $matches) > 0
-- 
cgit v1.2.3