about | summary | refs | log | tree | commit | diff | homepage
path: root/application
diff options
context:
space:
mode:
author: ArthurHoaro <arthur@hoa.ro> 2020-09-03 17:46:26 +0200
committer: ArthurHoaro <arthur@hoa.ro> 2020-09-03 17:46:26 +0200
commit: 2cd0509b503332b1989f06da45d569d4d2929be5 (patch)
tree: 7aa76192ea42a640b7238114fad1acd31ccc4960 /application
parent: 21163a3329ef19dc6ebadb75d6452ac02fd59ab3 (diff)
download: Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.tar.gz
Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.tar.zst
Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.zip
Improve regex to extract HTML metadata (title, description, etc.)
Also added a bunch of tests to cover more use cases. Fixes #1375
Diffstat (limited to 'application')
-rw-r--r-- application/bookmark/LinkUtils.php 6
1 files changed, 4 insertions, 2 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 68914fca..03e1b82a 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -66,11 +66,13 @@ function html_extract_tag($tag, $html)
66{ 66{
67 $propertiesKey = ['property', 'name', 'itemprop']; 67 $propertiesKey = ['property', 'name', 'itemprop'];
68 $properties = implode('|', $propertiesKey); 68 $properties = implode('|', $propertiesKey);
69 // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
70 $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
69 // Try to retrieve OpenGraph image. 71 // Try to retrieve OpenGraph image.
70 $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; 72 $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
71 // If the attributes are not in the order property => content (e.g. Github) 73 // If the attributes are not in the order property => content (e.g. Github)
72 // New regex to keep this readable... more or less. 74 // New regex to keep this readable... more or less.
73 $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; 75 $ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
74 76
75 if (preg_match($ogRegex, $html, $matches) > 0 77 if (preg_match($ogRegex, $html, $matches) > 0
76 || preg_match($ogRegexReverse, $html, $matches) > 0 78 || preg_match($ogRegexReverse, $html, $matches) > 0