about | summary | refs | log | tree | commit | diff | homepage
path: root/application/bookmark
diff options
context:
space:
mode:
author ArthurHoaro <arthur@hoa.ro> 2020-10-13 12:26:55 +0200
committer GitHub <noreply@github.com> 2020-10-13 12:26:55 +0200
commit 458b6b9918ec27154dd45416947bb93bedb97109 (patch)
tree c1c565def0a4fffac5d0556794451c49fc4d52e4 /application/bookmark
parent 543b16b4f4bbde4e9857490e2175e44b4d941eb3 (diff)
parent 2cd0509b503332b1989f06da45d569d4d2929be5 (diff)
download Shaarli-458b6b9918ec27154dd45416947bb93bedb97109.tar.gz
Shaarli-458b6b9918ec27154dd45416947bb93bedb97109.tar.zst
Shaarli-458b6b9918ec27154dd45416947bb93bedb97109.zip
Merge pull request #1540 from ArthurHoaro/fix/metadata-regexes
Improve regex to extract HTML metadata (title, description, etc.)
Diffstat (limited to 'application/bookmark')
-rw-r--r-- application/bookmark/LinkUtils.php 6
1 files changed, 4 insertions, 2 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index e7af4d55..faf5dbfd 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -66,11 +66,13 @@ function html_extract_tag($tag, $html)
66{ 66{
67 $propertiesKey = ['property', 'name', 'itemprop']; 67 $propertiesKey = ['property', 'name', 'itemprop'];
68 $properties = implode('|', $propertiesKey); 68 $properties = implode('|', $propertiesKey);
69 // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
70 $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
69 // Try to retrieve OpenGraph image. 71 // Try to retrieve OpenGraph image.
70 $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; 72 $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
71 // If the attributes are not in the order property => content (e.g. Github) 73 // If the attributes are not in the order property => content (e.g. Github)
72 // New regex to keep this readable... more or less. 74 // New regex to keep this readable... more or less.
73 $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; 75 $ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
74 76
75 if (preg_match($ogRegex, $html, $matches) > 0 77 if (preg_match($ogRegex, $html, $matches) > 0
76 || preg_match($ogRegexReverse, $html, $matches) > 0 78 || preg_match($ogRegexReverse, $html, $matches) > 0