about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author ArthurHoaro <arthur@hoa.ro> 2020-09-03 17:46:26 +0200
committer ArthurHoaro <arthur@hoa.ro> 2020-09-03 17:46:26 +0200
commit 2cd0509b503332b1989f06da45d569d4d2929be5 (patch)
tree 7aa76192ea42a640b7238114fad1acd31ccc4960
parent 21163a3329ef19dc6ebadb75d6452ac02fd59ab3 (diff)
download Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.tar.gz
Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.tar.zst
Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.zip
Improve regex to extract HTML metadata (title, description, etc.)
Also added a bunch of tests to cover more use cases. Fixes #1375
-rw-r--r-- application/bookmark/LinkUtils.php 6
-rw-r--r-- tests/bookmark/LinkUtilsTest.php 89
2 files changed, 93 insertions, 2 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 68914fca..03e1b82a 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -66,11 +66,13 @@ function html_extract_tag($tag, $html)
66{ 66{
67 $propertiesKey = ['property', 'name', 'itemprop']; 67 $propertiesKey = ['property', 'name', 'itemprop'];
68 $properties = implode('|', $propertiesKey); 68 $properties = implode('|', $propertiesKey);
69 // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
70 $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
69 // Try to retrieve OpenGraph image. 71 // Try to retrieve OpenGraph image.
70 $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; 72 $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
71 // If the attributes are not in the order property => content (e.g. Github) 73 // If the attributes are not in the order property => content (e.g. Github)
72 // New regex to keep this readable... more or less. 74 // New regex to keep this readable... more or less.
73 $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; 75 $ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
74 76
75 if (preg_match($ogRegex, $html, $matches) > 0 77 if (preg_match($ogRegex, $html, $matches) > 0
76 || preg_match($ogRegexReverse, $html, $matches) > 0 78 || preg_match($ogRegexReverse, $html, $matches) > 0
diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php
index 7d4a7b89..cc7819bc 100644
--- a/tests/bookmark/LinkUtilsTest.php
+++ b/tests/bookmark/LinkUtilsTest.php
@@ -81,8 +81,78 @@ class LinkUtilsTest extends TestCase
81 public function testHtmlExtractExistentNameTag() 81 public function testHtmlExtractExistentNameTag()
82 { 82 {
83 $description = 'Bob and Alice share cookies.'; 83 $description = 'Bob and Alice share cookies.';
84
85 // Simple one line
84 $html = '<html><meta>stuff2</meta><meta name="description" content="' . $description . '"/></html>'; 86 $html = '<html><meta>stuff2</meta><meta name="description" content="' . $description . '"/></html>';
85 $this->assertEquals($description, html_extract_tag('description', $html)); 87 $this->assertEquals($description, html_extract_tag('description', $html));
88
89 // Simple OpenGraph
90 $html = '<meta property="og:description" content="' . $description . '">';
91 $this->assertEquals($description, html_extract_tag('description', $html));
92
93 // Simple reversed OpenGraph
94 $html = '<meta content="' . $description . '" property="og:description">';
95 $this->assertEquals($description, html_extract_tag('description', $html));
96
97 // ItemProp OpenGraph
98 $html = '<meta itemprop="og:description" content="' . $description . '">';
99 $this->assertEquals($description, html_extract_tag('description', $html));
100
101 // OpenGraph without quotes
102 $html = '<meta property=og:description content="' . $description . '">';
103 $this->assertEquals($description, html_extract_tag('description', $html));
104
105 // OpenGraph reversed without quotes
106 $html = '<meta content="' . $description . '" property=og:description>';
107 $this->assertEquals($description, html_extract_tag('description', $html));
108
109 // OpenGraph with noise
110 $html = '<meta tag1="content1" property="og:description" tag2="content2" content="' .
111 $description . '" tag3="content3">';
112 $this->assertEquals($description, html_extract_tag('description', $html));
113
114 // OpenGraph reversed with noise
115 $html = '<meta tag1="content1" content="' . $description . '" ' .
116 'tag3="content3" tag2="content2" property="og:description">';
117 $this->assertEquals($description, html_extract_tag('description', $html));
118
119 // OpenGraph multiple properties start
120 $html = '<meta property="unrelated og:description" content="' . $description . '">';
121 $this->assertEquals($description, html_extract_tag('description', $html));
122
123 // OpenGraph multiple properties end
124 $html = '<meta property="og:description unrelated" content="' . $description . '">';
125 $this->assertEquals($description, html_extract_tag('description', $html));
126
127 // OpenGraph multiple properties both end
128 $html = '<meta property="og:unrelated1 og:description og:unrelated2" content="' . $description . '">';
129 $this->assertEquals($description, html_extract_tag('description', $html));
130
131 // OpenGraph multiple properties both end with noise
132 $html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
133 'tag2="content2" content="' . $description . '" tag3="content3">';
134 $this->assertEquals($description, html_extract_tag('description', $html));
135
136 // OpenGraph reversed multiple properties start
137 $html = '<meta content="' . $description . '" property="unrelated og:description">';
138 $this->assertEquals($description, html_extract_tag('description', $html));
139
140 // OpenGraph reversed multiple properties end
141 $html = '<meta content="' . $description . '" property="og:description unrelated">';
142 $this->assertEquals($description, html_extract_tag('description', $html));
143
144 // OpenGraph reversed multiple properties both end
145 $html = '<meta content="' . $description . '" property="og:unrelated1 og:description og:unrelated2">';
146 $this->assertEquals($description, html_extract_tag('description', $html));
147
148 // OpenGraph reversed multiple properties both end with noise
149 $html = '<meta tag1="content1" content="' . $description . '" tag2="content2" '.
150 'property="og:unrelated1 og:description og:unrelated2" tag3="content3">';
151 $this->assertEquals($description, html_extract_tag('description', $html));
152
153 // Suggestion from #1375
154 $html = '<meta property="og:description" name="description" content="' . $description . '">';
155 $this->assertEquals($description, html_extract_tag('description', $html));
86 } 156 }
87 157
88 /** 158 /**
@@ -92,6 +162,25 @@ class LinkUtilsTest extends TestCase
92 { 162 {
93 $html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>'; 163 $html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>';
94 $this->assertFalse(html_extract_tag('description', $html)); 164 $this->assertFalse(html_extract_tag('description', $html));
165
166 // Partial meta tag
167 $html = '<meta content="Brief description">';
168 $this->assertFalse(html_extract_tag('description', $html));
169
170 $html = '<meta property="og:description">';
171 $this->assertFalse(html_extract_tag('description', $html));
172
173 $html = '<meta tag1="content1" property="og:description">';
174 $this->assertFalse(html_extract_tag('description', $html));
175
176 $html = '<meta property="og:description" tag1="content1">';
177 $this->assertFalse(html_extract_tag('description', $html));
178
179 $html = '<meta tag1="content1" content="Brief description">';
180 $this->assertFalse(html_extract_tag('description', $html));
181
182 $html = '<meta content="Brief description" tag1="content1">';
183 $this->assertFalse(html_extract_tag('description', $html));
95 } 184 }
96 185
97 /** 186 /**