diff options
author | ArthurHoaro <arthur@hoa.ro> | 2020-09-03 17:46:26 +0200 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2020-09-03 17:46:26 +0200 |
commit | 2cd0509b503332b1989f06da45d569d4d2929be5 (patch) | |
tree | 7aa76192ea42a640b7238114fad1acd31ccc4960 | |
parent | 21163a3329ef19dc6ebadb75d6452ac02fd59ab3 (diff) | |
download | Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.tar.gz Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.tar.zst Shaarli-2cd0509b503332b1989f06da45d569d4d2929be5.zip |
Improve regex to extract HTML metadata (title, description, etc.)
Also added a bunch of tests to cover more use cases.
Fixes #1375
-rw-r--r-- | application/bookmark/LinkUtils.php | 6 | ||||
-rw-r--r-- | tests/bookmark/LinkUtilsTest.php | 89 |
2 files changed, 93 insertions, 2 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 68914fca..03e1b82a 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php | |||
@@ -66,11 +66,13 @@ function html_extract_tag($tag, $html) | |||
66 | { | 66 | { |
67 | $propertiesKey = ['property', 'name', 'itemprop']; | 67 | $propertiesKey = ['property', 'name', 'itemprop']; |
68 | $properties = implode('|', $propertiesKey); | 68 | $properties = implode('|', $propertiesKey); |
69 | // We need an OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' | ||
70 | $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; | ||
69 | // Try to retrieve OpenGraph image. | 71 | // Try to retrieve OpenGraph image. |
70 | $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; | 72 | $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; |
71 | // If the attributes are not in the order property => content (e.g. Github) | 73 | // If the attributes are not in the order property => content (e.g. Github) |
72 | // New regex to keep this readable... more or less. | 74 | // New regex to keep this readable... more or less. |
73 | $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; | 75 | $ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; |
74 | 76 | ||
75 | if (preg_match($ogRegex, $html, $matches) > 0 | 77 | if (preg_match($ogRegex, $html, $matches) > 0 |
76 | || preg_match($ogRegexReverse, $html, $matches) > 0 | 78 | || preg_match($ogRegexReverse, $html, $matches) > 0 |
diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 7d4a7b89..cc7819bc 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php | |||
@@ -81,8 +81,78 @@ class LinkUtilsTest extends TestCase | |||
81 | public function testHtmlExtractExistentNameTag() | 81 | public function testHtmlExtractExistentNameTag() |
82 | { | 82 | { |
83 | $description = 'Bob and Alice share cookies.'; | 83 | $description = 'Bob and Alice share cookies.'; |
84 | |||
85 | // Simple one line | ||
84 | $html = '<html><meta>stuff2</meta><meta name="description" content="' . $description . '"/></html>'; | 86 | $html = '<html><meta>stuff2</meta><meta name="description" content="' . $description . '"/></html>'; |
85 | $this->assertEquals($description, html_extract_tag('description', $html)); | 87 | $this->assertEquals($description, html_extract_tag('description', $html)); |
88 | |||
89 | // Simple OpenGraph | ||
90 | $html = '<meta property="og:description" content="' . $description . '">'; | ||
91 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
92 | |||
93 | // Simple reversed OpenGraph | ||
94 | $html = '<meta content="' . $description . '" property="og:description">'; | ||
95 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
96 | |||
97 | // ItemProp OpenGraph | ||
98 | $html = '<meta itemprop="og:description" content="' . $description . '">'; | ||
99 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
100 | |||
101 | // OpenGraph without quotes | ||
102 | $html = '<meta property=og:description content="' . $description . '">'; | ||
103 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
104 | |||
105 | // OpenGraph reversed without quotes | ||
106 | $html = '<meta content="' . $description . '" property=og:description>'; | ||
107 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
108 | |||
109 | // OpenGraph with noise | ||
110 | $html = '<meta tag1="content1" property="og:description" tag2="content2" content="' . | ||
111 | $description . '" tag3="content3">'; | ||
112 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
113 | |||
114 | // OpenGraph reversed with noise | ||
115 | $html = '<meta tag1="content1" content="' . $description . '" ' . | ||
116 | 'tag3="content3" tag2="content2" property="og:description">'; | ||
117 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
118 | |||
119 | // OpenGraph multiple properties start | ||
120 | $html = '<meta property="unrelated og:description" content="' . $description . '">'; | ||
121 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
122 | |||
123 | // OpenGraph multiple properties end | ||
124 | $html = '<meta property="og:description unrelated" content="' . $description . '">'; | ||
125 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
126 | |||
127 | // OpenGraph multiple properties both ends | ||
128 | $html = '<meta property="og:unrelated1 og:description og:unrelated2" content="' . $description . '">'; | ||
129 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
130 | |||
131 | // OpenGraph multiple properties both ends with noise | ||
132 | $html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '. | ||
133 | 'tag2="content2" content="' . $description . '" tag3="content3">'; | ||
134 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
135 | |||
136 | // OpenGraph reversed multiple properties start | ||
137 | $html = '<meta content="' . $description . '" property="unrelated og:description">'; | ||
138 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
139 | |||
140 | // OpenGraph reversed multiple properties end | ||
141 | $html = '<meta content="' . $description . '" property="og:description unrelated">'; | ||
142 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
143 | |||
144 | // OpenGraph reversed multiple properties both ends | ||
145 | $html = '<meta content="' . $description . '" property="og:unrelated1 og:description og:unrelated2">'; | ||
146 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
147 | |||
148 | // OpenGraph reversed multiple properties both ends with noise | ||
149 | $html = '<meta tag1="content1" content="' . $description . '" tag2="content2" '. | ||
150 | 'property="og:unrelated1 og:description og:unrelated2" tag3="content3">'; | ||
151 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
152 | |||
153 | // Suggestion from #1375 | ||
154 | $html = '<meta property="og:description" name="description" content="' . $description . '">'; | ||
155 | $this->assertEquals($description, html_extract_tag('description', $html)); | ||
86 | } | 156 | } |
87 | 157 | ||
88 | /** | 158 | /** |
@@ -92,6 +162,25 @@ class LinkUtilsTest extends TestCase | |||
92 | { | 162 | { |
93 | $html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>'; | 163 | $html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>'; |
94 | $this->assertFalse(html_extract_tag('description', $html)); | 164 | $this->assertFalse(html_extract_tag('description', $html)); |
165 | |||
166 | // Partial meta tag | ||
167 | $html = '<meta content="Brief description">'; | ||
168 | $this->assertFalse(html_extract_tag('description', $html)); | ||
169 | |||
170 | $html = '<meta property="og:description">'; | ||
171 | $this->assertFalse(html_extract_tag('description', $html)); | ||
172 | |||
173 | $html = '<meta tag1="content1" property="og:description">'; | ||
174 | $this->assertFalse(html_extract_tag('description', $html)); | ||
175 | |||
176 | $html = '<meta property="og:description" tag1="content1">'; | ||
177 | $this->assertFalse(html_extract_tag('description', $html)); | ||
178 | |||
179 | $html = '<meta tag1="content1" content="Brief description">'; | ||
180 | $this->assertFalse(html_extract_tag('description', $html)); | ||
181 | |||
182 | $html = '<meta content="Brief description" tag1="content1">'; | ||
183 | $this->assertFalse(html_extract_tag('description', $html)); | ||
95 | } | 184 | } |
96 | 185 | ||
97 | /** | 186 | /** |