about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author ArthurHoaro <arthur@hoa.ro> 2020-11-08 13:54:39 +0100
committer ArthurHoaro <arthur@hoa.ro> 2020-11-08 13:54:39 +0100
commit 00d3dd91ef42df13eeafbcc54dcebe3238e322c6 (patch)
tree 123066f497546ad181c96ef2bdd1fde011457807
parent 8c5f6c786d00310b2e863aa316927effb7bfeedb (diff)
download Shaarli-00d3dd91ef42df13eeafbcc54dcebe3238e322c6.tar.gz
Shaarli-00d3dd91ef42df13eeafbcc54dcebe3238e322c6.tar.zst
Shaarli-00d3dd91ef42df13eeafbcc54dcebe3238e322c6.zip
Fix an issue truncating extracted metadata content
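The previous regex forced the capture to stop at the first single or double quote found, regardless of which quote opened the attribute value. By capturing the opening quote and requiring the same quote again through the '\1' backreference, the capture now only stops at the matching closing quote. (Note: it is the closing '\1' combined with the lazy quantifier that does the work; inside the character class '[^\1]', PCRE reads '\1' as an octal character escape, not as a backreference.)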
-rw-r--r-- application/bookmark/LinkUtils.php 8
-rw-r--r-- tests/bookmark/LinkUtilsTest.php 30
2 files changed, 34 insertions, 4 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 17c37979..a74fda57 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -68,16 +68,16 @@ function html_extract_tag($tag, $html)
68 $properties = implode('|', $propertiesKey); 68 $properties = implode('|', $propertiesKey);
69 // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' 69 // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
70 $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; 70 $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
71 // Try to retrieve OpenGraph image. 71 // Try to retrieve OpenGraph tag.
72 $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; 72 $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=(["\'])([^\1]*?)\1.*?>#';
73 // If the attributes are not in the order property => content (e.g. Github) 73 // If the attributes are not in the order property => content (e.g. Github)
74 // New regex to keep this readable... more or less. 74 // New regex to keep this readable... more or less.
75 $ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; 75 $ogRegexReverse = '#<meta[^>]+content=(["\'])([^\1]*?)\1[^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
76 76
77 if (preg_match($ogRegex, $html, $matches) > 0 77 if (preg_match($ogRegex, $html, $matches) > 0
78 || preg_match($ogRegexReverse, $html, $matches) > 0 78 || preg_match($ogRegexReverse, $html, $matches) > 0
79 ) { 79 ) {
80 return $matches[1]; 80 return $matches[2];
81 } 81 }
82 82
83 return false; 83 return false;
diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php
index 3321242f..9bddf84b 100644
--- a/tests/bookmark/LinkUtilsTest.php
+++ b/tests/bookmark/LinkUtilsTest.php
@@ -169,6 +169,36 @@ class LinkUtilsTest extends TestCase
169 } 169 }
170 170
171 /** 171 /**
172 * Test html_extract_tag() with double quoted content containing single quote, and the opposite.
173 */
174 public function testHtmlExtractExistentNameTagWithMixedQuotes(): void
175 {
176 $description = 'Bob and Alice share M&M\'s.';
177
178 $html = '<meta property="og:description" content="' . $description . '">';
179 $this->assertEquals($description, html_extract_tag('description', $html));
180
181 $html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
182 'tag2="content2" content="' . $description . '" tag3="content3">';
183 $this->assertEquals($description, html_extract_tag('description', $html));
184
185 $html = '<meta property="og:description" name="description" content="' . $description . '">';
186 $this->assertEquals($description, html_extract_tag('description', $html));
187
188 $description = 'Bob and Alice share "cookies".';
189
190 $html = '<meta property="og:description" content=\'' . $description . '\'>';
191 $this->assertEquals($description, html_extract_tag('description', $html));
192
193 $html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" '.
194 'tag2="content2" content=\'' . $description . '\' tag3="content3">';
195 $this->assertEquals($description, html_extract_tag('description', $html));
196
197 $html = '<meta property="og:description" name="description" content=\'' . $description . '\'>';
198 $this->assertEquals($description, html_extract_tag('description', $html));
199 }
200
201 /**
172 * Test html_extract_tag() when the tag <meta name= is not found. 202 * Test html_extract_tag() when the tag <meta name= is not found.
173 */ 203 */
174 public function testHtmlExtractNonExistentNameTag() 204 public function testHtmlExtractNonExistentNameTag()