From: ArthurHoaro Date: Sun, 8 Nov 2020 13:06:38 +0000 (+0100) Subject: Merge pull request #1631 from ArthurHoaro/fix/html-extract-quote-fix X-Git-Tag: v0.12.1^2~9 X-Git-Url: https://git.immae.eu/?a=commitdiff_plain;h=8d8fa898ab09da87d6c5b333ce6c5b7e8d1506d0;hp=8c5f6c786d00310b2e863aa316927effb7bfeedb;p=github%2Fshaarli%2FShaarli.git Merge pull request #1631 from ArthurHoaro/fix/html-extract-quote-fix Fix an issue truncating extracted metadata content --- diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 17c37979..a74fda57 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -68,16 +68,16 @@ function html_extract_tag($tag, $html) $properties = implode('|', $propertiesKey); // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; - // Try to retrieve OpenGraph image. - $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; + // Try to retrieve OpenGraph tag. + $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=(["\'])([^\1]*?)\1.*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; + $ogRegexReverse = '#]+content=(["\'])([^\1]*?)\1[^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; if (preg_match($ogRegex, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0 ) { - return $matches[1]; + return $matches[2]; } return false; diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 3321242f..9bddf84b 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -168,6 +168,36 @@ class LinkUtilsTest extends TestCase $this->assertEquals($description, html_extract_tag('description', $html)); } + /** + * Test html_extract_tag() with double quoted content containing single quote, and the opposite. + */ + public function testHtmlExtractExistentNameTagWithMixedQuotes(): void + { + $description = 'Bob and Alice share M&M\'s.'; + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $description = 'Bob and Alice share "cookies".'; + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + } + /** * Test html_extract_tag() when the tag