From 2cd0509b503332b1989f06da45d569d4d2929be5 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 3 Sep 2020 17:46:26 +0200 Subject: Improve regex to extract HTML metadata (title, description, etc.) Also added a bunch of tests to cover more use cases. Fixes #1375 --- tests/bookmark/LinkUtilsTest.php | 89 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) (limited to 'tests/bookmark/LinkUtilsTest.php') diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 7d4a7b89..cc7819bc 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -81,8 +81,78 @@ class LinkUtilsTest extends TestCase public function testHtmlExtractExistentNameTag() { $description = 'Bob and Alice share cookies.'; + + // Simple one line $html = 'stuff2'; $this->assertEquals($description, html_extract_tag('description', $html)); + + // Simple OpenGraph + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // Simple reversed OpenGraph + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // ItemProp OpenGraph + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph without quotes + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed without quotes + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties start + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties both end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph multiple properties both end with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties start + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties both end + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // OpenGraph reversed multiple properties both end with noise + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + // Suggestion from #1375 + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); } /** @@ -92,6 +162,25 @@ class LinkUtilsTest extends TestCase { $html = 'stuff2'; $this->assertFalse(html_extract_tag('description', $html)); + + // Partial meta tag + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); + + $html = ''; + $this->assertFalse(html_extract_tag('description', $html)); } /** -- cgit v1.2.3 From 5334090be04e66da5cb5c3ad487604b3733c5cac Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 15 Oct 2020 11:20:33 +0200 Subject: Improve metadata retrieval (performances and accuracy) - Use dedicated function to download headers to avoid apply multiple regexps on headers - Also try to extract title from meta tags --- tests/bookmark/LinkUtilsTest.php | 223 +++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 115 deletions(-) (limited to 'tests/bookmark/LinkUtilsTest.php') diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 29941c8c..3321242f 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -215,61 +215,92 @@ class LinkUtilsTest extends TestCase $this->assertFalse(html_extract_tag('description', $html)); } + /** + * Test the header callback with valid value + */ + public function testCurlHeaderCallbackOk(): void + { + $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_ok'); + $data = [ + 'HTTP/1.1 200 OK', + 'Server: GitHub.com', + 'Date: Sat, 28 Oct 2017 12:01:33 GMT', + 'Content-Type: text/html; charset=utf-8', + 'Status: 200 OK', + ]; + + foreach ($data as $chunk) { + static::assertIsInt($callback(null, $chunk)); + } + + static::assertSame('utf-8', $charset); + } + /** * Test the download callback with valid value */ - public function testCurlDownloadCallbackOk() + public function testCurlDownloadCallbackOk(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_ok' + false ); + $data = [ - 'HTTP/1.1 200 OK', - 'Server: GitHub.com', - 'Date: Sat, 28 Oct 2017 12:01:33 GMT', - 'Content-Type: text/html; charset=utf-8', - 'Status: 200 OK', - 'end' => 'th=device-width">' + 'th=device-width">' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } - $this->assertEquals('utf-8', $charset); - $this->assertEquals('Refactoring · GitHub', $title); - $this->assertEmpty($desc); - $this->assertEmpty($keywords); + + static::assertSame('utf-8', $charset); + static::assertSame('Refactoring · GitHub', $title); + static::assertEmpty($desc); + static::assertEmpty($keywords); + } + + /** + * Test the header callback with valid value + */ + public function testCurlHeaderCallbackNoCharset(): void + { + $callback = get_curl_header_callback($charset, 'ut_curl_getinfo_no_charset'); + $data = [ + 'HTTP/1.1 200 OK', + ]; + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); + } + + static::assertFalse($charset); } /** * Test the download callback with valid values and no charset */ - public function testCurlDownloadCallbackOkNoCharset() + public function testCurlDownloadCallbackOkNoCharset(): void { + $charset = null; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_no_charset' + false ); + $data = [ - 'HTTP/1.1 200 OK', 'end' => 'th=device-width">' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $this->assertEquals(strlen($line), $callback($ignore, $line)); + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEmpty($charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEmpty($desc); @@ -290,18 +322,18 @@ class LinkUtilsTest extends TestCase /** * Test the download callback with valid values and no charset */ - public function testCurlDownloadCallbackOkHtmlCharset() + public function testCurlDownloadCallbackOkHtmlCharset(): void { + $charset = null; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_no_charset' + false ); + $data = [ - 'HTTP/1.1 200 OK', '', 'end' => 'th=device-width">' . 'Refactoring · GitHub' @@ -310,14 +342,10 @@ class LinkUtilsTest extends TestCase . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEquals('utf-8', $charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEmpty($desc); @@ -327,25 +355,26 @@ class LinkUtilsTest extends TestCase /** * Test the download callback with valid values and no title */ - public function testCurlDownloadCallbackOkNoTitle() + public function testCurlDownloadCallbackOkNoTitle(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, $desc, $keywords, - false, - 'ut_curl_getinfo_ok' + false ); + $data = [ - 'HTTP/1.1 200 OK', 'end' => 'th=device-width">Refactoring · GitHub' . 'Refactoring · GitHub' . '' . '', ]; - foreach ($data as $key => $line) { - $ignore = null; - $expected = $key !== 'end' ? strlen($line) : false; - $this->assertEquals($expected, $callback($ignore, $line)); - if ($expected === false) { - break; - } + + foreach ($data as $chunk) { + static::assertSame(strlen($chunk), $callback(null, $chunk)); } + $this->assertEquals('utf-8', $charset); $this->assertEquals('Refactoring · GitHub', $title); $this->assertEquals('link desc', $desc); @@ -453,8 +453,9 @@ class LinkUtilsTest extends TestCase * Test the download callback with valid value, and retrieve_description option enabled, * but no desc or keyword defined in the page. */ - public function testCurlDownloadCallbackOkWithDescNotFound() + public function testCurlDownloadCallbackOkWithDescNotFound(): void { + $charset = 'utf-8'; $callback = get_curl_download_callback( $charset, $title, @@ -464,24 +465,16 @@ class LinkUtilsTest extends TestCase 'ut_curl_getinfo_ok' ); $data = [ - 'HTTP/1.1 200 OK', - 'Server: GitHub.com', - 'Date: Sat, 28 Oct 2017 12:01:33 GMT', - 'Content-Type: text/html; charset=utf-8', - 'Status: 200 OK', 'th=device-width">' . 'Refactoring · GitHub' . '' @@ -574,6 +579,115 @@ class LinkUtilsTest extends TestCase $this->assertFalse(is_note('https://github.com/shaarli/Shaarli/?hi')); } + /** + * Test tags_str2array with whitespace separator. + */ + public function testTagsStr2ArrayWithSpaceSeparator(): void + { + $separator = ' '; + + static::assertSame(['tag1', 'tag2', 'tag3'], tags_str2array('tag1 tag2 tag3', $separator)); + static::assertSame(['tag1', 'tag2', 'tag3'], tags_str2array('tag1 tag2 tag3', $separator)); + static::assertSame(['tag1', 'tag2', 'tag3'], tags_str2array(' tag1 tag2 tag3 ', $separator)); + static::assertSame(['tag1@', 'tag2,', '.tag3'], tags_str2array(' tag1@ tag2, .tag3 ', $separator)); + static::assertSame([], tags_str2array('', $separator)); + static::assertSame([], tags_str2array(' ', $separator)); + static::assertSame([], tags_str2array(null, $separator)); + } + + /** + * Test tags_str2array with @ separator. + */ + public function testTagsStr2ArrayWithCharSeparator(): void + { + $separator = '@'; + + static::assertSame(['tag1', 'tag2', 'tag3'], tags_str2array('tag1@tag2@tag3', $separator)); + static::assertSame(['tag1', 'tag2', 'tag3'], tags_str2array('tag1@@@@tag2@@@@tag3', $separator)); + static::assertSame(['tag1', 'tag2', 'tag3'], tags_str2array('@@@tag1@@@tag2@@@@tag3@@', $separator)); + static::assertSame( + ['tag1#', 'tag2, and other', '.tag3'], + tags_str2array('@@@ tag1# @@@ tag2, and other @@@@.tag3@@', $separator) + ); + static::assertSame([], tags_str2array('', $separator)); + static::assertSame([], tags_str2array(' ', $separator)); + static::assertSame([], tags_str2array(null, $separator)); + } + + /** + * Test tags_array2str with ' ' separator. + */ + public function testTagsArray2StrWithSpaceSeparator(): void + { + $separator = ' '; + + static::assertSame('tag1 tag2 tag3', tags_array2str(['tag1', 'tag2', 'tag3'], $separator)); + static::assertSame('tag1, tag2@ tag3', tags_array2str(['tag1,', 'tag2@', 'tag3'], $separator)); + static::assertSame('tag1 tag2 tag3', tags_array2str([' tag1 ', 'tag2', 'tag3 '], $separator)); + static::assertSame('tag1 tag2 tag3', tags_array2str([' tag1 ', ' ', 'tag2', ' ', 'tag3 '], $separator)); + static::assertSame('tag1', tags_array2str([' tag1 '], $separator)); + static::assertSame('', tags_array2str([' '], $separator)); + static::assertSame('', tags_array2str([], $separator)); + static::assertSame('', tags_array2str(null, $separator)); + } + + /** + * Test tags_array2str with @ separator. + */ + public function testTagsArray2StrWithCharSeparator(): void + { + $separator = '@'; + + static::assertSame('tag1@tag2@tag3', tags_array2str(['tag1', 'tag2', 'tag3'], $separator)); + static::assertSame('tag1,@tag2@tag3', tags_array2str(['tag1,', 'tag2@', 'tag3'], $separator)); + static::assertSame( + 'tag1@tag2, and other@tag3', + tags_array2str(['@@@@ tag1@@@', ' @tag2, and other @', 'tag3@@@@'], $separator) + ); + static::assertSame('tag1@tag2@tag3', tags_array2str(['@@@tag1@@@', '@', 'tag2', '@@@', 'tag3@@@'], $separator)); + static::assertSame('tag1', tags_array2str(['@@@@tag1@@@@'], $separator)); + static::assertSame('', tags_array2str(['@@@'], $separator)); + static::assertSame('', tags_array2str([], $separator)); + static::assertSame('', tags_array2str(null, $separator)); + } + + /** + * Test tags_array2str with @ separator. + */ + public function testTagsFilterWithSpaceSeparator(): void + { + $separator = ' '; + + static::assertSame(['tag1', 'tag2', 'tag3'], tags_filter(['tag1', 'tag2', 'tag3'], $separator)); + static::assertSame(['tag1,', 'tag2@', 'tag3'], tags_filter(['tag1,', 'tag2@', 'tag3'], $separator)); + static::assertSame(['tag1', 'tag2', 'tag3'], tags_filter([' tag1 ', 'tag2', 'tag3 '], $separator)); + static::assertSame(['tag1', 'tag2', 'tag3'], tags_filter([' tag1 ', ' ', 'tag2', ' ', 'tag3 '], $separator)); + static::assertSame(['tag1'], tags_filter([' tag1 '], $separator)); + static::assertSame([], tags_filter([' '], $separator)); + static::assertSame([], tags_filter([], $separator)); + static::assertSame([], tags_filter(null, $separator)); + } + + /** + * Test tags_array2str with @ separator. + */ + public function testTagsArrayFilterWithSpaceSeparator(): void + { + $separator = '@'; + + static::assertSame(['tag1', 'tag2', 'tag3'], tags_filter(['tag1', 'tag2', 'tag3'], $separator)); + static::assertSame(['tag1,', 'tag2#', 'tag3'], tags_filter(['tag1,', 'tag2#', 'tag3'], $separator)); + static::assertSame( + ['tag1', 'tag2, and other', 'tag3'], + tags_filter(['@@@@ tag1@@@', ' @tag2, and other @', 'tag3@@@@'], $separator) + ); + static::assertSame(['tag1', 'tag2', 'tag3'], tags_filter(['@@@tag1@@@', '@', 'tag2', '@@@', 'tag3@@@'], $separator)); + static::assertSame(['tag1'], tags_filter(['@@@@tag1@@@@'], $separator)); + static::assertSame([], tags_filter(['@@@'], $separator)); + static::assertSame([], tags_filter([], $separator)); + static::assertSame([], tags_filter(null, $separator)); + } + /** * Util function to build an hashtag link. * -- cgit v1.2.3 From 00d3dd91ef42df13eeafbcc54dcebe3238e322c6 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Sun, 8 Nov 2020 13:54:39 +0100 Subject: Fix an issue truncating extracted metadata content Previous regex forced the selection to stop at either the first single or double quote found, regardless of the opening quote. Using '\1', we're sure to wait for the proper quote before stopping the capture. --- tests/bookmark/LinkUtilsTest.php | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'tests/bookmark/LinkUtilsTest.php') diff --git a/tests/bookmark/LinkUtilsTest.php b/tests/bookmark/LinkUtilsTest.php index 3321242f..9bddf84b 100644 --- a/tests/bookmark/LinkUtilsTest.php +++ b/tests/bookmark/LinkUtilsTest.php @@ -168,6 +168,36 @@ class LinkUtilsTest extends TestCase $this->assertEquals($description, html_extract_tag('description', $html)); } + /** + * Test html_extract_tag() with double quoted content containing single quote, and the opposite. + */ + public function testHtmlExtractExistentNameTagWithMixedQuotes(): void + { + $description = 'Bob and Alice share M&M\'s.'; + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $description = 'Bob and Alice share "cookies".'; + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + + $html = ''; + $this->assertEquals($description, html_extract_tag('description', $html)); + } + /** * Test html_extract_tag() when the tag