From 2cd0509b503332b1989f06da45d569d4d2929be5 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 3 Sep 2020 17:46:26 +0200 Subject: Improve regex to extract HTML metadata (title, description, etc.) Also added a bunch of tests to cover more use cases. Fixes #1375 --- application/bookmark/LinkUtils.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'application/bookmark/LinkUtils.php') diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 68914fca..03e1b82a 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -66,11 +66,13 @@ function html_extract_tag($tag, $html) { $propertiesKey = ['property', 'name', 'itemprop']; $properties = implode('|', $propertiesKey); + // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' + $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; // Try to retrieve OpenGraph image. - $ogRegex = '#]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; + $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; + $ogRegexReverse = '#]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; if (preg_match($ogRegex, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0 -- cgit v1.2.3 From 740b32b520e6b1723512c6f9b78cef6575b1725b Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Tue, 3 Nov 2020 12:38:38 +0100 Subject: Default formatter: add a setting to disable auto-linkification + update documentation + single parameter for both URL and hashtags Fixes #1094 --- application/bookmark/LinkUtils.php | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'application/bookmark/LinkUtils.php') diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index faf5dbfd..17c37979 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -138,12 +138,17 @@ function space2nbsp($text) * * @param string $description shaare's description. * @param string $indexUrl URL to Shaarli's index. - + * @param bool $autolink Turn on/off automatic linkifications of URLs and hashtags + * * @return string formatted description. */ -function format_description($description, $indexUrl = '') +function format_description($description, $indexUrl = '', $autolink = true) { - return nl2br(space2nbsp(hashtag_autolink(text2clickable($description), $indexUrl))); + if ($autolink) { + $description = hashtag_autolink(text2clickable($description), $indexUrl); + } + + return nl2br(space2nbsp($description)); } /** -- cgit v1.2.3 From b3bd8c3e8d367975980043e772f7cd78b7f96bc6 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 22 Oct 2020 16:21:03 +0200 Subject: Feature: support any tag separator So it allows to have multiple words tags. Breaking change: commas ',' are no longer a default separator. Fixes #594 --- application/bookmark/LinkUtils.php | 46 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'application/bookmark/LinkUtils.php') diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 17c37979..9493b0aa 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -176,3 +176,49 @@ function is_note($linkUrl) { return isset($linkUrl[0]) && $linkUrl[0] === '?'; } + +/** + * Extract an array of tags from a given tag string, with provided separator. + * + * @param string|null $tags String containing a list of tags separated by $separator. + * @param string $separator Shaarli's default: ' ' (whitespace) + * + * @return array List of tags + */ +function tags_str2array(?string $tags, string $separator): array +{ + // For whitespaces, we use the special \s regex character + $separator = $separator === ' ' ? '\s' : $separator; + + return preg_split('/\s*' . $separator . '+\s*/', trim($tags) ?? '', -1, PREG_SPLIT_NO_EMPTY); +} + +/** + * Return a tag string with provided separator from a list of tags. + * Note that given array is clean up by tags_filter(). + * + * @param array|null $tags List of tags + * @param string $separator + * + * @return string + */ +function tags_array2str(?array $tags, string $separator): string +{ + return implode($separator, tags_filter($tags, $separator)); +} + +/** + * Clean an array of tags: trim + remove empty entries + * + * @param array|null $tags List of tags + * @param string $separator + * + * @return array + */ +function tags_filter(?array $tags, string $separator): array +{ + $trimDefault = " \t\n\r\0\x0B"; + return array_values(array_filter(array_map(function (string $entry) use ($separator, $trimDefault): string { + return trim($entry, $trimDefault . $separator); + }, $tags ?? []))); +} -- cgit v1.2.3 From 00d3dd91ef42df13eeafbcc54dcebe3238e322c6 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Sun, 8 Nov 2020 13:54:39 +0100 Subject: Fix an issue truncating extracted metadata content Previous regex forced the selection to stop at either the first single or double quote found, regardless of the opening quote. Using '\1', we're sure to wait for the proper quote before stopping the capture. --- application/bookmark/LinkUtils.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'application/bookmark/LinkUtils.php') diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 17c37979..a74fda57 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -68,16 +68,16 @@ function html_extract_tag($tag, $html) $properties = implode('|', $propertiesKey); // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; - // Try to retrieve OpenGraph image. - $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#'; + // Try to retrieve OpenGraph tag. + $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=(["\'])([^\1]*?)\1.*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; + $ogRegexReverse = '#]+content=(["\'])([^\1]*?)\1[^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; if (preg_match($ogRegex, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0 ) { - return $matches[1]; + return $matches[2]; } return false; -- cgit v1.2.3 From 53054b2bf6a919fd4ff9b44b6ad1986f21f488b6 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Tue, 22 Sep 2020 20:25:47 +0200 Subject: Apply PHP Code Beautifier on source code for linter automatic fixes --- application/bookmark/LinkUtils.php | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'application/bookmark/LinkUtils.php') diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index cf97e3b0..d65e97ed 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php @@ -67,14 +67,15 @@ function html_extract_tag($tag, $html) $propertiesKey = ['property', 'name', 'itemprop']; $properties = implode('|', $propertiesKey); // We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"' - $orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; + $orCondition = '["\']?(?:og:)?' . $tag . '["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]'; // Try to retrieve OpenGraph tag. - $ogRegex = '#]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=(["\'])([^\1]*?)\1.*?>#'; + $ogRegex = '#]+(?:' . $properties . ')=(?:' . $orCondition . ')[^>]*content=(["\'])([^\1]*?)\1.*?>#'; // If the attributes are not in the order property => content (e.g. Github) // New regex to keep this readable... more or less. - $ogRegexReverse = '#]+content=(["\'])([^\1]*?)\1[^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#'; + $ogRegexReverse = '#]+content=(["\'])([^\1]*?)\1[^>]+(?:' . $properties . ')=(?:' . $orCondition . ').*?>#'; - if (preg_match($ogRegex, $html, $matches) > 0 + if ( + preg_match($ogRegex, $html, $matches) > 0 || preg_match($ogRegexReverse, $html, $matches) > 0 ) { return $matches[2]; @@ -116,7 +117,7 @@ function hashtag_autolink($description, $indexUrl = '') * \p{Mn} - any non marking space (accents, umlauts, etc) */ $regex = '/(^|\s)#([\p{Pc}\p{N}\p{L}\p{Mn}]+)/mui'; - $replacement = '$1#$2'; + $replacement = '$1#$2'; return preg_replace($regex, $replacement, $description); } -- cgit v1.2.3