Automatically retrieve description for new bookmarks

If the option is enabled, Shaarli will try to find meta tags containing the page description and keywords, just like it does for the page title. It looks for either regular meta tags or OpenGraph ones. The option is disabled by default. Note that the keywords meta tag is mostly unused. In the `configure` template, the variable associated with this setting is `$retrieve_description`. Fixes #1302
author: ArthurHoaro <arthur@hoa.ro> 2019-06-08 13:59:19 +0200
committer: ArthurHoaro <arthur@hoa.ro> 2019-07-06 12:21:52 +0200
commit: 6a4872520cbbc012b5a8358cd50c78844afe8d07 (patch)
tree: bf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark
parent: 5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff)
download: Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.gz
Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.zst
Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.zip
1 files changed, 81 insertions, 4 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 35a5b290..77eb2d95 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB;
 *
 * @param string $charset     to extract from the downloaded page (reference)
 * @param string $title       to extract from the downloaded page (reference)
+ * @param string $description to extract from the downloaded page (reference)
+ * @param string $keywords    to extract from the downloaded page (reference)
+ * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 * @param string $curlGetInfo Optionally overrides curl_getinfo function
 *
 * @return Closure
 */
-function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
+function get_curl_download_callback(
-{
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription,
+    $curlGetInfo = 'curl_getinfo'
+) {
    $isRedirected = false;
+    $currentChunk = 0;
+    $foundChunk = null;
    /**
     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
     *
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
     *
     * @return int|bool length of $data or false if we need to stop the download
     */
-    return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) {
+    return function (&$ch, $data) use (
+        $retrieveDescription,
+        $curlGetInfo,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$isRedirected,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $currentChunk++;
        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
        if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
            $isRedirected = true;
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
        }
        if (empty($title)) {
            $title = html_extract_title($data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($description)) {
+            $description = html_extract_tag('description', $data);
+            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
        }
+        if ($retrieveDescription && empty($keywords)) {
+            $keywords = html_extract_tag('keywords', $data);
+            if (! empty($keywords)) {
+                $foundChunk = $currentChunk;
+                // Keywords use the format tag1, tag2 multiple words, tag
+                // So we format them to match Shaarli's separator and glue multiple words with '-'
+                $keywords = implode(' ', array_map(function($keyword) {
+                    return implode('-', preg_split('/\s+/', trim($keyword)));
+                }, explode(',', $keywords)));
+            }
+        }
        // We got everything we want, stop the download.
-        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+        // If we already found either the title, description or keywords,
+        // it's highly unlikely that we'll find the other metas further than
+        // in the same chunk of data or the next one. So we also stop the download after that.
+        if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
+            && (! $retrieveDescription
+                || $foundChunk < $currentChunk
+                || (!empty($title) && !empty($description) && !empty($keywords))
+            )
+        ) {
            return false;
        }
@@ -111,6 +159,35 @@ function html_extract_charset($html)
 }
 /**
+ * Extract meta tag from HTML content in either:
+ *   - OpenGraph: <meta property="og:[tag]" ...>
+ *   - Meta tag: <meta name="[tag]" ...>
+ *
+ * @param string $tag  Name of the tag to retrieve.
+ * @param string $html HTML content where to look for the tag.
+ *
+ * @return bool|string Content of the tag if found, false otherwise.
+ */
+function html_extract_tag($tag, $html)
+{
+    $propertiesKey = ['property', 'name', 'itemprop'];
+    $properties = implode('|', $propertiesKey);
+    // Match a meta tag whose property/name/itemprop attribute comes before its content attribute.
+    $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
+    // If the attributes are not in the order property => content (e.g. Github)
+    // New regex to keep this readable... more or less.
+    $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
+    if (preg_match($ogRegex, $html, $matches) > 0
+        || preg_match($ogRegexReverse, $html, $matches) > 0
+    ) {
+        return $matches[1];
+    }
+    return false;
+}
+/**
 * Count private links in given linklist.
 *
 * @param array|Countable $links Linklist.
author	ArthurHoaro <arthur@hoa.ro>	2019-06-08 13:59:19 +0200
committer	ArthurHoaro <arthur@hoa.ro>	2019-07-06 12:21:52 +0200
commit	6a4872520cbbc012b5a8358cd50c78844afe8d07 (patch)
tree	bf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark
parent	5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff)
download	Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.gz Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.zst Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.zip

diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 35a5b290..77eb2d95 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB;
7	*	7	*
8	* @param string $charset to extract from the downloaded page (reference)	8	* @param string $charset to extract from the downloaded page (reference)
9	* @param string $title to extract from the downloaded page (reference)	9	* @param string $title to extract from the downloaded page (reference)
		10	* @param string $description to extract from the downloaded page (reference)
		11	* @param string $keywords to extract from the downloaded page (reference)
		12	* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
10	* @param string $curlGetInfo Optionally overrides curl_getinfo function	13	* @param string $curlGetInfo Optionally overrides curl_getinfo function
11	*	14	*
12	* @return Closure	15	* @return Closure
13	*/	16	*/
14	function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')	17	function get_curl_download_callback(
15	{	18	&$charset,
		19	&$title,
		20	&$description,
		21	&$keywords,
		22	$retrieveDescription,
		23	$curlGetInfo = 'curl_getinfo'
		24	) {
16	$isRedirected = false;	25	$isRedirected = false;
		26	$currentChunk = 0;
		27	$foundChunk = null;
		28
17	/**	29	/**
18	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).	30	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
19	*	31	*
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
25	*	37	*
26	* @return int\|bool length of $data or false if we need to stop the download	38	* @return int\|bool length of $data or false if we need to stop the download
27	*/	39	*/
28	return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) {	40	return function (&$ch, $data) use (
		41	$retrieveDescription,
		42	$curlGetInfo,
		43	&$charset,
		44	&$title,
		45	&$description,
		46	&$keywords,
		47	&$isRedirected,
		48	&$currentChunk,
		49	&$foundChunk
		50	) {
		51	$currentChunk++;
29	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);	52	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
30	if (!empty($responseCode) && in_array($responseCode, [301, 302])) {	53	if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
31	$isRedirected = true;	54	$isRedirected = true;
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
50	}	73	}
51	if (empty($title)) {	74	if (empty($title)) {
52	$title = html_extract_title($data);	75	$title = html_extract_title($data);
		76	$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
		77	}
		78	if ($retrieveDescription && empty($description)) {
		79	$description = html_extract_tag('description', $data);
		80	$foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
53	}	81	}
		82	if ($retrieveDescription && empty($keywords)) {
		83	$keywords = html_extract_tag('keywords', $data);
		84	if (! empty($keywords)) {
		85	$foundChunk = $currentChunk;
		86	// Keywords use the format tag1, tag2 multiple words, tag
		87	// So we format them to match Shaarli's separator and glue multiple words with '-'
		88	$keywords = implode(' ', array_map(function($keyword) {
		89	return implode('-', preg_split('/\s+/', trim($keyword)));
		90	}, explode(',', $keywords)));
		91	}
		92	}
		93
54	// We got everything we want, stop the download.	94	// We got everything we want, stop the download.
55	if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {	95	// If we already found either the title, description or keywords,
		96	// it's highly unlikely that we'll find the other metas further than
		97	// in the same chunk of data or the next one. So we also stop the download after that.
		98	if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
		99	&& (! $retrieveDescription
		100	\|\| $foundChunk < $currentChunk
		101	\|\| (!empty($title) && !empty($description) && !empty($keywords))
		102	)
		103	) {
56	return false;	104	return false;
57	}	105	}
58		106
@@ -111,6 +159,35 @@ function html_extract_charset($html)
111	}	159	}
112		160
113	/**	161	/**
		162	* Extract meta tag from HTML content in either:
		163	* - OpenGraph: <meta property="og:[tag]" ...>
		164	* - Meta tag: <meta name="[tag]" ...>
		165	*
		166	* @param string $tag Name of the tag to retrieve.
		167	* @param string $html HTML content where to look for the tag.
		168	*
		169	* @return bool\|string Content of the tag if found, false otherwise.
		170	*/
		171	function html_extract_tag($tag, $html)
		172	{
		173	$propertiesKey = ['property', 'name', 'itemprop'];
		174	$properties = implode('\|', $propertiesKey);
		175	// Match a meta tag whose property/name/itemprop attribute comes before its content attribute.
		176	$ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
		177	// If the attributes are not in the order property => content (e.g. Github)
		178	// New regex to keep this readable... more or less.
		179	$ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
		180
		181	if (preg_match($ogRegex, $html, $matches) > 0
		182	\|\| preg_match($ogRegexReverse, $html, $matches) > 0
		183	) {
		184	return $matches[1];
		185	}
		186
		187	return false;
		188	}
		189
		190	/**
114	* Count private links in given linklist.	191	* Count private links in given linklist.
115	*	192	*
116	* @param array\|Countable $links Linklist.	193	* @param array\|Countable $links Linklist.