Merge pull request #1313 from ArthurHoaro/feature/desc-retrieval

Automatically retrieve description for new bookmarks
author: ArthurHoaro <arthur@hoa.ro> 2019-07-06 12:34:02 +0200
committer: GitHub <noreply@github.com> 2019-07-06 12:34:02 +0200
commit: c03c90a13e1356ca9cf40cc664547c49305cb24b (patch)
tree: bf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark/LinkUtils.php
parent: 5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff)
parent: 6a4872520cbbc012b5a8358cd50c78844afe8d07 (diff)
download: Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.gz
Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.zst
Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.zip
1 files changed, 81 insertions, 4 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 35a5b290..77eb2d95 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB;
 *
 * @param string $charset     to extract from the downloaded page (reference)
 * @param string $title       to extract from the downloaded page (reference)
+ * @param string $description to extract from the downloaded page (reference)
+ * @param string $keywords    to extract from the downloaded page (reference)
+ * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 * @param string $curlGetInfo Optionally overrides curl_getinfo function
 *
 * @return Closure
 */
-function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
+function get_curl_download_callback(
-{
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription,
+    $curlGetInfo = 'curl_getinfo'
+) {
    $isRedirected = false;
+    $currentChunk = 0;
+    $foundChunk = null;
    /**
     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
     *
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
     *
     * @return int|bool length of $data or false if we need to stop the download
     */
-    return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) {
+    return function (&$ch, $data) use (
+        $retrieveDescription,
+        $curlGetInfo,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$isRedirected,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $currentChunk++;
        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
        if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
            $isRedirected = true;
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
        }
        if (empty($title)) {
            $title = html_extract_title($data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($description)) {
+            $description = html_extract_tag('description', $data);
+            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
        }
+        if ($retrieveDescription && empty($keywords)) {
+            $keywords = html_extract_tag('keywords', $data);
+            if (! empty($keywords)) {
+                $foundChunk = $currentChunk;
+                // Keywords use the format tag1, tag2 multiple words, tag
+                // So we format them to match Shaarli's separator and glue multiple words with '-'
+                $keywords = implode(' ', array_map(function($keyword) {
+                    return implode('-', preg_split('/\s+/', trim($keyword)));
+                }, explode(',', $keywords)));
+            }
+        }
        // We got everything we want, stop the download.
-        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+        // If we already found either the title, description or keywords,
+        // it's highly unlikely that we'll find the other metas further than
+        // in the same chunk of data or the next one. So we also stop the download after that.
+        if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
+            && (! $retrieveDescription
+                || $foundChunk < $currentChunk
+                || (!empty($title) && !empty($description) && !empty($keywords))
+            )
+        ) {
            return false;
        }
@@ -111,6 +159,35 @@ function html_extract_charset($html)
 }
 /**
+ * Extract meta tag from HTML content in either:
+ *   - OpenGraph: <meta property="og:[tag]" ...>
+ *   - Meta tag: <meta name="[tag]" ...>
+ *
+ * @param string $tag  Name of the tag to retrieve.
+ * @param string $html HTML content where to look for the meta tag.
+ *
+ * @return bool|string Content of the meta tag if found, false otherwise.
+ */
+function html_extract_tag($tag, $html)
+{
+    $propertiesKey = ['property', 'name', 'itemprop'];
+    $properties = implode('|', $propertiesKey);
+    // Try to retrieve the meta tag when attributes are in the order property => content.
+    $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
+    // If the attributes are not in the order property => content (e.g. Github)
+    // New regex to keep this readable... more or less.
+    $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
+    if (preg_match($ogRegex, $html, $matches) > 0
+        || preg_match($ogRegexReverse, $html, $matches) > 0
+    ) {
+        return $matches[1];
+    }
+    return false;
+}
+/**
 * Count private links in given linklist.
 *
 * @param array|Countable $links Linklist.
author	ArthurHoaro <arthur@hoa.ro>	2019-07-06 12:34:02 +0200
committer	GitHub <noreply@github.com>	2019-07-06 12:34:02 +0200
commit	c03c90a13e1356ca9cf40cc664547c49305cb24b (patch)
tree	bf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark/LinkUtils.php
parent	5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff)
parent	6a4872520cbbc012b5a8358cd50c78844afe8d07 (diff)
download	Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.gz Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.zst Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.zip

diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 35a5b290..77eb2d95 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB;
7	*	7	*
8	* @param string $charset to extract from the downloaded page (reference)	8	* @param string $charset to extract from the downloaded page (reference)
9	* @param string $title to extract from the downloaded page (reference)	9	* @param string $title to extract from the downloaded page (reference)
		10	* @param string $description to extract from the downloaded page (reference)
		11	* @param string $keywords to extract from the downloaded page (reference)
		12	* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
10	* @param string $curlGetInfo Optionally overrides curl_getinfo function	13	* @param string $curlGetInfo Optionally overrides curl_getinfo function
11	*	14	*
12	* @return Closure	15	* @return Closure
13	*/	16	*/
14	function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')	17	function get_curl_download_callback(
15	{	18	&$charset,
		19	&$title,
		20	&$description,
		21	&$keywords,
		22	$retrieveDescription,
		23	$curlGetInfo = 'curl_getinfo'
		24	) {
16	$isRedirected = false;	25	$isRedirected = false;
		26	$currentChunk = 0;
		27	$foundChunk = null;
		28
17	/**	29	/**
18	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).	30	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
19	*	31	*
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
25	*	37	*
26	* @return int\|bool length of $data or false if we need to stop the download	38	* @return int\|bool length of $data or false if we need to stop the download
27	*/	39	*/
28	return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) {	40	return function (&$ch, $data) use (
		41	$retrieveDescription,
		42	$curlGetInfo,
		43	&$charset,
		44	&$title,
		45	&$description,
		46	&$keywords,
		47	&$isRedirected,
		48	&$currentChunk,
		49	&$foundChunk
		50	) {
		51	$currentChunk++;
29	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);	52	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
30	if (!empty($responseCode) && in_array($responseCode, [301, 302])) {	53	if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
31	$isRedirected = true;	54	$isRedirected = true;
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
50	}	73	}
51	if (empty($title)) {	74	if (empty($title)) {
52	$title = html_extract_title($data);	75	$title = html_extract_title($data);
		76	$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
		77	}
		78	if ($retrieveDescription && empty($description)) {
		79	$description = html_extract_tag('description', $data);
		80	$foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
53	}	81	}
		82	if ($retrieveDescription && empty($keywords)) {
		83	$keywords = html_extract_tag('keywords', $data);
		84	if (! empty($keywords)) {
		85	$foundChunk = $currentChunk;
		86	// Keywords use the format tag1, tag2 multiple words, tag
		87	// So we format them to match Shaarli's separator and glue multiple words with '-'
		88	$keywords = implode(' ', array_map(function($keyword) {
		89	return implode('-', preg_split('/\s+/', trim($keyword)));
		90	}, explode(',', $keywords)));
		91	}
		92	}
		93
54	// We got everything we want, stop the download.	94	// We got everything we want, stop the download.
55	if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {	95	// If we already found either the title, description or keywords,
		96	// it's highly unlikely that we'll find the other metas further than
		97	// in the same chunk of data or the next one. So we also stop the download after that.
		98	if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
		99	&& (! $retrieveDescription
		100	\|\| $foundChunk < $currentChunk
		101	\|\| (!empty($title) && !empty($description) && !empty($keywords))
		102	)
		103	) {
56	return false;	104	return false;
57	}	105	}
58		106
@@ -111,6 +159,35 @@ function html_extract_charset($html)
111	}	159	}
112		160
113	/**	161	/**
		162	* Extract meta tag from HTML content in either:
		163	* - OpenGraph: <meta property="og:[tag]" ...>
		164	* - Meta tag: <meta name="[tag]" ...>
		165	*
		166	* @param string $tag Name of the tag to retrieve.
		167	* @param string $html HTML content where to look for the meta tag.
		168	*
		169	* @return bool\|string Content of the meta tag if found, false otherwise.
		170	*/
		171	function html_extract_tag($tag, $html)
		172	{
		173	$propertiesKey = ['property', 'name', 'itemprop'];
		174	$properties = implode('\|', $propertiesKey);
		175	// Try to retrieve the meta tag when attributes are in the order property => content.
		176	$ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
		177	// If the attributes are not in the order property => content (e.g. Github)
		178	// New regex to keep this readable... more or less.
		179	$ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
		180
		181	if (preg_match($ogRegex, $html, $matches) > 0
		182	\|\| preg_match($ogRegexReverse, $html, $matches) > 0
		183	) {
		184	return $matches[1];
		185	}
		186
		187	return false;
		188	}
		189
		190	/**
114	* Count private links in given linklist.	191	* Count private links in given linklist.
115	*	192	*
116	* @param array\|Countable $links Linklist.	193	* @param array\|Countable $links Linklist.