3 files changed, 286 insertions, 26 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php
new file mode 100644
index 00000000..646a5264
--- /dev/null
+++ b/application/http/HttpAccess.php
@@ -0,0 +1,47 @@
+<?php
+declare(strict_types=1);
+namespace Shaarli\Http;
+/**
+ * Class HttpAccess
+ *
+ * This is mostly an OOP wrapper for HTTP functions defined in `HttpUtils`.
+ * It is used as dependency injection in Shaarli's container.
+ *
+ * @package Shaarli\Http
+ */
+class HttpAccess
+{
+    /**
+     * Delegates to the global `get_http_response()` function, forwarding every argument unchanged.
+     *
+     * @param string          $url                URL to get (http://...)
+     * @param int             $timeout            network timeout (in seconds)
+     * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
+     * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
+     *                                            (CURLOPT_HEADERFUNCTION)
+     * @param callable|string $curlWriteFunction  Optional callback called during the download
+     *                                            (CURLOPT_WRITEFUNCTION)
+     *
+     * @return array HTTP response headers, downloaded content (as returned by `get_http_response()`)
+     */
+    public function getHttpResponse(
+        $url,
+        $timeout = 30,
+        $maxBytes = 4194304,
+        $curlHeaderFunction = null,
+        $curlWriteFunction = null
+    ) {
+        return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);
+    }
+    /**
+     * Delegates to the global `get_curl_download_callback()` function.
+     *
+     * The first four parameters are taken by reference and handed over as-is, so the returned
+     * closure writes its findings into the caller's variables.
+     *
+     * @param string $charset             to extract from the downloaded page (reference)
+     * @param string $title               to extract from the downloaded page (reference)
+     * @param string $description         to extract from the downloaded page (reference)
+     * @param string $keywords            to extract from the downloaded page (reference)
+     * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
+     *
+     * @return \Closure callback suitable for CURLOPT_WRITEFUNCTION
+     */
+    public function getCurlDownloadCallback(
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        $retrieveDescription
+    ) {
+        return get_curl_download_callback(
+            $charset,
+            $title,
+            $description,
+            $keywords,
+            $retrieveDescription
+        );
+    }
+    /**
+     * Delegates to the global `get_curl_header_callback()` function.
+     *
+     * @param string $charset     to extract from the response headers (reference)
+     * @param string $curlGetInfo Optionally overrides curl_getinfo function
+     *
+     * @return \Closure callback suitable for CURLOPT_HEADERFUNCTION
+     */
+    public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
+    {
+        return get_curl_header_callback($charset, $curlGetInfo);
+    }
+}
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php
index 2ea9195d..28c12969 100644
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -6,12 +6,14 @@ use Shaarli\Http\Url;
 * GET an HTTP URL to retrieve its content
 * Uses the cURL library or a fallback method
 *
- * @param string          $url               URL to get (http://...)
+ * @param string          $url                URL to get (http://...)
- * @param int             $timeout           network timeout (in seconds)
+ * @param int             $timeout            network timeout (in seconds)
- * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
+ * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
- * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
- *                                           Can be used to add download conditions on the
+ *                                            (CURLOPT_HEADERFUNCTION)
- *                                           headers (response code, content type, etc.).
+ * @param callable|string $curlWriteFunction  Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ *                                            Can be used to add download conditions on the
+ *                                            headers (response code, content type, etc.).
 *
 * @return array HTTP response headers, downloaded content
 *
@@ -35,8 +37,13 @@ use Shaarli\Http\Url;
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
+function get_http_response(
-{
+    $url,
+    $timeout = 30,
+    $maxBytes = 4194304,
+    $curlHeaderFunction = null,
+    $curlWriteFunction = null
+) {
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();
@@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
    // General cURL settings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
-    curl_setopt($ch, CURLOPT_HEADER, true);
+    // Default header download if the $curlHeaderFunction is not defined
+    curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
    curl_setopt(
        $ch,
        CURLOPT_HTTPHEADER,
@@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
-    if (is_callable($curlWriteFunction)) {
-        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
-    }
    // Max download size management
    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
+    if (is_callable($curlHeaderFunction)) {
+        curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
+    }
+    if (is_callable($curlWriteFunction)) {
+        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
+    }
    curl_setopt(
        $ch,
        CURLOPT_PROGRESSFUNCTION,
-        function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
+        function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
-            if (version_compare(phpversion(), '5.5', '<')) {
+            $downloaded = $arg2;
-                // PHP version lower than 5.5
-                // Callback has 4 arguments
-                $downloaded = $arg1;
-            } else {
-                // Callback has 5 arguments
-                $downloaded = $arg2;
-            }
            // Non-zero return stops downloading
            return ($downloaded > $maxBytes) ? 1 : 0;
        }
@@ -369,7 +373,11 @@ function server_url($server)
 */
 function index_url($server)
 {
-    $scriptname = $server['SCRIPT_NAME'];
+    if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
+        return rtrim(SHAARLI_ROOT_URL, '/') . '/';
+    }
+    $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
    if (endsWith($scriptname, 'index.php')) {
        $scriptname = substr($scriptname, 0, -9);
    }
@@ -377,7 +385,7 @@ function index_url($server)
 }
 /**
- * Returns the absolute URL of the current script, with the query
+ * Returns the absolute URL of the current script, with current route and query
 *
 * If the resource is "index.php", then it is removed (for better-looking URLs)
 *
@@ -387,10 +395,17 @@ function index_url($server)
 */
 function page_url($server)
 {
+    // Directory part of the running script: a trailing "index.php" is dropped
+    // so that it never shows up in the generated URL.
+    $scriptname = $server['SCRIPT_NAME'] ?? '';
+    if (endsWith($scriptname, 'index.php')) {
+        $scriptname = substr($scriptname, 0, -9);
+    }
+    // The route is whatever follows the script directory in the request URI.
+    // The script path is plain text, not a pattern: quote it (including the '@' delimiter)
+    // so that characters such as '.', '+' or '@' cannot alter or break the regular expression.
+    $route = preg_replace('@^' . preg_quote($scriptname, '@') . '@', '', $server['REQUEST_URI'] ?? '');
+    // NOTE(review): REQUEST_URI usually already carries the query string on real servers;
+    // confirm callers provide it without one, otherwise the query is appended twice below.
     if (! empty($server['QUERY_STRING'])) {
-        return index_url($server).'?'.$server['QUERY_STRING'];
+        return index_url($server) . $route . '?' . $server['QUERY_STRING'];
     }
-    return index_url($server);
+    return index_url($server) . $route;
 }
 /**
@@ -477,3 +492,132 @@ function is_https($server)
    return ! empty($server['HTTPS']);
 }
+/**
+ * Get cURL callback function for CURLOPT_HEADERFUNCTION
+ *
+ * The returned closure is meant to be called by cURL with each chunk of response headers.
+ * It aborts the transfer (returns false) when the response is neither a redirection nor a
+ * 200, or when its content type is not HTML, and it fills $charset from the Content-Type
+ * header when no charset is known yet.
+ *
+ * @param string $charset     to extract from the response headers (reference)
+ * @param string $curlGetInfo Optionally overrides curl_getinfo function
+ *
+ * @return Closure
+ */
+function get_curl_header_callback(
+    &$charset,
+    $curlGetInfo = 'curl_getinfo'
+) {
+    $isRedirected = false;
+    /**
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of header data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        $chunkLength = strlen($data);
+        $contentType = null;
+        // Redirections are followed by cURL (CURLOPT_FOLLOWLOCATION): let every redirect status
+        // through, not only 301/302, otherwise 303/307/308 responses would abort the download.
+        if (!empty($responseCode) && in_array($responseCode, [301, 302, 303, 307, 308])) {
+            $isRedirected = true;
+            return $chunkLength;
+        }
+        if (!empty($responseCode) && $responseCode !== 200) {
+            return false;
+        }
+        // After a redirection, the content type will keep the previous request value
+        // until it finds the next content-type header.
+        if (! $isRedirected || stripos($data, 'content-type') !== false) {
+            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        }
+        // Only HTML pages are of interest: stop as soon as another content type is announced.
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (!empty($contentType) && empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+        return $chunkLength;
+    };
+}
+/**
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
+ *
+ * The returned closure scans every downloaded chunk for the page charset and title and,
+ * when $retrieveDescription is enabled, for the description and keywords. Whatever it finds
+ * is written into the variables passed by reference.
+ *
+ * @param string $charset             to extract from the downloaded page (reference)
+ * @param string $title               to extract from the downloaded page (reference)
+ * @param string $description         to extract from the downloaded page (reference)
+ * @param string $keywords            to extract from the downloaded page (reference)
+ * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
+ *
+ * @return Closure
+ */
+function get_curl_download_callback(
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription
+) {
+    // Number of chunks received so far, and number of the last chunk in which a metadata was found.
+    $currentChunk = 0;
+    $foundChunk = null;
+    /**
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
+     *
+     * This closure does not look at the HTTP code nor at the content type of the response.
+     * It extracts the charset, the title and optionally the description and keywords,
+     * and stops the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function ($ch, $data) use (
+        $retrieveDescription,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $chunkLength = strlen($data);
+        $currentChunk++;
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        // Fallback: look for a "title" tag when the title extraction above found nothing.
+        if (empty($title)) {
+            $title = html_extract_tag('title', $data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($description)) {
+            $description = html_extract_tag('description', $data);
+            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($keywords)) {
+            $keywords = html_extract_tag('keywords', $data);
+            if (! empty($keywords)) {
+                $foundChunk = $currentChunk;
+                // Keywords use the format tag1, tag2 multiple words, tag
+                // So we format them to match Shaarli's separator and glue multiple words with '-'
+                $keywords = implode(' ', array_map(function ($keyword) {
+                    return implode('-', preg_split('/\s+/', trim($keyword)));
+                }, explode(',', $keywords)));
+            }
+        }
+        // We got everything we want, stop the download.
+        // If we already found either the title, description or keywords,
+        // it's highly unlikely that we'll find the other metas further than
+        // in the same chunk of data or the next one. So we also stop the download after that.
+        //
+        // The response code and the content type are not available in this closure: testing them
+        // here (on variables that are never defined) made this condition always false, so the
+        // download was never interrupted. Only the charset and the found chunk are relevant.
+        if (!empty($charset) && $foundChunk !== null
+            && (! $retrieveDescription
+                || $foundChunk < $currentChunk
+                || (!empty($title) && !empty($description) && !empty($keywords))
+            )
+        ) {
+            return false;
+        }
+        return $chunkLength;
+    };
+}
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php
new file mode 100644
index 00000000..ba9bd40c
--- /dev/null
+++ b/application/http/MetadataRetriever.php
@@ -0,0 +1,69 @@
+<?php
+declare(strict_types=1);
+namespace Shaarli\Http;
+use Shaarli\Config\ConfigManager;
+/**
+ * HTTP Tool used to extract metadata from external URL (title, description, etc.).
+ */
+class MetadataRetriever
+{
+    /** @var ConfigManager */
+    protected $conf;
+    /** @var HttpAccess */
+    protected $httpAccess;
+    /**
+     * @param ConfigManager $conf       Source of the `general.*` download settings read by retrieve()
+     * @param HttpAccess    $httpAccess HTTP wrapper used to download the page and build the cURL callbacks
+     */
+    public function __construct(ConfigManager $conf, HttpAccess $httpAccess)
+    {
+        $this->conf = $conf;
+        $this->httpAccess = $httpAccess;
+    }
+    /**
+     * Retrieve metadata for given URL.
+     *
+     * Values that could not be extracted are left to null. The title is converted to UTF-8
+     * when a different charset was detected.
+     *
+     * @param string $url URL of the page to download
+     *
+     * @return array [
+     *                  'title' => <remote title>,
+     *                  'description' => <remote description>,
+     *                  'tags' => <remote keywords>,
+     *               ]
+     */
+    public function retrieve(string $url): array
+    {
+        $charset = null;
+        $title = null;
+        $description = null;
+        $tags = null;
+        $retrieveDescription = $this->conf->get('general.retrieve_description');
+        // Timeout and maximum size come from the configuration (defaults: 30 seconds, 4 MiB).
+        // The callbacks fill $charset, $title, $description and $tags by reference
+        // with data from the downloaded page.
+        $this->httpAccess->getHttpResponse(
+            $url,
+            $this->conf->get('general.download_timeout', 30),
+            $this->conf->get('general.download_max_size', 4194304),
+            $this->httpAccess->getCurlHeaderCallback($charset),
+            $this->httpAccess->getCurlDownloadCallback(
+                $charset,
+                $title,
+                $description,
+                $tags,
+                $retrieveDescription
+            )
+        );
+        // $charset stays null when no charset could be detected: with strict types enabled,
+        // passing null to strtolower()/mb_convert_encoding() raises a TypeError, so the title
+        // is only converted when a charset other than UTF-8 is actually known.
+        if (!empty($title) && !empty($charset) && strtolower($charset) !== 'utf-8') {
+            $title = mb_convert_encoding($title, 'utf-8', $charset);
+        }
+        return [
+            'title' => $title,
+            'description' => $description,
+            'tags' => $tags,
+        ];
+    }
+}

diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php new file mode 100644 index 00000000..646a5264 --- /dev/null +++ b/application/http/HttpAccess.php
@@ -0,0 +1,47 @@
		1	<?php
		2
		3	declare(strict_types=1);
		4
		5	namespace Shaarli\Http;
		6
		7	/**
		8	* Class HttpAccess
		9	*
		10	* This is mostly an OOP wrapper for HTTP functions defined in `HttpUtils`.
		11	* It is used as dependency injection in Shaarli's container.
		12	*
		13	* @package Shaarli\Http
		14	*/
		15	class HttpAccess
		16	{
		17	public function getHttpResponse(
		18	$url,
		19	$timeout = 30,
		20	$maxBytes = 4194304,
		21	$curlHeaderFunction = null,
		22	$curlWriteFunction = null
		23	) {
		24	return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);
		25	}
		26
		27	public function getCurlDownloadCallback(
		28	&$charset,
		29	&$title,
		30	&$description,
		31	&$keywords,
		32	$retrieveDescription
		33	) {
		34	return get_curl_download_callback(
		35	$charset,
		36	$title,
		37	$description,
		38	$keywords,
		39	$retrieveDescription
		40	);
		41	}
		42
		43	public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
		44	{
		45	return get_curl_header_callback($charset, $curlGetInfo);
		46	}
		47	}


diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 2ea9195d..28c12969 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php
@@ -6,12 +6,14 @@ use Shaarli\Http\Url;
6	* GET an HTTP URL to retrieve its content	6	* GET an HTTP URL to retrieve its content
7	* Uses the cURL library or a fallback method	7	* Uses the cURL library or a fallback method
8	*	8	*
9	* @param string $url URL to get (http://...)	9	* @param string $url URL to get (http://...)
10	* @param int $timeout network timeout (in seconds)	10	* @param int $timeout network timeout (in seconds)
11	* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)	11	* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
12	* @param callable\|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).	12	* @param callable\|string $curlHeaderFunction Optional callback called during the download of headers
13	* Can be used to add download conditions on the	13	* (CURLOPT_HEADERFUNCTION)
14	* headers (response code, content type, etc.).	14	* @param callable\|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
		15	* Can be used to add download conditions on the
		16	* headers (response code, content type, etc.).
15	*	17	*
16	* @return array HTTP response headers, downloaded content	18	* @return array HTTP response headers, downloaded content
17	*	19	*
@@ -35,8 +37,13 @@ use Shaarli\Http\Url;
35	* @see http://stackoverflow.com/q/9183178	37	* @see http://stackoverflow.com/q/9183178
36	* @see http://stackoverflow.com/q/1462720	38	* @see http://stackoverflow.com/q/1462720
37	*/	39	*/
38	function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)	40	function get_http_response(
39	{	41	$url,
		42	$timeout = 30,
		43	$maxBytes = 4194304,
		44	$curlHeaderFunction = null,
		45	$curlWriteFunction = null
		46	) {
40	$urlObj = new Url($url);	47	$urlObj = new Url($url);
41	$cleanUrl = $urlObj->idnToAscii();	48	$cleanUrl = $urlObj->idnToAscii();
42		49
@@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
70	// General cURL settings	77	// General cURL settings
71	curl_setopt($ch, CURLOPT_AUTOREFERER, true);	78	curl_setopt($ch, CURLOPT_AUTOREFERER, true);
72	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);	79	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
73	curl_setopt($ch, CURLOPT_HEADER, true);	80	// Default header download if the $curlHeaderFunction is not defined
		81	curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
74	curl_setopt(	82	curl_setopt(
75	$ch,	83	$ch,
76	CURLOPT_HTTPHEADER,	84	CURLOPT_HTTPHEADER,
@@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
81	curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);	89	curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
82	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);	90	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
83		91
84	if (is_callable($curlWriteFunction)) {
85	curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
86	}
87
88	// Max download size management	92	// Max download size management
89	curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);	93	curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
90	curl_setopt($ch, CURLOPT_NOPROGRESS, false);	94	curl_setopt($ch, CURLOPT_NOPROGRESS, false);
		95	if (is_callable($curlHeaderFunction)) {
		96	curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
		97	}
		98	if (is_callable($curlWriteFunction)) {
		99	curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
		100	}
91	curl_setopt(	101	curl_setopt(
92	$ch,	102	$ch,
93	CURLOPT_PROGRESSFUNCTION,	103	CURLOPT_PROGRESSFUNCTION,
94	function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {	104	function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
95	if (version_compare(phpversion(), '5.5', '<')) {	105	$downloaded = $arg2;
96	// PHP version lower than 5.5	106
97	// Callback has 4 arguments
98	$downloaded = $arg1;
99	} else {
100	// Callback has 5 arguments
101	$downloaded = $arg2;
102	}
103	// Non-zero return stops downloading	107	// Non-zero return stops downloading
104	return ($downloaded > $maxBytes) ? 1 : 0;	108	return ($downloaded > $maxBytes) ? 1 : 0;
105	}	109	}
@@ -369,7 +373,11 @@ function server_url($server)
369	*/	373	*/
370	function index_url($server)	374	function index_url($server)
371	{	375	{
372	$scriptname = $server['SCRIPT_NAME'];	376	if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
		377	return rtrim(SHAARLI_ROOT_URL, '/') . '/';
		378	}
		379
		380	$scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
373	if (endsWith($scriptname, 'index.php')) {	381	if (endsWith($scriptname, 'index.php')) {
374	$scriptname = substr($scriptname, 0, -9);	382	$scriptname = substr($scriptname, 0, -9);
375	}	383	}
@@ -377,7 +385,7 @@ function index_url($server)
377	}	385	}
378		386
379	/**	387	/**
380	* Returns the absolute URL of the current script, with the query	388	* Returns the absolute URL of the current script, with current route and query
381	*	389	*
382	* If the resource is "index.php", then it is removed (for better-looking URLs)	390	* If the resource is "index.php", then it is removed (for better-looking URLs)
383	*	391	*
@@ -387,10 +395,17 @@ function index_url($server)
387	*/	395	*/
388	function page_url($server)	396	function page_url($server)
389	{	397	{
		398	$scriptname = $server['SCRIPT_NAME'] ?? '';
		399	if (endsWith($scriptname, 'index.php')) {
		400	$scriptname = substr($scriptname, 0, -9);
		401	}
		402
		403	$route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? '');
390	if (! empty($server['QUERY_STRING'])) {	404	if (! empty($server['QUERY_STRING'])) {
391	return index_url($server).'?'.$server['QUERY_STRING'];	405	return index_url($server) . $route . '?' . $server['QUERY_STRING'];
392	}	406	}
393	return index_url($server);	407
		408	return index_url($server) . $route;
394	}	409	}
395		410
396	/**	411	/**
@@ -477,3 +492,132 @@ function is_https($server)
477		492
478	return ! empty($server['HTTPS']);	493	return ! empty($server['HTTPS']);
479	}	494	}
		495
		496	/**
		497	* Get cURL callback function for CURLOPT_HEADERFUNCTION
		498	*
		499	* @param string $charset to extract from the downloaded page (reference)
		500	* @param string $curlGetInfo Optionally overrides curl_getinfo function
		501	*
		502	* @return Closure
		503	*/
		504	function get_curl_header_callback(
		505	&$charset,
		506	$curlGetInfo = 'curl_getinfo'
		507	) {
		508	$isRedirected = false;
		509
		510	return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
		511	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
		512	$chunkLength = strlen($data);
		513	if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
		514	$isRedirected = true;
		515	return $chunkLength;
		516	}
		517	if (!empty($responseCode) && $responseCode !== 200) {
		518	return false;
		519	}
		520	// After a redirection, the content type will keep the previous request value
		521	// until it finds the next content-type header.
		522	if (! $isRedirected \|\| strpos(strtolower($data), 'content-type') !== false) {
		523	$contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
		524	}
		525	if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
		526	return false;
		527	}
		528	if (!empty($contentType) && empty($charset)) {
		529	$charset = header_extract_charset($contentType);
		530	}
		531
		532	return $chunkLength;
		533	};
		534	}
		535
		536	/**
		537	* Get cURL callback function for CURLOPT_WRITEFUNCTION
		538	*
		539	* @param string $charset to extract from the downloaded page (reference)
		540	* @param string $title to extract from the downloaded page (reference)
		541	* @param string $description to extract from the downloaded page (reference)
		542	* @param string $keywords to extract from the downloaded page (reference)
		543	* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
		544	* @param string $curlGetInfo Optionally overrides curl_getinfo function
		545	*
		546	* @return Closure
		547	*/
		548	function get_curl_download_callback(
		549	&$charset,
		550	&$title,
		551	&$description,
		552	&$keywords,
		553	$retrieveDescription
		554	) {
		555	$currentChunk = 0;
		556	$foundChunk = null;
		557
		558	/**
		559	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
		560	*
		561	* The HTTP code and the content type ('text/html') are not checked by this callback.
		562	* While downloading the remote page, we extract the title and the charset and stop the download when it's done.
		563	*
		564	* @param resource $ch cURL resource
		565	* @param string $data chunk of data being downloaded
		566	*
		567	* @return int\|bool length of $data or false if we need to stop the download
		568	*/
		569	return function ($ch, $data) use (
		570	$retrieveDescription,
		571	&$charset,
		572	&$title,
		573	&$description,
		574	&$keywords,
		575	&$currentChunk,
		576	&$foundChunk
		577	) {
		578	$chunkLength = strlen($data);
		579	$currentChunk++;
		580
		581	if (empty($charset)) {
		582	$charset = html_extract_charset($data);
		583	}
		584	if (empty($title)) {
		585	$title = html_extract_title($data);
		586	$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
		587	}
		588	if (empty($title)) {
		589	$title = html_extract_tag('title', $data);
		590	$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
		591	}
		592	if ($retrieveDescription && empty($description)) {
		593	$description = html_extract_tag('description', $data);
		594	$foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
		595	}
		596	if ($retrieveDescription && empty($keywords)) {
		597	$keywords = html_extract_tag('keywords', $data);
		598	if (! empty($keywords)) {
		599	$foundChunk = $currentChunk;
		600	// Keywords use the format tag1, tag2 multiple words, tag
		601	// So we format them to match Shaarli's separator and glue multiple words with '-'
		602	$keywords = implode(' ', array_map(function($keyword) {
		603	return implode('-', preg_split('/\s+/', trim($keyword)));
		604	}, explode(',', $keywords)));
		605	}
		606	}
		607
		608	// We got everything we want, stop the download.
		609	// If we already found either the title, description or keywords,
		610	// it's highly unlikely that we'll found the other metas further than
		611	// in the same chunk of data or the next one. So we also stop the download after that.
		612	if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
		613	&& (! $retrieveDescription
		614	\|\| $foundChunk < $currentChunk
		615	\|\| (!empty($title) && !empty($description) && !empty($keywords))
		616	)
		617	) {
		618	return false;
		619	}
		620
		621	return $chunkLength;
		622	};
		623	}


diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php new file mode 100644 index 00000000..ba9bd40c --- /dev/null +++ b/application/http/MetadataRetriever.php
@@ -0,0 +1,69 @@
		1	<?php
		2
		3	declare(strict_types=1);
		4
		5	namespace Shaarli\Http;
		6
		7	use Shaarli\Config\ConfigManager;
		8
		9	/**
		10	* HTTP Tool used to extract metadata from external URL (title, description, etc.).
		11	*/
		12	class MetadataRetriever
		13	{
		14	/** @var ConfigManager */
		15	protected $conf;
		16
		17	/** @var HttpAccess */
		18	protected $httpAccess;
		19
		20	public function __construct(ConfigManager $conf, HttpAccess $httpAccess)
		21	{
		22	$this->conf = $conf;
		23	$this->httpAccess = $httpAccess;
		24	}
		25
		26	/**
		27	* Retrieve metadata for given URL.
		28	*
		29	* @return array [
		30	* 'title' => <remote title>,
		31	* 'description' => <remote description>,
		32	* 'tags' => <remote keywords>,
		33	* ]
		34	*/
		35	public function retrieve(string $url): array
		36	{
		37	$charset = null;
		38	$title = null;
		39	$description = null;
		40	$tags = null;
		41	$retrieveDescription = $this->conf->get('general.retrieve_description');
		42
		43	// Short timeout to keep the application responsive
		44	// The callback will fill $charset and $title with data from the downloaded page.
		45	$this->httpAccess->getHttpResponse(
		46	$url,
		47	$this->conf->get('general.download_timeout', 30),
		48	$this->conf->get('general.download_max_size', 4194304),
		49	$this->httpAccess->getCurlHeaderCallback($charset),
		50	$this->httpAccess->getCurlDownloadCallback(
		51	$charset,
		52	$title,
		53	$description,
		54	$tags,
		55	$retrieveDescription
		56	)
		57	);
		58
		59	if (!empty($title) && strtolower($charset) !== 'utf-8') {
		60	$title = mb_convert_encoding($title, 'utf-8', $charset);
		61	}
		62
		63	return [
		64	'title' => $title,
		65	'description' => $description,
		66	'tags' => $tags,
		67	];
		68	}
		69	}