Merge branch 'v0.12' into latest

author: ArthurHoaro <arthur@hoa.ro> 2020-10-13 12:05:08 +0200
committer: ArthurHoaro <arthur@hoa.ro> 2020-10-13 12:05:08 +0200
commit: b6f678a5a1d15acf284ebcec16c905e976671ce1 (patch)
tree: 33c7da831482ed79c44896ef19c73c72ada84f2e /application/http
parent: b14687036b9b800681197f51fdc47e62f0c88e2e (diff)
parent: 1c1520b6b98ab20201bfe15577782a52320339df (diff)
download: Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.gz
Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.zst
Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.zip
3 files changed, 161 insertions, 5 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php
new file mode 100644
index 00000000..81d9e076
--- /dev/null
+++ b/application/http/HttpAccess.php
@@ -0,0 +1,39 @@
+<?php
+declare(strict_types=1);
+namespace Shaarli\Http;
+/**
+ * Class HttpAccess
+ *
+ * Object-oriented facade over the procedural HTTP helpers defined in `HttpUtils`,
+ * so that they can be provided through dependency injection in Shaarli's container.
+ *
+ * @package Shaarli\Http
+ */
+class HttpAccess
+{
+    /**
+     * Forwards the call to the global `get_http_response()` helper, arguments unchanged.
+     *
+     * @param mixed $url               forwarded as-is
+     * @param mixed $timeout           forwarded as-is (defaults to 30; unit is defined by the helper)
+     * @param mixed $maxBytes          forwarded as-is (defaults to 4194304)
+     * @param mixed $curlWriteFunction forwarded as-is (defaults to null)
+     *
+     * @return mixed whatever `get_http_response()` returns
+     */
+    public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
+    {
+        $response = get_http_response($url, $timeout, $maxBytes, $curlWriteFunction);
+
+        return $response;
+    }
+
+    /**
+     * Forwards the call to the global `get_curl_download_callback()` helper.
+     *
+     * The first four parameters are taken by reference and handed over untouched,
+     * so the helper receives the caller's own variables.
+     *
+     * @param mixed $charset             passed through by reference
+     * @param mixed $title               passed through by reference
+     * @param mixed $description         passed through by reference
+     * @param mixed $keywords            passed through by reference
+     * @param mixed $retrieveDescription forwarded as-is
+     * @param mixed $curlGetInfo         forwarded as-is (defaults to 'curl_getinfo')
+     *
+     * @return mixed whatever `get_curl_download_callback()` returns
+     */
+    public function getCurlDownloadCallback(
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        $retrieveDescription,
+        $curlGetInfo = 'curl_getinfo'
+    ) {
+        $callback = get_curl_download_callback(
+            $charset,
+            $title,
+            $description,
+            $keywords,
+            $retrieveDescription,
+            $curlGetInfo
+        );
+
+        return $callback;
+    }
+}
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php
index 2ea9195d..9f414073 100644
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -369,7 +369,11 @@ function server_url($server)
 */
 function index_url($server)
 {
-    $scriptname = $server['SCRIPT_NAME'];
+    if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
+        return rtrim(SHAARLI_ROOT_URL, '/') . '/';
+    }
+    $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
    if (endsWith($scriptname, 'index.php')) {
        $scriptname = substr($scriptname, 0, -9);
    }
@@ -377,7 +381,7 @@ function index_url($server)
 }
 /**
- * Returns the absolute URL of the current script, with the query
+ * Returns the absolute URL of the current script, with current route and query
 *
 * If the resource is "index.php", then it is removed (for better-looking URLs)
 *
@@ -387,10 +391,17 @@ function index_url($server)
 */
 function page_url($server)
 {
+    $scriptname = $server['SCRIPT_NAME'] ?? '';
+    if (endsWith($scriptname, 'index.php')) {
+        $scriptname = substr($scriptname, 0, -9);
+    }
+    // The script path comes from the server environment and is used inside a regex:
+    // quote it so that metacharacters (and the '@' delimiter) are matched literally.
+    $route = preg_replace('@^' . preg_quote($scriptname, '@') . '@', '', $server['REQUEST_URI'] ?? '');
     if (! empty($server['QUERY_STRING'])) {
-        return index_url($server).'?'.$server['QUERY_STRING'];
+        return index_url($server) . $route . '?' . $server['QUERY_STRING'];
     }
-    return index_url($server);
+    return index_url($server) . $route;
 }
 /**
@@ -477,3 +488,109 @@ function is_https($server)
    return ! empty($server['HTTPS']);
 }
+/**
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
+ *
+ * The returned closure writes its findings into the variables given by reference.
+ *
+ * @param string   $charset     to extract from the downloaded page (reference)
+ * @param string   $title       to extract from the downloaded page (reference)
+ * @param string   $description to extract from the downloaded page (reference)
+ * @param string   $keywords    to extract from the downloaded page (reference)
+ * @param bool     $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
+ * @param callable $curlGetInfo Optionally overrides curl_getinfo function
+ *
+ * @return Closure
+ */
+function get_curl_download_callback(
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription,
+    $curlGetInfo = 'curl_getinfo'
+) {
+    // State shared between successive invocations of the closure (one per downloaded chunk).
+    $isRedirected = false;
+    $currentChunk = 0;
+    $foundChunk = null;
+    /**
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
+     *
+     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
+     * Then we extract the title and the charset and stop the download when it's done.
+     *
+     * The handle is received by value: cURL does not pass it by reference, and a by-reference
+     * parameter makes PHP 8 emit a warning on every chunk.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function ($ch, $data) use (
+        $retrieveDescription,
+        $curlGetInfo,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$isRedirected,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $currentChunk++;
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        // Strict comparison, consistent with the `!== 200` check below.
+        if (!empty($responseCode) && in_array($responseCode, [301, 302], true)) {
+            $isRedirected = true;
+            return strlen($data);
+        }
+        if (!empty($responseCode) && $responseCode !== 200) {
+            return false;
+        }
+        // After a redirection, the content type will keep the previous request value
+        // until it finds the next content-type header.
+        if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
+            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        }
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (!empty($contentType) && empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($description)) {
+            $description = html_extract_tag('description', $data);
+            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($keywords)) {
+            $keywords = html_extract_tag('keywords', $data);
+            if (! empty($keywords)) {
+                $foundChunk = $currentChunk;
+                // Keywords use the format tag1, tag2 multiple words, tag
+                // So we format them to match Shaarli's separator and glue multiple words with '-'
+                $keywords = implode(' ', array_map(function($keyword) {
+                    return implode('-', preg_split('/\s+/', trim($keyword)));
+                }, explode(',', $keywords)));
+            }
+        }
+        // We got everything we want, stop the download.
+        // If we already found either the title, description or keywords,
+        // it's highly unlikely that we'll find the other metas further than
+        // in the same chunk of data or the next one. So we also stop the download after that.
+        if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
+            && (! $retrieveDescription
+                || $foundChunk < $currentChunk
+                || (!empty($title) && !empty($description) && !empty($keywords))
+            )
+        ) {
+            return false;
+        }
+        return strlen($data);
+    };
+}
diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php
index 4bc84b82..e8d1a283 100644
--- a/application/http/UrlUtils.php
+++ b/application/http/UrlUtils.php
@@ -73,7 +73,7 @@ function add_trailing_slash($url)
 */
 function whitelist_protocols($url, $protocols)
 {
-    if (startsWith($url, '?') || startsWith($url, '/')) {
+    if (startsWith($url, '?') || startsWith($url, '/') || startsWith($url, '#')) {
        return $url;
    }
    $protocols = array_merge(['http', 'https'], $protocols);
author	ArthurHoaro <arthur@hoa.ro>	2020-10-13 12:05:08 +0200
committer	ArthurHoaro <arthur@hoa.ro>	2020-10-13 12:05:08 +0200
commit	b6f678a5a1d15acf284ebcec16c905e976671ce1 (patch)
tree	33c7da831482ed79c44896ef19c73c72ada84f2e /application/http
parent	b14687036b9b800681197f51fdc47e62f0c88e2e (diff)
parent	1c1520b6b98ab20201bfe15577782a52320339df (diff)
download	Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.gz Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.zst Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.zip

diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php new file mode 100644 index 00000000..81d9e076 --- /dev/null +++ b/application/http/HttpAccess.php
@@ -0,0 +1,39 @@
		1	<?php
		2
		3	declare(strict_types=1);
		4
		5	namespace Shaarli\Http;
		6
		7	/**
		8	* Class HttpAccess
		9	*
		10	* This is mostly an OOP wrapper for HTTP functions defined in `HttpUtils`.
		11	* It is used as dependency injection in Shaarli's container.
		12	*
		13	* @package Shaarli\Http
		14	*/
		15	class HttpAccess
		16	{
		17	public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
		18	{
		19	return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction);
		20	}
		21
		22	public function getCurlDownloadCallback(
		23	&$charset,
		24	&$title,
		25	&$description,
		26	&$keywords,
		27	$retrieveDescription,
		28	$curlGetInfo = 'curl_getinfo'
		29	) {
		30	return get_curl_download_callback(
		31	$charset,
		32	$title,
		33	$description,
		34	$keywords,
		35	$retrieveDescription,
		36	$curlGetInfo
		37	);
		38	}
		39	}


diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 2ea9195d..9f414073 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php
@@ -369,7 +369,11 @@ function server_url($server)
369	*/	369	*/
370	function index_url($server)	370	function index_url($server)
371	{	371	{
372	$scriptname = $server['SCRIPT_NAME'];	372	if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
		373	return rtrim(SHAARLI_ROOT_URL, '/') . '/';
		374	}
		375
		376	$scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
373	if (endsWith($scriptname, 'index.php')) {	377	if (endsWith($scriptname, 'index.php')) {
374	$scriptname = substr($scriptname, 0, -9);	378	$scriptname = substr($scriptname, 0, -9);
375	}	379	}
@@ -377,7 +381,7 @@ function index_url($server)
377	}	381	}
378		382
379	/**	383	/**
380	* Returns the absolute URL of the current script, with the query	384	* Returns the absolute URL of the current script, with current route and query
381	*	385	*
382	* If the resource is "index.php", then it is removed (for better-looking URLs)	386	* If the resource is "index.php", then it is removed (for better-looking URLs)
383	*	387	*
@@ -387,10 +391,17 @@ function index_url($server)
387	*/	391	*/
388	function page_url($server)	392	function page_url($server)
389	{	393	{
		394	$scriptname = $server['SCRIPT_NAME'] ?? '';
		395	if (endsWith($scriptname, 'index.php')) {
		396	$scriptname = substr($scriptname, 0, -9);
		397	}
		398
		399	$route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? '');
390	if (! empty($server['QUERY_STRING'])) {	400	if (! empty($server['QUERY_STRING'])) {
391	return index_url($server).'?'.$server['QUERY_STRING'];	401	return index_url($server) . $route . '?' . $server['QUERY_STRING'];
392	}	402	}
393	return index_url($server);	403
		404	return index_url($server) . $route;
394	}	405	}
395		406
396	/**	407	/**
@@ -477,3 +488,109 @@ function is_https($server)
477		488
478	return ! empty($server['HTTPS']);	489	return ! empty($server['HTTPS']);
479	}	490	}
		491
		492	/**
		493	* Get cURL callback function for CURLOPT_WRITEFUNCTION
		494	*
		495	* @param string $charset to extract from the downloaded page (reference)
		496	* @param string $title to extract from the downloaded page (reference)
		497	* @param string $description to extract from the downloaded page (reference)
		498	* @param string $keywords to extract from the downloaded page (reference)
		499	* @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
		500	* @param string $curlGetInfo Optionally overrides curl_getinfo function
		501	*
		502	* @return Closure
		503	*/
		504	function get_curl_download_callback(
		505	&$charset,
		506	&$title,
		507	&$description,
		508	&$keywords,
		509	$retrieveDescription,
		510	$curlGetInfo = 'curl_getinfo'
		511	) {
		512	$isRedirected = false;
		513	$currentChunk = 0;
		514	$foundChunk = null;
		515
		516	/**
		517	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
		518	*
		519	* While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
		520	* Then we extract the title and the charset and stop the download when it's done.
		521	*
		522	* @param resource $ch cURL resource
		523	* @param string $data chunk of data being downloaded
		524	*
		525	* @return int\|bool length of $data or false if we need to stop the download
		526	*/
		527	return function (&$ch, $data) use (
		528	$retrieveDescription,
		529	$curlGetInfo,
		530	&$charset,
		531	&$title,
		532	&$description,
		533	&$keywords,
		534	&$isRedirected,
		535	&$currentChunk,
		536	&$foundChunk
		537	) {
		538	$currentChunk++;
		539	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
		540	if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
		541	$isRedirected = true;
		542	return strlen($data);
		543	}
		544	if (!empty($responseCode) && $responseCode !== 200) {
		545	return false;
		546	}
		547	// After a redirection, the content type will keep the previous request value
		548	// until it finds the next content-type header.
		549	if (! $isRedirected \|\| strpos(strtolower($data), 'content-type') !== false) {
		550	$contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
		551	}
		552	if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
		553	return false;
		554	}
		555	if (!empty($contentType) && empty($charset)) {
		556	$charset = header_extract_charset($contentType);
		557	}
		558	if (empty($charset)) {
		559	$charset = html_extract_charset($data);
		560	}
		561	if (empty($title)) {
		562	$title = html_extract_title($data);
		563	$foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
		564	}
		565	if ($retrieveDescription && empty($description)) {
		566	$description = html_extract_tag('description', $data);
		567	$foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
		568	}
		569	if ($retrieveDescription && empty($keywords)) {
		570	$keywords = html_extract_tag('keywords', $data);
		571	if (! empty($keywords)) {
		572	$foundChunk = $currentChunk;
		573	// Keywords use the format tag1, tag2 multiple words, tag
		574	// So we format them to match Shaarli's separator and glue multiple words with '-'
		575	$keywords = implode(' ', array_map(function($keyword) {
		576	return implode('-', preg_split('/\s+/', trim($keyword)));
		577	}, explode(',', $keywords)));
		578	}
		579	}
		580
		581	// We got everything we want, stop the download.
		582	// If we already found either the title, description or keywords,
		583	// it's highly unlikely that we'll find the other metas further than
		584	// in the same chunk of data or the next one. So we also stop the download after that.
		585	if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
		586	&& (! $retrieveDescription
		587	\|\| $foundChunk < $currentChunk
		588	\|\| (!empty($title) && !empty($description) && !empty($keywords))
		589	)
		590	) {
		591	return false;
		592	}
		593
		594	return strlen($data);
		595	};
		596	}


diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php index 4bc84b82..e8d1a283 100644 --- a/application/http/UrlUtils.php +++ b/application/http/UrlUtils.php
@@ -73,7 +73,7 @@ function add_trailing_slash($url)
73	*/	73	*/
74	function whitelist_protocols($url, $protocols)	74	function whitelist_protocols($url, $protocols)
75	{	75	{
76	if (startsWith($url, '?') \|\| startsWith($url, '/')) {	76	if (startsWith($url, '?') \|\| startsWith($url, '/') \|\| startsWith($url, '#')) {
77	return $url;	77	return $url;
78	}	78	}
79	$protocols = array_merge(['http', 'https'], $protocols);	79	$protocols = array_merge(['http', 'https'], $protocols);