Extract the title/charset during page download, and check content type

Use CURLOPT_WRITEFUNCTION to check the response code and content type (only HTML is allowed). Also extract the title and charset from each chunk of data as it is downloaded, and stop the download once everything has been extracted. Closes #579
author: ArthurHoaro <arthur@hoa.ro> 2017-09-30 11:04:13 +0200
committer: ArthurHoaro <arthur@hoa.ro> 2017-10-28 14:35:49 +0200
commit: d65342e304f92643ba922200953cfebc51e1e482 (patch)
tree: 3097c77bb4dd0590c4644422b5dc4369a4186eb7 /application/LinkUtils.php
parent: a59bbf50d7530d7e82a91896a210b9da49cb1568 (diff)
download: Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.gz
Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.zst
Shaarli-d65342e304f92643ba922200953cfebc51e1e482.zip
1 files changed, 55 insertions, 34 deletions
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index 976474de..c0dd32a6 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
 <?php
 /**
- * Extract title from an HTML document.
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
 *
- * @param string $html HTML content where to look for a title.
+ * @param string $charset     to extract from the downloaded page (reference)
+ * @param string $title       to extract from the downloaded page (reference)
+ * @param string $curlGetInfo Optionally overrides curl_getinfo function
 *
- * @return bool|string Extracted title if found, false otherwise.
+ * @return Closure
 */
-function html_extract_title($html)
+function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
 {
-    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
+    /**
-        return trim(str_replace("\n", '', $matches[1]));
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
-    }
+     *
-    return false;
+     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'.
+     * Then we extract the title and the charset and stop the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        if (!empty($responseCode) && $responseCode != 200) {
+            return false;
+        }
+        $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+        }
+        // We got everything we want, stop the download.
+        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+            return false;
+        }
+        return strlen($data);
+    };
 }
 /**
- * Determine charset from downloaded page.
+ * Extract title from an HTML document.
- * Priority:
- *   1. HTTP headers (Content type).
- *   2. HTML content page (tag <meta charset>).
- *   3. Use a default charset (default: UTF-8).
 *
- * @param array  $headers           HTTP headers array.
+ * @param string $html HTML content where to look for a title.
- * @param string $htmlContent       HTML content where to look for charset.
- * @param string $defaultCharset    Default charset to apply if other methods failed.
 *
- * @return string Determined charset.
+ * @return bool|string Extracted title if found, false otherwise.
 */
-function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+function html_extract_title($html)
 {
-    if ($charset = headers_extract_charset($headers)) {
+    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
-        return $charset;
+        return trim(str_replace("\n", '', $matches[1]));
-    }
-    if ($charset = html_extract_charset($htmlContent)) {
-        return $charset;
    }
+    return false;
-    return $defaultCharset;
 }
 /**
- * Extract charset from HTTP headers if it's defined.
+ * Extract charset from HTTP header if it's defined.
 *
- * @param array $headers HTTP headers array.
+ * @param string $header HTTP header Content-Type line.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
-function headers_extract_charset($headers)
+function header_extract_charset($header)
 {
-    if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
+    preg_match('/charset="?([^; ]+)/i', $header, $match);
-        preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
+    if (! empty($match[1])) {
-        if (! empty($match[1])) {
+        return strtolower(trim($match[1]));
-            return strtolower(trim($match[1]));
-        }
    }
    return false;
author	ArthurHoaro <arthur@hoa.ro>	2017-09-30 11:04:13 +0200
committer	ArthurHoaro <arthur@hoa.ro>	2017-10-28 14:35:49 +0200
commit	d65342e304f92643ba922200953cfebc51e1e482 (patch)
tree	3097c77bb4dd0590c4644422b5dc4369a4186eb7 /application/LinkUtils.php
parent	a59bbf50d7530d7e82a91896a210b9da49cb1568 (diff)
download	Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.gz Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.zst Shaarli-d65342e304f92643ba922200953cfebc51e1e482.zip

diff --git a/application/LinkUtils.php b/application/LinkUtils.php index 976474de..c0dd32a6 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
1	<?php	1	<?php
2		2
3	/**	3	/**
4	* Extract title from an HTML document.	4	* Get cURL callback function for CURLOPT_WRITEFUNCTION
5	*	5	*
6	* @param string $html HTML content where to look for a title.	6	* @param string $charset to extract from the downloaded page (reference)
		7	* @param string $title to extract from the downloaded page (reference)
		8	* @param string $curlGetInfo Optionally overrides curl_getinfo function
7	*	9	*
8	* @return bool|string Extracted title if found, false otherwise.	10	* @return Closure
9	*/	11	*/
10	function html_extract_title($html)	12	function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
11	{	13	{
12	if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {	14	/**
13	return trim(str_replace("\n", '', $matches[1]));	15	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
14	}	16	*
15	return false;	17	* While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'.
		18	* Then we extract the title and the charset and stop the download when it's done.
		19	*
		20	* @param resource $ch cURL resource
		21	* @param string $data chunk of data being downloaded
		22	*
		23	* @return int|bool length of $data or false if we need to stop the download
		24	*/
		25	return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
		26	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
		27	if (!empty($responseCode) && $responseCode != 200) {
		28	return false;
		29	}
		30	$contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
		31	if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
		32	return false;
		33	}
		34	if (empty($charset)) {
		35	$charset = header_extract_charset($contentType);
		36	}
		37	if (empty($charset)) {
		38	$charset = html_extract_charset($data);
		39	}
		40	if (empty($title)) {
		41	$title = html_extract_title($data);
		42	}
		43	// We got everything we want, stop the download.
		44	if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
		45	return false;
		46	}
		47
		48	return strlen($data);
		49	};
16	}	50	}
17		51
18	/**	52	/**
19	* Determine charset from downloaded page.	53	* Extract title from an HTML document.
20	* Priority:
21	* 1. HTTP headers (Content type).
22	* 2. HTML content page (tag <meta charset>).
23	* 3. Use a default charset (default: UTF-8).
24	*	54	*
25	* @param array $headers HTTP headers array.	55	* @param string $html HTML content where to look for a title.
26	* @param string $htmlContent HTML content where to look for charset.
27	* @param string $defaultCharset Default charset to apply if other methods failed.
28	*	56	*
29	* @return string Determined charset.	57	* @return bool|string Extracted title if found, false otherwise.
30	*/	58	*/
31	function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')	59	function html_extract_title($html)
32	{	60	{
33	if ($charset = headers_extract_charset($headers)) {	61	if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
34	return $charset;	62	return trim(str_replace("\n", '', $matches[1]));
35	}
36
37	if ($charset = html_extract_charset($htmlContent)) {
38	return $charset;
39	}	63	}
40		64	return false;
41	return $defaultCharset;
42	}	65	}
43		66
44	/**	67	/**
45	* Extract charset from HTTP headers if it's defined.	68	* Extract charset from HTTP header if it's defined.
46	*	69	*
47	* @param array $headers HTTP headers array.	70	* @param string $header HTTP header Content-Type line.
48	*	71	*
49	* @return bool|string Charset string if found (lowercase), false otherwise.	72	* @return bool|string Charset string if found (lowercase), false otherwise.
50	*/	73	*/
51	function headers_extract_charset($headers)	74	function header_extract_charset($header)
52	{	75	{
53	if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {	76	preg_match('/charset="?([^; ]+)/i', $header, $match);
54	preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);	77	if (! empty($match[1])) {
55	if (! empty($match[1])) {	78	return strtolower(trim($match[1]));
56	return strtolower(trim($match[1]));
57	}
58	}	79	}
59		80
60	return false;	81	return false;