Fixes #410 - Retrieve title fails in multiple cases

* `get_http_url()` renamed to `get_http_response()`.
* Use the same HTTP context to retrieve response headers and content.
* Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections).
* Add `LinkUtils` to extract titles and charset.
* Try to retrieve charset from HTTP headers first (new), then HTML content.
* Use mbstring to re-encode title if necessary.
author: ArthurHoaro <arthur@hoa.ro> 2016-01-04 10:45:54 +0100
committer: ArthurHoaro <arthur@hoa.ro> 2016-01-11 21:19:31 +0100
commit: 1557cefbd76257ceb830f65806831b490faf0acc (patch)
tree: 787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application/LinkUtils.php
parent: c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
download: Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip
1 files changed, 79 insertions, 0 deletions
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
new file mode 100755
index 00000000..26dd6b67
--- /dev/null
+++ b/application/LinkUtils.php
@@ -0,0 +1,79 @@
+<?php
+/**
+ * Extract title from an HTML document.
+ *
+ * Only the first <title> element is considered, so later <title> tags
+ * (e.g. inside inline SVG) are ignored. The opening tag may carry attributes.
+ * Each line break inside the title (LF, CR or CRLF) is replaced by a space.
+ *
+ * @param string $html HTML content where to look for a title.
+ *
+ * @return bool|string Extracted title if found, false otherwise.
+ */
+function html_extract_title($html)
+{
+    // Lazy quantifier: stop at the first closing tag instead of the last one.
+    if (preg_match('!<title\b[^>]*>(.*?)</title>!is', $html, $matches)) {
+        return trim(str_replace(array("\r\n", "\r", "\n"), ' ', $matches[1]));
+    }
+    return false;
+}
+/**
+ * Work out which charset applies to a downloaded page.
+ *
+ * Sources are tried in this order, the first non-empty result wins:
+ *   1. HTTP headers (Content type), via headers_extract_charset().
+ *   2. HTML content (tag <meta charset>), via html_extract_charset().
+ *   3. The default charset given by the caller (default: UTF-8).
+ *
+ * @param array  $headers           HTTP headers array.
+ * @param string $htmlContent       HTML content where to look for charset.
+ * @param string $defaultCharset    Default charset to apply if other methods failed.
+ *
+ * @return string Determined charset.
+ */
+function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+{
+    $fromHeaders = headers_extract_charset($headers);
+    if ($fromHeaders) {
+        return $fromHeaders;
+    }
+    // The HTML is only inspected when the headers gave nothing usable.
+    $fromHtml = html_extract_charset($htmlContent);
+    return $fromHtml ? $fromHtml : $defaultCharset;
+}
+/**
+ * Extract charset from HTTP headers if it's defined.
+ *
+ * The Content-Type header name and the "charset" parameter name are both
+ * matched case-insensitively, and surrounding quotes around the value
+ * (charset="utf-8") are not part of the returned string.
+ *
+ * @param array $headers HTTP headers array (header name => value).
+ *
+ * @return bool|string Charset string if found (lowercase), false otherwise.
+ */
+function headers_extract_charset($headers)
+{
+    if (! is_array($headers)) {
+        return false;
+    }
+    // HTTP header names are case-insensitive: normalize before the lookup.
+    $headers = array_change_key_case($headers, CASE_LOWER);
+    if (empty($headers['content-type'])) {
+        return false;
+    }
+    $contentType = $headers['content-type'];
+    // Tolerate a repeated header delivered as a list of values: keep the last one.
+    if (is_array($contentType)) {
+        $contentType = end($contentType);
+    }
+    // Optional opening quote is skipped; the value stops at a quote, ';' or whitespace.
+    if (is_string($contentType)
+        && preg_match('/charset=["\']?([^"\';\s]+)/i', $contentType, $match)
+    ) {
+        return strtolower($match[1]);
+    }
+    return false;
+}
+/**
+ * Extract charset from HTML content (tag <meta charset>).
+ *
+ * Handles both <meta charset="..."> and the http-equiv form
+ * <meta http-equiv="Content-Type" content="text/html; charset=...">,
+ * with double quotes, single quotes or no quotes around the value.
+ * The search never leaves the <meta> tag it started in, so a "charset="
+ * found in a later, unrelated element is not reported.
+ *
+ * @param string $html HTML content where to look for charset.
+ *
+ * @return bool|string Charset string if found (lowercase), false otherwise.
+ */
+function html_extract_charset($html)
+{
+    // [^>]*? keeps the match inside a single <meta ...> tag; the value ends at
+    // the first quote, whitespace, ';', '/' or '>'.
+    $pattern = '#<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([^"\'\s;/>]+)#i';
+    if (preg_match($pattern, $html, $enc) && ! empty($enc[1])) {
+        return strtolower($enc[1]);
+    }
+    return false;
+}
author	ArthurHoaro <arthur@hoa.ro>	2016-01-04 10:45:54 +0100
committer	ArthurHoaro <arthur@hoa.ro>	2016-01-11 21:19:31 +0100
commit	1557cefbd76257ceb830f65806831b490faf0acc (patch)
tree	787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application/LinkUtils.php
parent	c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
download	Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip

diff --git a/application/LinkUtils.php b/application/LinkUtils.php
new file mode 100755
index 00000000..26dd6b67
--- /dev/null
+++ b/application/LinkUtils.php
@@ -0,0 +1,79 @@
	1	<?php
	2
	3	/**
	4	* Extract title from an HTML document.
	5	*
	6	* @param string $html HTML content where to look for a title.
	7	*
	8	* @return bool|string Extracted title if found, false otherwise.
	9	*/
	10	function html_extract_title($html)
	11	{
	12	if (preg_match('!<title>(.*)</title>!is', $html, $matches)) {
	13	return trim(str_replace("\n", ' ', $matches[1]));
	14	}
	15	return false;
	16	}
	17
	18	/**
	19	* Determine charset from downloaded page.
	20	* Priority:
	21	* 1. HTTP headers (Content type).
	22	* 2. HTML content page (tag <meta charset>).
	23	* 3. Use a default charset (default: UTF-8).
	24	*
	25	* @param array $headers HTTP headers array.
	26	* @param string $htmlContent HTML content where to look for charset.
	27	* @param string $defaultCharset Default charset to apply if other methods failed.
	28	*
	29	* @return string Determined charset.
	30	*/
	31	function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
	32	{
	33	if ($charset = headers_extract_charset($headers)) {
	34	return $charset;
	35	}
	36
	37	if ($charset = html_extract_charset($htmlContent)) {
	38	return $charset;
	39	}
	40
	41	return $defaultCharset;
	42	}
	43
	44	/**
	45	* Extract charset from HTTP headers if it's defined.
	46	*
	47	* @param array $headers HTTP headers array.
	48	*
	49	* @return bool|string Charset string if found (lowercase), false otherwise.
	50	*/
	51	function headers_extract_charset($headers)
	52	{
	53	if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
	54	preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
	55	if (! empty($match[1])) {
	56	return strtolower(trim($match[1]));
	57	}
	58	}
	59
	60	return false;
	61	}
	62
	63	/**
	64	* Extract charset HTML content (tag <meta charset>).
	65	*
	66	* @param string $html HTML content where to look for charset.
	67	*
	68	* @return bool|string Charset string if found, false otherwise.
	69	*/
	70	function html_extract_charset($html)
	71	{
	72	// Get encoding specified in HTML header.
	73	preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc);
	74	if (!empty($enc[1])) {
	75	return strtolower($enc[1]);
	76	}
	77
	78	return false;
	79	}