Merge pull request #432 from ArthurHoaro/title-retrieve

Fixes #410 - Retrieve title fails in multiple cases
author: VirtualTam <virtualtam@flibidi.net> 2016-01-11 21:47:00 +0100
committer: VirtualTam <virtualtam@flibidi.net> 2016-01-11 21:47:00 +0100
commit: 92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44 (patch)
tree: 787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application/LinkUtils.php
parent: c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
parent: 1557cefbd76257ceb830f65806831b490faf0acc (diff)
download: Shaarli-92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44.tar.gz
Shaarli-92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44.tar.zst
Shaarli-92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44.zip
1 files changed, 79 insertions, 0 deletions
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
new file mode 100755
index 00000000..26dd6b67
--- /dev/null
+++ b/application/LinkUtils.php
@@ -0,0 +1,79 @@
+<?php
+/**
+ * Extract the title from an HTML document.
+ *
+ * Only the first <title> element is used. The opening tag may carry
+ * attributes, and the match stops at the first closing tag so that later
+ * <title> elements (e.g. inside inline SVG) are not swallowed into the result.
+ * Line breaks inside the title (LF, CR or CRLF) are each replaced by a single
+ * space, then the result is trimmed.
+ *
+ * @param string $html HTML content where to look for a title.
+ *
+ * @return bool|string Extracted title if found (may be an empty string for an
+ *                     empty <title></title>), false otherwise.
+ */
+function html_extract_title($html)
+{
+    // \b prevents matching longer tag names starting with "title";
+    // the lazy (.*?) stops at the first </title> instead of the last one.
+    if (preg_match('!<title\b[^>]*>(.*?)</title>!is', $html, $matches)) {
+        // "\r\n" is listed first so a CRLF pair becomes one space, not two.
+        return trim(str_replace(array("\r\n", "\r", "\n"), ' ', $matches[1]));
+    }
+    return false;
+}
+/**
+ * Work out which charset a downloaded page uses.
+ *
+ * Sources are tried in this order, and the first one yielding a truthy value wins:
+ *   1. the HTTP headers, via headers_extract_charset();
+ *   2. the HTML content, via html_extract_charset();
+ *   3. the supplied default charset.
+ *
+ * The HTML content is only inspected when the headers gave no charset.
+ *
+ * @param array  $headers           HTTP headers array.
+ * @param string $htmlContent       HTML content where to look for charset.
+ * @param string $defaultCharset    Charset returned when neither source provides one (default: utf-8).
+ *
+ * @return string Determined charset.
+ */
+function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+{
+    $fromHeaders = headers_extract_charset($headers);
+    if ($fromHeaders) {
+        return $fromHeaders;
+    }
+    $fromHtml = html_extract_charset($htmlContent);
+    return $fromHtml ? $fromHtml : $defaultCharset;
+}
+/**
+ * Extract charset from HTTP headers if it's defined.
+ *
+ * The Content-Type header is looked up case-insensitively, and the "charset"
+ * parameter is matched case-insensitively as well. Surrounding quotes around
+ * the value (charset="utf-8") are not part of the returned charset.
+ *
+ * @param array $headers HTTP headers array (header name => value). A value may
+ *                       itself be a list of strings when the header was repeated.
+ *
+ * @return bool|string Charset string if found (lowercase), false otherwise.
+ */
+function headers_extract_charset($headers)
+{
+    if (! is_array($headers)) {
+        return false;
+    }
+    // HTTP header names are case-insensitive: do not rely on the exact key casing.
+    $contentType = null;
+    foreach ($headers as $name => $value) {
+        if (strcasecmp((string) $name, 'Content-Type') === 0) {
+            $contentType = $value;
+            break;
+        }
+    }
+    // Some header sources (e.g. get_headers() after redirects) give repeated
+    // headers as a list; presumably the last entry is the final response's
+    // value -- verify against the caller.
+    if (is_array($contentType)) {
+        $contentType = end($contentType);
+    }
+    if (! is_string($contentType) || $contentType === '') {
+        return false;
+    }
+    // Quotes and whitespace are excluded from the capture so a quoted value
+    // does not keep its closing quote.
+    if (preg_match('/charset=["\']?([^;"\'\s]+)/i', $contentType, $match)) {
+        return strtolower($match[1]);
+    }
+    return false;
+}
+/**
+ * Extract charset from HTML content (<meta> tag).
+ *
+ * Handles both <meta charset="..."> and
+ * <meta http-equiv="Content-Type" content="...; charset=...">.
+ * The value may be unquoted, double-quoted or single-quoted, and the tag may
+ * carry further attributes after it. The search never runs past the end of
+ * the <meta> tag it started in.
+ *
+ * @param string $html HTML content where to look for charset.
+ *
+ * @return bool|string Charset string if found (lowercase), false otherwise.
+ */
+function html_extract_charset($html)
+{
+    // [^>]*? keeps the match inside a single <meta ...> tag, so a "charset="
+    // elsewhere in the page (e.g. in a link URL) is not picked up.
+    // Quotes, whitespace, '>', '/' and ';' end the captured value.
+    if (preg_match('#<meta\s[^>]*?charset\s*=\s*["\']?([^\s"\'>/;]+)#i', $html, $enc)) {
+        return strtolower($enc[1]);
+    }
+    return false;
+}
author	VirtualTam <virtualtam@flibidi.net>	2016-01-11 21:47:00 +0100
committer	VirtualTam <virtualtam@flibidi.net>	2016-01-11 21:47:00 +0100
commit	92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44 (patch)
tree	787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application/LinkUtils.php
parent	c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
parent	1557cefbd76257ceb830f65806831b490faf0acc (diff)
download	Shaarli-92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44.tar.gz Shaarli-92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44.tar.zst Shaarli-92ba7b573f2833bd35c7eb2fc7fdbeb1a0ac7b44.zip

diff --git a/application/LinkUtils.php b/application/LinkUtils.php new file mode 100755 index 00000000..26dd6b67 --- /dev/null +++ b/application/LinkUtils.php
@@ -0,0 +1,79 @@
	1	<?php
	2
	3	/**
	4	* Extract title from an HTML document.
	5	*
	6	* @param string $html HTML content where to look for a title.
	7	*
	8	* @return bool|string Extracted title if found, false otherwise.
	9	*/
	10	function html_extract_title($html)
	11	{
	12	if (preg_match('!<title>(.*)</title>!is', $html, $matches)) {
	13	return trim(str_replace("\n", ' ', $matches[1]));
	14	}
	15	return false;
	16	}
	17
	18	/**
	19	* Determine charset from downloaded page.
	20	* Priority:
	21	* 1. HTTP headers (Content type).
	22	* 2. HTML content page (tag <meta charset>).
	23	* 3. Use a default charset (default: UTF-8).
	24	*
	25	* @param array $headers HTTP headers array.
	26	* @param string $htmlContent HTML content where to look for charset.
	27	* @param string $defaultCharset Default charset to apply if other methods failed.
	28	*
	29	* @return string Determined charset.
	30	*/
	31	function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
	32	{
	33	if ($charset = headers_extract_charset($headers)) {
	34	return $charset;
	35	}
	36
	37	if ($charset = html_extract_charset($htmlContent)) {
	38	return $charset;
	39	}
	40
	41	return $defaultCharset;
	42	}
	43
	44	/**
	45	* Extract charset from HTTP headers if it's defined.
	46	*
	47	* @param array $headers HTTP headers array.
	48	*
	49	* @return bool|string Charset string if found (lowercase), false otherwise.
	50	*/
	51	function headers_extract_charset($headers)
	52	{
	53	if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
	54	preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
	55	if (! empty($match[1])) {
	56	return strtolower(trim($match[1]));
	57	}
	58	}
	59
	60	return false;
	61	}
	62
	63	/**
	64	* Extract charset HTML content (tag <meta charset>).
	65	*
	66	* @param string $html HTML content where to look for charset.
	67	*
	68	* @return bool|string Charset string if found, false otherwise.
	69	*/
	70	function html_extract_charset($html)
	71	{
	72	// Get encoding specified in HTML header.
	73	preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc);
	74	if (!empty($enc[1])) {
	75	return strtolower($enc[1]);
	76	}
	77
	78	return false;
	79	}