diff options
author | ArthurHoaro <arthur@hoa.ro> | 2016-01-04 10:45:54 +0100 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2016-01-11 21:19:31 +0100 |
commit | 1557cefbd76257ceb830f65806831b490faf0acc (patch) | |
tree | 787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application/LinkUtils.php | |
parent | c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff) | |
download | Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip |
Fixes #410 - Retrieve title fails in multiple cases
* `get_http_url()` renamed to `get_http_response()`.
* Use the same HTTP context to retrieve response headers and content.
* Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections).
* Add `LinkUtils` to extract titles and charset.
* Try to retrieve charset from HTTP headers first (new), then HTML content.
* Use the `mbstring` extension to re-encode the title if necessary.
Diffstat (limited to 'application/LinkUtils.php')
-rwxr-xr-x | application/LinkUtils.php | 79 |
1 files changed, 79 insertions, 0 deletions
diff --git a/application/LinkUtils.php b/application/LinkUtils.php new file mode 100755 index 00000000..26dd6b67 --- /dev/null +++ b/application/LinkUtils.php | |||
@@ -0,0 +1,79 @@ | |||
1 | <?php | ||
2 | |||
/**
 * Extract title from an HTML document.
 *
 * Only the first <title> element is considered. The opening tag may carry
 * attributes, and line feeds inside the title are replaced by spaces.
 *
 * @param string $html HTML content where to look for a title.
 *
 * @return bool|string Extracted title if found, false otherwise.
 */
function html_extract_title($html)
{
    if (! is_string($html) || $html === '') {
        return false;
    }

    // Lazy quantifier: stop at the first closing tag, so that later <title>
    // elements (e.g. inside inline SVG) are not swallowed into the result.
    if (preg_match('!<title(?:\s[^>]*)?>(.*?)</title>!is', $html, $matches)) {
        return trim(str_replace("\n", ' ', $matches[1]));
    }

    return false;
}
17 | |||
/**
 * Work out which charset applies to a downloaded page.
 *
 * Sources are tried in order, and the first usable value wins:
 *   1. the HTTP headers (via headers_extract_charset()),
 *   2. the HTML content (via html_extract_charset()),
 *   3. the provided default.
 *
 * @param array  $headers        HTTP headers array.
 * @param string $htmlContent    HTML content where to look for charset.
 * @param string $defaultCharset Fallback used when neither source yields a charset.
 *
 * @return string Determined charset.
 */
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
{
    $fromHeaders = headers_extract_charset($headers);
    if ($fromHeaders) {
        return $fromHeaders;
    }

    // The HTML is only inspected when the headers gave nothing usable.
    $fromHtml = html_extract_charset($htmlContent);

    return $fromHtml ? $fromHtml : $defaultCharset;
}
43 | |||
/**
 * Extract charset from HTTP headers if it's defined.
 *
 * The Content-Type header name and the "charset=" parameter are both matched
 * case-insensitively. The charset value may be bare or wrapped in single or
 * double quotes; surrounding quotes are not part of the returned value.
 *
 * @param array $headers HTTP headers array (header name => value).
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function headers_extract_charset($headers)
{
    if (! is_array($headers)) {
        return false;
    }

    // Prefer the canonical key, then fall back to a case-insensitive lookup
    // since HTTP header names are not case-sensitive.
    $contentType = null;
    if (isset($headers['Content-Type'])) {
        $contentType = $headers['Content-Type'];
    } else {
        foreach ($headers as $name => $value) {
            if (strcasecmp((string) $name, 'Content-Type') === 0) {
                $contentType = $value;
                break;
            }
        }
    }

    // A repeated header may be delivered as a list of values; use the last
    // one (presumably the final response after redirections - verify against
    // the caller that builds $headers).
    if (is_array($contentType)) {
        $contentType = end($contentType);
    }

    if (! is_string($contentType) || $contentType === '') {
        return false;
    }

    // Quotes and whitespace are excluded from the captured value so that
    // charset="utf-8" yields utf-8 rather than utf-8".
    if (preg_match('/charset=["\']?([^;"\'\s]+)/i', $contentType, $match) && ! empty($match[1])) {
        return strtolower($match[1]);
    }

    return false;
}
62 | |||
/**
 * Extract charset from HTML content (<meta> tag).
 *
 * Handles both <meta charset="..."> and the http-equiv form where the charset
 * is embedded in the content attribute. The value may be bare, single-quoted
 * or double-quoted, and other attributes may follow it inside the same tag.
 *
 * @param string $html HTML content where to look for charset.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function html_extract_charset($html)
{
    if (! is_string($html) || $html === '') {
        return false;
    }

    // [^>] keeps the match inside a single <meta> tag, so a charset attribute
    // on a later, unrelated tag (e.g. <script charset="...">) is not picked up.
    // Quotes are excluded from the captured value so they are never returned.
    preg_match('#<meta\s[^>]*?charset=["\']?([^"\';>/\s]+)["\']?[^>]*>#i', $html, $enc);
    if (! empty($enc[1])) {
        return strtolower($enc[1]);
    }

    return false;
}