diff options
author | ArthurHoaro <arthur@hoa.ro> | 2016-01-04 10:45:54 +0100 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2016-01-11 21:19:31 +0100 |
commit | 1557cefbd76257ceb830f65806831b490faf0acc (patch) | |
tree | 787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application | |
parent | c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff) | |
download | Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip |
Fixes #410 - Retrieve title fails in multiple cases
* `get_http_url()` renamed to `get_http_response()`.
* Use the same HTTP context to retrieve response headers and content.
* Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections).
* Add `LinkUtils` to extract titles and charset.
* Try to retrieve charset from HTTP headers first (new), then HTML content.
* Use mb_string to re-encode title if necessary.
Diffstat (limited to 'application')
-rw-r--r-- | application/ApplicationUtils.php | 2 | ||||
-rwxr-xr-x[-rw-r--r--] | application/HttpUtils.php | 49 | ||||
-rwxr-xr-x | application/LinkUtils.php | 79 | ||||
-rwxr-xr-x[-rw-r--r--] | application/Url.php | 11 |
4 files changed, 132 insertions, 9 deletions
diff --git a/application/ApplicationUtils.php b/application/ApplicationUtils.php index 274331e1..978fc9da 100644 --- a/application/ApplicationUtils.php +++ b/application/ApplicationUtils.php | |||
@@ -19,7 +19,7 @@ class ApplicationUtils | |||
19 | */ | 19 | */ |
20 | public static function getLatestGitVersionCode($url, $timeout=2) | 20 | public static function getLatestGitVersionCode($url, $timeout=2) |
21 | { | 21 | { |
22 | list($headers, $data) = get_http_url($url, $timeout); | 22 | list($headers, $data) = get_http_response($url, $timeout); |
23 | 23 | ||
24 | if (strpos($headers[0], '200 OK') === false) { | 24 | if (strpos($headers[0], '200 OK') === false) { |
25 | error_log('Failed to retrieve ' . $url); | 25 | error_log('Failed to retrieve ' . $url); |
diff --git a/application/HttpUtils.php b/application/HttpUtils.php index 499220c5..e2c1cb47 100644..100755 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php | |||
@@ -13,7 +13,7 @@ | |||
13 | * [1] = URL content (downloaded data) | 13 | * [1] = URL content (downloaded data) |
14 | * | 14 | * |
15 | * Example: | 15 | * Example: |
16 | * list($headers, $data) = get_http_url('http://sebauvage.net/'); | 16 | * list($headers, $data) = get_http_response('http://sebauvage.net/'); |
17 | * if (strpos($headers[0], '200 OK') !== false) { | 17 | * if (strpos($headers[0], '200 OK') !== false) { |
18 | * echo 'Data type: '.htmlspecialchars($headers['Content-Type']); | 18 | * echo 'Data type: '.htmlspecialchars($headers['Content-Type']); |
19 | * } else { | 19 | * } else { |
@@ -24,31 +24,66 @@ | |||
24 | * @see http://php.net/manual/en/function.stream-context-create.php | 24 | * @see http://php.net/manual/en/function.stream-context-create.php |
25 | * @see http://php.net/manual/en/function.get-headers.php | 25 | * @see http://php.net/manual/en/function.get-headers.php |
26 | */ | 26 | */ |
27 | function get_http_url($url, $timeout = 30, $maxBytes = 4194304) | 27 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) |
28 | { | 28 | { |
29 | $urlObj = new Url($url); | ||
30 | if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { | ||
31 | return array(array(0 => 'Invalid HTTP Url'), false); | ||
32 | } | ||
33 | |||
29 | $options = array( | 34 | $options = array( |
30 | 'http' => array( | 35 | 'http' => array( |
31 | 'method' => 'GET', | 36 | 'method' => 'GET', |
32 | 'timeout' => $timeout, | 37 | 'timeout' => $timeout, |
33 | 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' | 38 | 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' |
34 | .' Gecko/20100101 Firefox/23.0' | 39 | .' Gecko/20100101 Firefox/23.0', |
40 | 'request_fulluri' => true, | ||
35 | ) | 41 | ) |
36 | ); | 42 | ); |
37 | 43 | ||
38 | $context = stream_context_create($options); | 44 | $context = stream_context_create($options); |
45 | stream_context_set_default($options); | ||
46 | |||
47 | list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup()); | ||
48 | if (! $headers || strpos($headers[0], '200 OK') === false) { | ||
49 | return array($headers, false); | ||
50 | } | ||
39 | 51 | ||
40 | try { | 52 | try { |
41 | // TODO: catch Exception in calling code (thumbnailer) | 53 | // TODO: catch Exception in calling code (thumbnailer) |
42 | $content = file_get_contents($url, false, $context, -1, $maxBytes); | 54 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); |
43 | } catch (Exception $exc) { | 55 | } catch (Exception $exc) { |
44 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); | 56 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); |
45 | } | 57 | } |
46 | 58 | ||
47 | if (!$content) { | 59 | return array($headers, $content); |
48 | return array(array(0 => 'HTTP Error'), ''); | 60 | } |
61 | |||
62 | /** | ||
63 | * Retrieve HTTP headers, following n redirections (temporary and permanent). | ||
64 | * | ||
65 | * @param string $url initial URL to reach. | ||
66 | * @param int $redirectionLimit max redirection follow.. | ||
67 | * | ||
68 | * @return array | ||
69 | */ | ||
70 | function get_redirected_headers($url, $redirectionLimit = 3) | ||
71 | { | ||
72 | $headers = get_headers($url, 1); | ||
73 | |||
74 | // Headers found, redirection found, and limit not reached. | ||
75 | if ($redirectionLimit-- > 0 | ||
76 | && !empty($headers) | ||
77 | && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false) | ||
78 | && !empty($headers['Location'])) { | ||
79 | |||
80 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; | ||
81 | if ($redirection != $url) { | ||
82 | return get_redirected_headers($redirection, $redirectionLimit); | ||
83 | } | ||
49 | } | 84 | } |
50 | 85 | ||
51 | return array(get_headers($url, 1), $content); | 86 | return array($headers, $url); |
52 | } | 87 | } |
53 | 88 | ||
54 | /** | 89 | /** |
diff --git a/application/LinkUtils.php b/application/LinkUtils.php new file mode 100755 index 00000000..26dd6b67 --- /dev/null +++ b/application/LinkUtils.php | |||
@@ -0,0 +1,79 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * Extract title from an HTML document. | ||
5 | * | ||
6 | * @param string $html HTML content where to look for a title. | ||
7 | * | ||
8 | * @return bool|string Extracted title if found, false otherwise. | ||
9 | */ | ||
10 | function html_extract_title($html) | ||
11 | { | ||
12 | if (preg_match('!<title>(.*)</title>!is', $html, $matches)) { | ||
13 | return trim(str_replace("\n", ' ', $matches[1])); | ||
14 | } | ||
15 | return false; | ||
16 | } | ||
17 | |||
18 | /** | ||
19 | * Determine charset from downloaded page. | ||
20 | * Priority: | ||
21 | * 1. HTTP headers (Content type). | ||
22 | * 2. HTML content page (tag <meta charset>). | ||
23 | * 3. Use a default charset (default: UTF-8). | ||
24 | * | ||
25 | * @param array $headers HTTP headers array. | ||
26 | * @param string $htmlContent HTML content where to look for charset. | ||
27 | * @param string $defaultCharset Default charset to apply if other methods failed. | ||
28 | * | ||
29 | * @return string Determined charset. | ||
30 | */ | ||
31 | function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') | ||
32 | { | ||
33 | if ($charset = headers_extract_charset($headers)) { | ||
34 | return $charset; | ||
35 | } | ||
36 | |||
37 | if ($charset = html_extract_charset($htmlContent)) { | ||
38 | return $charset; | ||
39 | } | ||
40 | |||
41 | return $defaultCharset; | ||
42 | } | ||
43 | |||
44 | /** | ||
45 | * Extract charset from HTTP headers if it's defined. | ||
46 | * | ||
47 | * @param array $headers HTTP headers array. | ||
48 | * | ||
49 | * @return bool|string Charset string if found (lowercase), false otherwise. | ||
50 | */ | ||
51 | function headers_extract_charset($headers) | ||
52 | { | ||
53 | if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { | ||
54 | preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); | ||
55 | if (! empty($match[1])) { | ||
56 | return strtolower(trim($match[1])); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | return false; | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * Extract charset HTML content (tag <meta charset>). | ||
65 | * | ||
66 | * @param string $html HTML content where to look for charset. | ||
67 | * | ||
68 | * @return bool|string Charset string if found, false otherwise. | ||
69 | */ | ||
70 | function html_extract_charset($html) | ||
71 | { | ||
72 | // Get encoding specified in HTML header. | ||
73 | preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc); | ||
74 | if (!empty($enc[1])) { | ||
75 | return strtolower($enc[1]); | ||
76 | } | ||
77 | |||
78 | return false; | ||
79 | } | ||
diff --git a/application/Url.php b/application/Url.php index d80c9c58..a4ac2e73 100644..100755 --- a/application/Url.php +++ b/application/Url.php | |||
@@ -118,7 +118,7 @@ class Url | |||
118 | */ | 118 | */ |
119 | public function __construct($url) | 119 | public function __construct($url) |
120 | { | 120 | { |
121 | $this->parts = parse_url($url); | 121 | $this->parts = parse_url(trim($url)); |
122 | 122 | ||
123 | if (!empty($url) && empty($this->parts['scheme'])) { | 123 | if (!empty($url) && empty($this->parts['scheme'])) { |
124 | $this->parts['scheme'] = 'http'; | 124 | $this->parts['scheme'] = 'http'; |
@@ -201,4 +201,13 @@ class Url | |||
201 | } | 201 | } |
202 | return $this->parts['scheme']; | 202 | return $this->parts['scheme']; |
203 | } | 203 | } |
204 | |||
205 | /** | ||
206 | * Test if the Url is an HTTP one. | ||
207 | * | ||
208 | * @return true is HTTP, false otherwise. | ||
209 | */ | ||
210 | public function isHttp() { | ||
211 | return strpos(strtolower($this->parts['scheme']), 'http') !== false; | ||
212 | } | ||
204 | } | 213 | } |