diff options
author | ArthurHoaro <arthur@hoa.ro> | 2016-01-04 10:45:54 +0100 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2016-01-11 21:19:31 +0100 |
commit | 1557cefbd76257ceb830f65806831b490faf0acc (patch) | |
tree | 787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application/HttpUtils.php | |
parent | c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff) | |
download | Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip |
Fixes #410 - Retrieve title fails in multiple cases
* `get_http_url()` renamed to `get_http_response()`.
* Use the same HTTP context to retrieve response headers and content.
* Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections).
* Add `LinkUtils` to extract titles and charset.
* Try to retrieve charset from HTTP headers first (new), then HTML content.
* Use `mbstring` to re-encode the title if necessary.
Diffstat (limited to 'application/HttpUtils.php')
-rwxr-xr-x[-rw-r--r--] | application/HttpUtils.php | 49 |
1 files changed, 42 insertions, 7 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php index 499220c5..e2c1cb47 100644..100755 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php | |||
@@ -13,7 +13,7 @@ | |||
13 | * [1] = URL content (downloaded data) | 13 | * [1] = URL content (downloaded data) |
14 | * | 14 | * |
15 | * Example: | 15 | * Example: |
16 | * list($headers, $data) = get_http_url('http://sebauvage.net/'); | 16 | * list($headers, $data) = get_http_response('http://sebauvage.net/'); |
17 | * if (strpos($headers[0], '200 OK') !== false) { | 17 | * if (strpos($headers[0], '200 OK') !== false) { |
18 | * echo 'Data type: '.htmlspecialchars($headers['Content-Type']); | 18 | * echo 'Data type: '.htmlspecialchars($headers['Content-Type']); |
19 | * } else { | 19 | * } else { |
@@ -24,31 +24,66 @@ | |||
24 | * @see http://php.net/manual/en/function.stream-context-create.php | 24 | * @see http://php.net/manual/en/function.stream-context-create.php |
25 | * @see http://php.net/manual/en/function.get-headers.php | 25 | * @see http://php.net/manual/en/function.get-headers.php |
26 | */ | 26 | */ |
27 | function get_http_url($url, $timeout = 30, $maxBytes = 4194304) | 27 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) |
28 | { | 28 | { |
29 | $urlObj = new Url($url); | ||
30 | if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { | ||
31 | return array(array(0 => 'Invalid HTTP Url'), false); | ||
32 | } | ||
33 | |||
29 | $options = array( | 34 | $options = array( |
30 | 'http' => array( | 35 | 'http' => array( |
31 | 'method' => 'GET', | 36 | 'method' => 'GET', |
32 | 'timeout' => $timeout, | 37 | 'timeout' => $timeout, |
33 | 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' | 38 | 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' |
34 | .' Gecko/20100101 Firefox/23.0' | 39 | .' Gecko/20100101 Firefox/23.0', |
40 | 'request_fulluri' => true, | ||
35 | ) | 41 | ) |
36 | ); | 42 | ); |
37 | 43 | ||
38 | $context = stream_context_create($options); | 44 | $context = stream_context_create($options); |
45 | stream_context_set_default($options); | ||
46 | |||
47 | list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup()); | ||
48 | if (! $headers || strpos($headers[0], '200 OK') === false) { | ||
49 | return array($headers, false); | ||
50 | } | ||
39 | 51 | ||
40 | try { | 52 | try { |
41 | // TODO: catch Exception in calling code (thumbnailer) | 53 | // TODO: catch Exception in calling code (thumbnailer) |
42 | $content = file_get_contents($url, false, $context, -1, $maxBytes); | 54 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); |
43 | } catch (Exception $exc) { | 55 | } catch (Exception $exc) { |
44 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); | 56 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); |
45 | } | 57 | } |
46 | 58 | ||
47 | if (!$content) { | 59 | return array($headers, $content); |
48 | return array(array(0 => 'HTTP Error'), ''); | 60 | } |
61 | |||
62 | /** | ||
63 | * Retrieve HTTP headers, following n redirections (temporary and permanent). | ||
64 | * | ||
65 | * @param string $url initial URL to reach. | ||
66 | * @param int $redirectionLimit maximum number of redirections to follow. | |
67 | * | ||
68 | * @return array | ||
69 | */ | ||
70 | function get_redirected_headers($url, $redirectionLimit = 3) | ||
71 | { | ||
72 | $headers = get_headers($url, 1); | ||
73 | |||
74 | // Headers found, redirection found, and limit not reached. | ||
75 | if ($redirectionLimit-- > 0 | ||
76 | && !empty($headers) | ||
77 | && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false) | ||
78 | && !empty($headers['Location'])) { | ||
79 | |||
80 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; | ||
81 | if ($redirection != $url) { | ||
82 | return get_redirected_headers($redirection, $redirectionLimit); | ||
83 | } | ||
49 | } | 84 | } |
50 | 85 | ||
51 | return array(get_headers($url, 1), $content); | 86 | return array($headers, $url); |
52 | } | 87 | } |
53 | 88 | ||
54 | /** | 89 | /** |