aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/HttpUtils.php
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2016-01-04 10:45:54 +0100
committerArthurHoaro <arthur@hoa.ro>2016-01-11 21:19:31 +0100
commit1557cefbd76257ceb830f65806831b490faf0acc (patch)
tree787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application/HttpUtils.php
parentc0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
downloadShaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip
Fixes #410 - Retrieve title fails in multiple cases
* `get_http_url()` renamed to `get_http_response()`.
* Use the same HTTP context to retrieve response headers and content.
* Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections).
* Add `LinkUtils` to extract titles and charset.
* Try to retrieve charset from HTTP headers first (new), then HTML content.
* Use mb_string to re-encode title if necessary.
Diffstat (limited to 'application/HttpUtils.php')
-rwxr-xr-x[-rw-r--r--]application/HttpUtils.php49
1 files changed, 42 insertions, 7 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index 499220c5..e2c1cb47 100644..100755
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -13,7 +13,7 @@
13 * [1] = URL content (downloaded data) 13 * [1] = URL content (downloaded data)
14 * 14 *
15 * Example: 15 * Example:
16 * list($headers, $data) = get_http_url('http://sebauvage.net/'); 16 * list($headers, $data) = get_http_response('http://sebauvage.net/');
17 * if (strpos($headers[0], '200 OK') !== false) { 17 * if (strpos($headers[0], '200 OK') !== false) {
18 * echo 'Data type: '.htmlspecialchars($headers['Content-Type']); 18 * echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
19 * } else { 19 * } else {
@@ -24,31 +24,66 @@
24 * @see http://php.net/manual/en/function.stream-context-create.php 24 * @see http://php.net/manual/en/function.stream-context-create.php
25 * @see http://php.net/manual/en/function.get-headers.php 25 * @see http://php.net/manual/en/function.get-headers.php
26 */ 26 */
27function get_http_url($url, $timeout = 30, $maxBytes = 4194304) 27function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
28{ 28{
29 $urlObj = new Url($url);
30 if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) {
31 return array(array(0 => 'Invalid HTTP Url'), false);
32 }
33
29 $options = array( 34 $options = array(
30 'http' => array( 35 'http' => array(
31 'method' => 'GET', 36 'method' => 'GET',
32 'timeout' => $timeout, 37 'timeout' => $timeout,
33 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' 38 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)'
34 .' Gecko/20100101 Firefox/23.0' 39 .' Gecko/20100101 Firefox/23.0',
40 'request_fulluri' => true,
35 ) 41 )
36 ); 42 );
37 43
38 $context = stream_context_create($options); 44 $context = stream_context_create($options);
45 stream_context_set_default($options);
46
47 list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup());
48 if (! $headers || strpos($headers[0], '200 OK') === false) {
49 return array($headers, false);
50 }
39 51
40 try { 52 try {
41 // TODO: catch Exception in calling code (thumbnailer) 53 // TODO: catch Exception in calling code (thumbnailer)
42 $content = file_get_contents($url, false, $context, -1, $maxBytes); 54 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
43 } catch (Exception $exc) { 55 } catch (Exception $exc) {
44 return array(array(0 => 'HTTP Error'), $exc->getMessage()); 56 return array(array(0 => 'HTTP Error'), $exc->getMessage());
45 } 57 }
46 58
47 if (!$content) { 59 return array($headers, $content);
48 return array(array(0 => 'HTTP Error'), ''); 60}
61
62/**
63 * Retrieve HTTP headers, following n redirections (temporary and permanent).
64 *
65 * @param string $url initial URL to reach.
66 * @param int $redirectionLimit maximum number of redirections to follow.
67 *
68 * @return array
69 */
70function get_redirected_headers($url, $redirectionLimit = 3)
71{
72 $headers = get_headers($url, 1);
73
74 // Headers found, redirection found, and limit not reached.
75 if ($redirectionLimit-- > 0
76 && !empty($headers)
77 && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
78 && !empty($headers['Location'])) {
79
80 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
81 if ($redirection != $url) {
82 return get_redirected_headers($redirection, $redirectionLimit);
83 }
49 } 84 }
50 85
51 return array(get_headers($url, 1), $content); 86 return array($headers, $url);
52} 87}
53 88
54/** 89/**