aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorArthur <arthur@hoa.ro>2016-08-09 13:15:19 +0200
committerGitHub <noreply@github.com>2016-08-09 13:15:19 +0200
commitd0d3623172f47006607cf16dfb68f1ac394dd2cc (patch)
tree0009647c88e9ea9d6ef50db0d3008bed977a8ce7
parent6b98d4617966e66a10552ac856cc50b12d4a21e1 (diff)
parent634783f916b614fa93e701da172e3ca57d6d1860 (diff)
downloadShaarli-d0d3623172f47006607cf16dfb68f1ac394dd2cc.tar.gz
Shaarli-d0d3623172f47006607cf16dfb68f1ac394dd2cc.tar.zst
Shaarli-d0d3623172f47006607cf16dfb68f1ac394dd2cc.zip
Merge pull request #624 from julienCXX/pr-curl-http-fetch
Added (and set as default) a cURL-based method for fetching HTTP content
-rw-r--r--application/HttpUtils.php160
1 files changed, 150 insertions, 10 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index 2e0792f9..27a39d3d 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -1,6 +1,7 @@
1<?php 1<?php
2/** 2/**
3 * GET an HTTP URL to retrieve its content 3 * GET an HTTP URL to retrieve its content
4 * Uses the cURL library or a fallback method
4 * 5 *
5 * @param string $url URL to get (http://...) 6 * @param string $url URL to get (http://...)
6 * @param int $timeout network timeout (in seconds) 7 * @param int $timeout network timeout (in seconds)
@@ -20,38 +21,177 @@
20 * echo 'There was an error: '.htmlspecialchars($headers[0]); 21 * echo 'There was an error: '.htmlspecialchars($headers[0]);
21 * } 22 * }
22 * 23 *
23 * @see http://php.net/manual/en/function.file-get-contents.php 24 * @see https://secure.php.net/manual/en/ref.curl.php
24 * @see http://php.net/manual/en/function.stream-context-create.php 25 * @see https://secure.php.net/manual/en/functions.anonymous.php
25 * @see http://php.net/manual/en/function.get-headers.php 26 * @see https://secure.php.net/manual/en/function.preg-split.php
27 * @see https://secure.php.net/manual/en/function.explode.php
28 * @see http://stackoverflow.com/q/17641073
29 * @see http://stackoverflow.com/q/9183178
30 * @see http://stackoverflow.com/q/1462720
26 */ 31 */
function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
{
    /*
     * Overview:
     *  1. validate the URL (IDN converted to ASCII, must be HTTP);
     *  2. delegate to get_http_response_fallback() when the cURL extension
     *     is not loaded;
     *  3. otherwise fetch with cURL and reshape the raw response into
     *     array($headers, $content), keeping only the header block of the
     *     last response when redirections were followed.
     *
     * On failure the second element of the returned array is false.
     */
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();

    if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
        return array(array(0 => 'Invalid HTTP Url'), false);
    }

    $userAgent =
        'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
        . ' Gecko/20100101 Firefox/45.0';

    // Build the Accept-Language header from the current collation locale.
    // Passing '0' only queries the locale, it does not change it.
    // The two-letter prefix is used only when it looks like a language code:
    // locales such as "C", "C.UTF-8" or "POSIX" (or a failed setlocale()
    // call) would otherwise produce an invalid language tag.
    $acceptLanguage = 'en-US;q=0.7,en;q=0.3';
    $localePrefix = (string) substr((string) setlocale(LC_COLLATE, '0'), 0, 2);
    if (preg_match('/^[a-z]{2}$/i', $localePrefix)) {
        $acceptLanguage = $localePrefix . ',' . $acceptLanguage;
    }

    $maxRedirs = 3;

    if (!function_exists('curl_init')) {
        return get_http_response_fallback(
            $cleanUrl,
            $timeout,
            $maxBytes,
            $userAgent,
            $acceptLanguage,
            $maxRedirs
        );
    }

    $ch = curl_init($cleanUrl);
    if ($ch === false) {
        return array(array(0 => 'curl_init() error'), false);
    }

    // General cURL settings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt(
        $ch,
        CURLOPT_HTTPHEADER,
        array('Accept-Language: ' . $acceptLanguage)
    );
    curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

    // This function only deals with HTTP(S): do not let a redirection
    // switch to another scheme. Guarded because the constants are missing
    // from old PHP/cURL builds.
    // NOTE(review): assumes Url::isHttp() accepts http and https only.
    if (defined('CURLOPT_REDIR_PROTOCOLS') && defined('CURLPROTO_HTTPS')) {
        $allowedProtocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;
        curl_setopt($ch, CURLOPT_PROTOCOLS, $allowedProtocols);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, $allowedProtocols);
    }

    // Max download size management
    // Before PHP 5.5 the progress callback receives 4 arguments (no cURL
    // handle first), which shifts the position of the "downloaded" counter.
    // The version never changes during a request: check it once here rather
    // than on every callback invocation.
    $legacyCallback = version_compare(PHP_VERSION, '5.5', '<');

    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);
    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
    curl_setopt($ch, CURLOPT_PROGRESSFUNCTION,
        function($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes, $legacyCallback)
        {
            $downloaded = $legacyCallback ? $arg1 : $arg2;
            // Non-zero return stops downloading; curl_exec() then returns
            // false and the error branch below is taken.
            return ($downloaded > $maxBytes) ? 1 : 0;
        }
    );

    // Collect everything needed from the handle before closing it
    $response = curl_exec($ch);
    $errorNo = curl_errno($ch);
    $errorStr = curl_error($ch);
    $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    curl_close($ch);

    if ($response === false) {
        if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
            /*
             * Workaround to match fallback method behaviour
             * Removing this would require updating
             * GetHttpUrlTest::testGetInvalidRemoteUrl()
             */
            return array(false, false);
        }
        return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
    }

    // Formatting output like the fallback method:
    // CURLOPT_HEADER puts the header block(s) in front of the body,
    // $headSize is their cumulated length.
    $rawHeaders = substr($response, 0, $headSize);
    $content = substr($response, $headSize);

    // Keep only headers from latest redirection
    // (one block per response, separated by an empty line)
    $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
    $rawHeadersLastRedir = end($rawHeadersArrayRedirs);

    $headers = array();
    foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
        // Strict comparison: empty() would also discard a line made of "0"
        if ($line === '' || ctype_space($line)) {
            continue;
        }

        // Split on the first colon and drop leading whitespace from the
        // value: "Name:value" (no space) is a valid header line too.
        $colonPos = strpos($line, ':');
        if ($colonPos === false) {
            // No field name (e.g. the status line): numeric index
            $headers[] = $line;
            continue;
        }

        $key = (string) substr($line, 0, $colonPos);
        $value = ltrim((string) substr($line, $colonPos + 1));

        if (!array_key_exists($key, $headers)) {
            $headers[$key] = $value;
            continue;
        }

        // Repeated header: collect all values in a list
        if (!is_array($headers[$key])) {
            $headers[$key] = array(0 => $headers[$key]);
        }
        $headers[$key][] = $value;
    }

    return array($headers, $content);
}
147
148/**
149 * GET an HTTP URL to retrieve its content (fallback method)
150 *
151 * @param string $cleanUrl URL to get (http://... valid and in ASCII form)
152 * @param int $timeout network timeout (in seconds)
153 * @param int $maxBytes maximum downloaded bytes
154 * @param string $userAgent "User-Agent" header
155 * @param string $acceptLanguage "Accept-Language" header
156 * @param int $maxRedr maximum amount of redirections followed
157 *
158 * @return array HTTP response headers, downloaded content
159 *
160 * Output format:
161 * [0] = associative array containing HTTP response headers
162 * [1] = URL content (downloaded data)
163 *
164 * @see http://php.net/manual/en/function.file-get-contents.php
165 * @see http://php.net/manual/en/function.stream-context-create.php
166 * @see http://php.net/manual/en/function.get-headers.php
167 */
168function get_http_response_fallback(
169 $cleanUrl,
170 $timeout,
171 $maxBytes,
172 $userAgent,
173 $acceptLanguage,
174 $maxRedr
175) {
36 $options = array( 176 $options = array(
37 'http' => array( 177 'http' => array(
38 'method' => 'GET', 178 'method' => 'GET',
39 'timeout' => $timeout, 179 'timeout' => $timeout,
40 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)' 180 'user_agent' => $userAgent,
41 .' Gecko/20100101 Firefox/45.0', 181 'header' => "Accept: */*\r\n"
42 'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3', 182 . 'Accept-Language: ' . $acceptLanguage
43 ) 183 )
44 ); 184 );
45 185
46 stream_context_set_default($options); 186 stream_context_set_default($options);
47 list($headers, $finalUrl) = get_redirected_headers($cleanUrl); 187 list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
48 if (! $headers || strpos($headers[0], '200 OK') === false) { 188 if (! $headers || strpos($headers[0], '200 OK') === false) {
49 $options['http']['request_fulluri'] = true; 189 $options['http']['request_fulluri'] = true;
50 stream_context_set_default($options); 190 stream_context_set_default($options);
51 list($headers, $finalUrl) = get_redirected_headers($cleanUrl); 191 list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
52 } 192 }
53 193
54 if (! $headers || strpos($headers[0], '200 OK') === false) { 194 if (! $headers) {
55 return array($headers, false); 195 return array($headers, false);
56 } 196 }
57 197