diff options
author | Arthur <arthur@hoa.ro> | 2016-08-09 13:15:19 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-08-09 13:15:19 +0200 |
commit | d0d3623172f47006607cf16dfb68f1ac394dd2cc (patch) | |
tree | 0009647c88e9ea9d6ef50db0d3008bed977a8ce7 /application | |
parent | 6b98d4617966e66a10552ac856cc50b12d4a21e1 (diff) | |
parent | 634783f916b614fa93e701da172e3ca57d6d1860 (diff) | |
download | Shaarli-d0d3623172f47006607cf16dfb68f1ac394dd2cc.tar.gz Shaarli-d0d3623172f47006607cf16dfb68f1ac394dd2cc.tar.zst Shaarli-d0d3623172f47006607cf16dfb68f1ac394dd2cc.zip |
Merge pull request #624 from julienCXX/pr-curl-http-fetch
Added (and set as default) a cURL-based method for fetching HTTP content
Diffstat (limited to 'application')
-rw-r--r-- | application/HttpUtils.php | 160 |
1 file changed, 150 insertions, 10 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index 2e0792f9..27a39d3d 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -1,6 +1,7 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * GET an HTTP URL to retrieve its content | 3 | * GET an HTTP URL to retrieve its content |
4 | * Uses the cURL library or a fallback method | ||
4 | * | 5 | * |
5 | * @param string $url URL to get (http://...) | 6 | * @param string $url URL to get (http://...) |
6 | * @param int $timeout network timeout (in seconds) | 7 | * @param int $timeout network timeout (in seconds) |
@@ -20,38 +21,177 @@ | |||
20 | * echo 'There was an error: '.htmlspecialchars($headers[0]); | 21 | * echo 'There was an error: '.htmlspecialchars($headers[0]); |
21 | * } | 22 | * } |
22 | * | 23 | * |
23 | * @see http://php.net/manual/en/function.file-get-contents.php | 24 | * @see https://secure.php.net/manual/en/ref.curl.php |
24 | * @see http://php.net/manual/en/function.stream-context-create.php | 25 | * @see https://secure.php.net/manual/en/functions.anonymous.php |
25 | * @see http://php.net/manual/en/function.get-headers.php | 26 | * @see https://secure.php.net/manual/en/function.preg-split.php |
27 | * @see https://secure.php.net/manual/en/function.explode.php | ||
28 | * @see http://stackoverflow.com/q/17641073 | ||
29 | * @see http://stackoverflow.com/q/9183178 | ||
30 | * @see http://stackoverflow.com/q/1462720 | ||
26 | */ | 31 | */ |
27 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | 32 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) |
28 | { | 33 | { |
29 | $urlObj = new Url($url); | 34 | $urlObj = new Url($url); |
30 | $cleanUrl = $urlObj->idnToAscii(); | 35 | $cleanUrl = $urlObj->idnToAscii(); |
31 | 36 | ||
32 | if (! filter_var($cleanUrl, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { | 37 | if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) { |
33 | return array(array(0 => 'Invalid HTTP Url'), false); | 38 | return array(array(0 => 'Invalid HTTP Url'), false); |
34 | } | 39 | } |
35 | 40 | ||
41 | $userAgent = | ||
42 | 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)' | ||
43 | . ' Gecko/20100101 Firefox/45.0'; | ||
44 | $acceptLanguage = | ||
45 | substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3'; | ||
46 | $maxRedirs = 3; | ||
47 | |||
48 | if (!function_exists('curl_init')) { | ||
49 | return get_http_response_fallback( | ||
50 | $cleanUrl, | ||
51 | $timeout, | ||
52 | $maxBytes, | ||
53 | $userAgent, | ||
54 | $acceptLanguage, | ||
55 | $maxRedirs | ||
56 | ); | ||
57 | } | ||
58 | |||
59 | $ch = curl_init($cleanUrl); | ||
60 | if ($ch === false) { | ||
61 | return array(array(0 => 'curl_init() error'), false); | ||
62 | } | ||
63 | |||
64 | // General cURL settings | ||
65 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); | ||
66 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); | ||
67 | curl_setopt($ch, CURLOPT_HEADER, true); | ||
68 | curl_setopt( | ||
69 | $ch, | ||
70 | CURLOPT_HTTPHEADER, | ||
71 | array('Accept-Language: ' . $acceptLanguage) | ||
72 | ); | ||
73 | curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs); | ||
74 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); | ||
75 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); | ||
76 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); | ||
77 | |||
78 | // Max download size management | ||
79 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024); | ||
80 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); | ||
81 | curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, | ||
82 | function($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) | ||
83 | { | ||
84 | if (version_compare(phpversion(), '5.5', '<')) { | ||
85 | // PHP version lower than 5.5 | ||
86 | // Callback has 4 arguments | ||
87 | $downloaded = $arg1; | ||
88 | } else { | ||
89 | // Callback has 5 arguments | ||
90 | $downloaded = $arg2; | ||
91 | } | ||
92 | // Non-zero return stops downloading | ||
93 | return ($downloaded > $maxBytes) ? 1 : 0; | ||
94 | } | ||
95 | ); | ||
96 | |||
97 | $response = curl_exec($ch); | ||
98 | $errorNo = curl_errno($ch); | ||
99 | $errorStr = curl_error($ch); | ||
100 | $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE); | ||
101 | curl_close($ch); | ||
102 | |||
103 | if ($response === false) { | ||
104 | if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) { | ||
105 | /* | ||
106 | * Workaround to match fallback method behaviour | ||
107 | * Removing this would require updating | ||
108 | * GetHttpUrlTest::testGetInvalidRemoteUrl() | ||
109 | */ | ||
110 | return array(false, false); | ||
111 | } | ||
112 | return array(array(0 => 'curl_exec() error: ' . $errorStr), false); | ||
113 | } | ||
114 | |||
115 | // Formatting output like the fallback method | ||
116 | $rawHeaders = substr($response, 0, $headSize); | ||
117 | |||
118 | // Keep only headers from latest redirection | ||
119 | $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders)); | ||
120 | $rawHeadersLastRedir = end($rawHeadersArrayRedirs); | ||
121 | |||
122 | $content = substr($response, $headSize); | ||
123 | $headers = array(); | ||
124 | foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) { | ||
125 | if (empty($line) or ctype_space($line)) { | ||
126 | continue; | ||
127 | } | ||
128 | $splitLine = explode(': ', $line, 2); | ||
129 | if (count($splitLine) > 1) { | ||
130 | $key = $splitLine[0]; | ||
131 | $value = $splitLine[1]; | ||
132 | if (array_key_exists($key, $headers)) { | ||
133 | if (!is_array($headers[$key])) { | ||
134 | $headers[$key] = array(0 => $headers[$key]); | ||
135 | } | ||
136 | $headers[$key][] = $value; | ||
137 | } else { | ||
138 | $headers[$key] = $value; | ||
139 | } | ||
140 | } else { | ||
141 | $headers[] = $splitLine[0]; | ||
142 | } | ||
143 | } | ||
144 | |||
145 | return array($headers, $content); | ||
146 | } | ||
147 | |||
148 | /** | ||
149 | * GET an HTTP URL to retrieve its content (fallback method) | ||
150 | * | ||
151 | * @param string $cleanUrl URL to get (http://... valid and in ASCII form) | ||
152 | * @param int $timeout network timeout (in seconds) | ||
153 | * @param int $maxBytes maximum downloaded bytes | ||
154 | * @param string $userAgent "User-Agent" header | ||
155 | * @param string $acceptLanguage "Accept-Language" header | ||
156 | * @param int $maxRedr maximum amount of redirections followed | ||
157 | * | ||
158 | * @return array HTTP response headers, downloaded content | ||
159 | * | ||
160 | * Output format: | ||
161 | * [0] = associative array containing HTTP response headers | ||
162 | * [1] = URL content (downloaded data) | ||
163 | * | ||
164 | * @see http://php.net/manual/en/function.file-get-contents.php | ||
165 | * @see http://php.net/manual/en/function.stream-context-create.php | ||
166 | * @see http://php.net/manual/en/function.get-headers.php | ||
167 | */ | ||
168 | function get_http_response_fallback( | ||
169 | $cleanUrl, | ||
170 | $timeout, | ||
171 | $maxBytes, | ||
172 | $userAgent, | ||
173 | $acceptLanguage, | ||
174 | $maxRedr | ||
175 | ) { | ||
36 | $options = array( | 176 | $options = array( |
37 | 'http' => array( | 177 | 'http' => array( |
38 | 'method' => 'GET', | 178 | 'method' => 'GET', |
39 | 'timeout' => $timeout, | 179 | 'timeout' => $timeout, |
40 | 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)' | 180 | 'user_agent' => $userAgent, |
41 | .' Gecko/20100101 Firefox/45.0', | 181 | 'header' => "Accept: */*\r\n" |
42 | 'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3', | 182 | . 'Accept-Language: ' . $acceptLanguage |
43 | ) | 183 | ) |
44 | ); | 184 | ); |
45 | 185 | ||
46 | stream_context_set_default($options); | 186 | stream_context_set_default($options); |
47 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl); | 187 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr); |
48 | if (! $headers || strpos($headers[0], '200 OK') === false) { | 188 | if (! $headers || strpos($headers[0], '200 OK') === false) { |
49 | $options['http']['request_fulluri'] = true; | 189 | $options['http']['request_fulluri'] = true; |
50 | stream_context_set_default($options); | 190 | stream_context_set_default($options); |
51 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl); | 191 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr); |
52 | } | 192 | } |
53 | 193 | ||
54 | if (! $headers || strpos($headers[0], '200 OK') === false) { | 194 | if (! $headers) { |
55 | return array($headers, false); | 195 | return array($headers, false); |
56 | } | 196 | } |
57 | 197 | ||