<?php
/**
* GET an HTTP URL to retrieve its content
+ * Uses the cURL library or a fallback method
*
* @param string $url URL to get (http://...)
* @param int $timeout network timeout (in seconds)
* echo 'There was an error: '.htmlspecialchars($headers[0]);
* }
*
- * @see http://php.net/manual/en/function.file-get-contents.php
- * @see http://php.net/manual/en/function.stream-context-create.php
- * @see http://php.net/manual/en/function.get-headers.php
+ * @see https://secure.php.net/manual/en/ref.curl.php
+ * @see https://secure.php.net/manual/en/functions.anonymous.php
+ * @see https://secure.php.net/manual/en/function.preg-split.php
+ * @see https://secure.php.net/manual/en/function.explode.php
+ * @see http://stackoverflow.com/q/17641073
+ * @see http://stackoverflow.com/q/9183178
+ * @see http://stackoverflow.com/q/1462720
*/
function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
{
$urlObj = new Url($url);
$cleanUrl = $urlObj->idnToAscii();
- if (! filter_var($cleanUrl, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) {
+ if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
return array(array(0 => 'Invalid HTTP Url'), false);
}
+ $userAgent =
+ 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
+ . ' Gecko/20100101 Firefox/45.0';
+ $acceptLanguage =
+ substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
+ $maxRedirs = 3;
+
+ if (!function_exists('curl_init')) {
+ return get_http_response_fallback(
+ $cleanUrl,
+ $timeout,
+ $maxBytes,
+ $userAgent,
+ $acceptLanguage,
+ $maxRedirs
+ );
+ }
+
+ $ch = curl_init($cleanUrl);
+ if ($ch === false) {
+ return array(array(0 => 'curl_init() error'), false);
+ }
+
+ // General cURL settings
+ curl_setopt($ch, CURLOPT_AUTOREFERER, true);
+ curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
+ curl_setopt($ch, CURLOPT_HEADER, true);
+ curl_setopt(
+ $ch,
+ CURLOPT_HTTPHEADER,
+ array('Accept-Language: ' . $acceptLanguage)
+ );
+ curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
+ curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
+ curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
+
+ // Max download size management
+ curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);
+ curl_setopt($ch, CURLOPT_NOPROGRESS, false);
+ curl_setopt($ch, CURLOPT_PROGRESSFUNCTION,
+ function($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes)
+ {
+ if (version_compare(phpversion(), '5.5', '<')) {
+ // PHP version lower than 5.5
+ // Callback has 4 arguments
+ $downloaded = $arg1;
+ } else {
+ // Callback has 5 arguments
+ $downloaded = $arg2;
+ }
+ // Non-zero return stops downloading
+ return ($downloaded > $maxBytes) ? 1 : 0;
+ }
+ );
+
+ $response = curl_exec($ch);
+ $errorNo = curl_errno($ch);
+ $errorStr = curl_error($ch);
+ $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
+ curl_close($ch);
+
+ if ($response === false) {
+ if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
+ /*
+ * Workaround to match fallback method behaviour
+ * Removing this would require updating
+ * GetHttpUrlTest::testGetInvalidRemoteUrl()
+ */
+ return array(false, false);
+ }
+ return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
+ }
+
+ // Formatting output like the fallback method
+ $rawHeaders = substr($response, 0, $headSize);
+
+ // Keep only headers from latest redirection
+ $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
+ $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
+
+ $content = substr($response, $headSize);
+ $headers = array();
+ foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
+ if (empty($line) or ctype_space($line)) {
+ continue;
+ }
+ $splitLine = explode(': ', $line, 2);
+ if (count($splitLine) > 1) {
+ $key = $splitLine[0];
+ $value = $splitLine[1];
+ if (array_key_exists($key, $headers)) {
+ if (!is_array($headers[$key])) {
+ $headers[$key] = array(0 => $headers[$key]);
+ }
+ $headers[$key][] = $value;
+ } else {
+ $headers[$key] = $value;
+ }
+ } else {
+ $headers[] = $splitLine[0];
+ }
+ }
+
+ return array($headers, $content);
+}
+
+/**
+ * GET an HTTP URL to retrieve its content (fallback method)
+ *
+ * @param string $cleanUrl URL to get (http://... valid and in ASCII form)
+ * @param int $timeout network timeout (in seconds)
+ * @param int $maxBytes maximum downloaded bytes
+ * @param string $userAgent "User-Agent" header
+ * @param string $acceptLanguage "Accept-Language" header
+ * @param int $maxRedr maximum amount of redirections followed
+ *
+ * @return array HTTP response headers, downloaded content
+ *
+ * Output format:
+ * [0] = associative array containing HTTP response headers
+ * [1] = URL content (downloaded data)
+ *
+ * @see http://php.net/manual/en/function.file-get-contents.php
+ * @see http://php.net/manual/en/function.stream-context-create.php
+ * @see http://php.net/manual/en/function.get-headers.php
+ */
+function get_http_response_fallback(
+ $cleanUrl,
+ $timeout,
+ $maxBytes,
+ $userAgent,
+ $acceptLanguage,
+ $maxRedr
+) {
$options = array(
'http' => array(
'method' => 'GET',
'timeout' => $timeout,
- 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
- .' Gecko/20100101 Firefox/45.0',
- 'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3',
+ 'user_agent' => $userAgent,
+ 'header' => "Accept: */*\r\n"
+ . 'Accept-Language: ' . $acceptLanguage
)
);
stream_context_set_default($options);
- list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
+ list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
if (! $headers || strpos($headers[0], '200 OK') === false) {
$options['http']['request_fulluri'] = true;
stream_context_set_default($options);
- list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
+ list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
}
- if (! $headers || strpos($headers[0], '200 OK') === false) {
+ if (! $headers) {
return array($headers, false);
}