5 * This class is designed to take advantage of parallel HTTP requests
6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
7 * For environments which do not have these options, it reverts to standard sequential
8 * requests (using file_get_contents())
12 * @see http://php.net/HttpRequestPool
13 * @author Keyvan Minoukadeh
14 * @copyright 2011-2012 Keyvan Minoukadeh
15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
// Transport selectors accepted by the constructor's $method argument.
const METHOD_REQUEST_POOL = 1;      // PECL HTTP extension (HttpRequestPool)
const METHOD_CURL_MULTI = 2;        // curl_multi_* via RollingCurl
const METHOD_FILE_GET_CONTENTS = 4; // sequential fallback
// User-Agent strings.
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
const UA_PHP = 'PHP/5.2';
// Default Referer header value (assigned to $this->referer by the constructor).
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
// Responses keyed by the originally requested URL. Each entry is an array with
// keys such as 'headers', 'body', 'effective_url', 'status_code'.
protected $requests = array();
// Follow-up requests (redirects, HEAD->GET retries, AJAX "ugly" URLs): original URL => next URL.
protected $redirectQueue = array();
// Options for the HTTP layer ('timeout', 'redirect'); populated by the constructor.
protected $requestOptions;
// Maximum number of URLs sent per parallel batch.
protected $maxParallelRequests = 5;
protected $cache = null; //TODO
// Stream context options used by the file_get_contents() fallback.
protected $httpContext;
protected $minimiseMemoryUse = false; //TODO
// One of the METHOD_* constants; chosen by the constructor.
protected $method;
// CookieJar instance created by the constructor.
protected $cookieJar;

public $debug = false;
public $debugVerbose = false;
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
public $maxRedirects = 5;
// Host (or '.domain' suffix) => User-Agent string; consulted by getUserAgent().
public $userAgentMap = array();
// Substring to look for in a URL => array of strtr() replacements; consulted by rewriteUrls().
public $rewriteUrls = array();
public $userAgentDefault;
// Referer header value sent with every request; set by the constructor.
public $referer;
//public $userAgent = 'Mozilla/5.0';

// Prevent certain file/mime types
// HTTP responses which match these content types will
// be returned without body.
public $headerOnlyTypes = array();
// URLs ending with one of these extensions will
// prompt Humble HTTP Agent to send a HEAD request first
// to see if returned content type matches $headerOnlyTypes.
public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
// AJAX triggers to search for.
// for AJAX sites, e.g. Blogger with its dynamic views templates.
public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');

//TODO: set max file size
//TODO: normalise headers
/**
 * Set defaults, choose the transport and prepare request options.
 *
 * @param array|null $requestOptions Overrides merged over the defaults ('timeout', 'redirect').
 * @param int|null   $method         One of the METHOD_* constants; anything else triggers
 *                                   auto-detection (HttpRequestPool, then curl_multi, then
 *                                   file_get_contents).
 */
function __construct($requestOptions=null, $method=null) {
    $this->userAgentDefault = self::UA_BROWSER;
    $this->referer = self::REF_GOOGLE;

    // set the request method
    $validMethods = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
    // Accept ints and numeric strings only; a loose in_array() would also accept `true`.
    if (is_numeric($method) && in_array((int)$method, $validMethods, true)) {
        $this->method = (int)$method;
    } else {
        if (class_exists('HttpRequestPool')) {
            $this->method = self::METHOD_REQUEST_POOL;
        } elseif (function_exists('curl_multi_init')) {
            $this->method = self::METHOD_CURL_MULTI;
        } else {
            $this->method = self::METHOD_FILE_GET_CONTENTS;
        }
    }
    if ($this->method == self::METHOD_CURL_MULTI) {
        require_once(dirname(__FILE__).'/RollingCurl.php');
    }

    // create cookie jar
    $this->cookieJar = new CookieJar();

    // set request options (redirect must be 0)
    $this->requestOptions = array(
        // NOTE(review): a default timeout is required because it is read below;
        // 15 seconds is an assumption -- confirm against the project's intended default.
        'timeout' => 15,
        'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
        // TODO: test onprogress?
    );
    if (is_array($requestOptions)) {
        $this->requestOptions = array_merge($this->requestOptions, $requestOptions);
    }

    // stream context options for the file_get_contents() fallback
    $this->httpContext = array(
        'http' => array(
            'ignore_errors' => true,
            'timeout' => $this->requestOptions['timeout'],
            'max_redirects' => $this->requestOptions['redirect'],
            'header' => "Accept: */*\r\n"
        )
    );
}
/**
 * Print a debug line when debugging is enabled.
 *
 * With $debugVerbose set, current and peak memory usage (in KB) are appended.
 *
 * @param mixed $msg Message to print (anything echo can render).
 */
protected function debug($msg) {
    if (!$this->debug) {
        return;
    }
    $mem = round(memory_get_usage()/1024, 2);
    $memPeak = round(memory_get_peak_usage()/1024, 2);
    echo '* ',$msg;
    if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
    echo "\n";
    // ob_flush() raises a notice when no output buffer is active, so check first.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}
/**
 * Pick the User-Agent to send for a URL.
 *
 * $userAgentMap is checked for the host (without a leading "www.") and then for
 * the parent domain prefixed with a dot (e.g. ".example.org"); the first match
 * wins. $userAgentDefault is used when nothing matches.
 *
 * @param string $url
 * @param bool   $asArray Return array('User-Agent' => ...) instead of a header line.
 * @return array|string
 */
protected function getUserAgent($url, $asArray=false) {
    // parse_url() yields null/false when there is no host; normalise to ''.
    $host = (string)@parse_url($url, PHP_URL_HOST);
    if (strtolower(substr($host, 0, 4)) == 'www.') {
        $host = substr($host, 4);
    }
    if ($host !== '' && $host !== false) {
        $try = array($host);
        $split = explode('.', $host);
        if (count($split) > 1) {
            // drop the left-most label to get the parent domain
            array_shift($split);
            $try[] = '.'.implode('.', $split);
        }
        foreach ($try as $h) {
            if (isset($this->userAgentMap[$h])) {
                $ua = $this->userAgentMap[$h];
                break;
            }
        }
    }
    if (!isset($ua)) $ua = $this->userAgentDefault;
    if ($asArray) {
        return array('User-Agent' => $ua);
    }
    return 'User-Agent: '.$ua;
}
/**
 * Rewrite a "#!" (hashbang) URL into its _escaped_fragment_ form.
 *
 * URLs without a fragment starting with "!" are returned unchanged.
 *
 * @param string $url
 * @return string
 */
public function rewriteHashbangFragment($url) {
    // return $url if there's no '#!'
    if (strpos($url, '#!') === false) return $url;
    // split $url and rewrite
    // TODO: is SimplePie_IRI included?
    $iri = new SimplePie_IRI($url);
    $fragment = (string)$iri->fragment;
    // Only a fragment that *starts* with '!' is a hashbang; '#!' appearing later
    // inside an ordinary fragment must not be rewritten.
    if ($fragment === '' || $fragment[0] !== '!') return $url;
    $fragment = substr($fragment, 1); // strip '!'
    $iri->fragment = null;
    if (isset($iri->query)) {
        parse_str($iri->query, $query);
    } else {
        $query = array();
    }
    $query['_escaped_fragment_'] = (string)$fragment;
    $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
    return $iri->get_iri();
}
/**
 * Build the _escaped_fragment_ URL for a page that opts in to AJAX crawling.
 *
 * The page opts in when $html contains one of $ajaxTriggers (matched
 * case-insensitively).
 *
 * @param string $url  URL the HTML was fetched from.
 * @param string $html HTML (or a leading portion of it) to scan.
 * @return string|false Rewritten URL, or false when no trigger is present.
 */
public function getUglyURL($url, $html) {
    if ($html == '') return false;
    $found = false;
    foreach ($this->ajaxTriggers as $string) {
        // stripos() returns 0 for a match at the very start, so compare against false.
        if (stripos($html, $string) !== false) {
            $found = true;
            break;
        }
    }
    if (!$found) return false;
    $iri = new SimplePie_IRI($url);
    if (isset($iri->query)) {
        parse_str($iri->query, $query);
    } else {
        $query = array();
    }
    $query['_escaped_fragment_'] = '';
    $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
    return $iri->get_iri();
}
/**
 * Strip the fragment (everything from the first '#') from a URL.
 *
 * @param string $url
 * @return string The URL without its fragment; unchanged when there is none.
 */
public function removeFragment($url) {
    $pos = strpos($url, '#');
    if ($pos === false) {
        return $url;
    }
    return substr($url, 0, $pos);
}
/**
 * Apply the first matching rule from $rewriteUrls to a URL.
 *
 * A rule matches when its key occurs anywhere in $url; if the rule's value is
 * an array it is applied with strtr() and the result returned immediately.
 *
 * @param string $url
 * @return string Rewritten URL, or $url unchanged when no array rule matches.
 */
public function rewriteUrls($url) {
    foreach ($this->rewriteUrls as $find => $action) {
        if (strpos($url, $find) !== false && is_array($action)) {
            return strtr($url, $action);
        }
    }
    return $url;
}
/**
 * Turn debug output on or off.
 *
 * @param bool $bool
 */
public function enableDebug($bool=true) {
    $this->debug = (bool)$bool;
}
/**
 * Set the minimise-memory flag (the feature itself is still marked TODO).
 *
 * @param bool $bool
 */
public function minimiseMemoryUse($bool = true) {
    // cast for consistency with enableDebug()
    $this->minimiseMemoryUse = (bool)$bool;
}
/**
 * Set how many URLs are requested per parallel batch.
 *
 * @param int $max Values below 1 are raised to 1: fetchAllOnce() removes this many
 *                 URLs per loop iteration, so 0 would never drain the list.
 */
public function setMaxParallelRequests($max) {
    $this->maxParallelRequests = max(1, (int)$max);
}
/**
 * Sanitise a URL and check that it is a valid http(s) URL.
 *
 * @param string $url
 * @return string|false The sanitised URL, or false when it is not a valid http/https URL.
 */
public function validateUrl($url) {
    $url = filter_var($url, FILTER_SANITIZE_URL);
    // FILTER_FLAG_SCHEME_REQUIRED is implied by FILTER_VALIDATE_URL and the constant
    // no longer exists in PHP 8, so it is not passed.
    $test = filter_var($url, FILTER_VALIDATE_URL);
    // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
    if ($test === false) {
        $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
    }
    if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
        return $url;
    }
    return false;
}
/**
 * Fetch a set of URLs, following queued redirects.
 *
 * After the initial pass, whatever fetchAllOnce() left in $redirectQueue is
 * fetched again, for at most $maxRedirects further rounds.
 *
 * @param array $urls
 */
public function fetchAll(array $urls) {
    $this->fetchAllOnce($urls, false);
    $redirects = 0;
    while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
        $this->debug("Following redirects #$redirects...");
        $this->fetchAllOnce($this->redirectQueue, true);
    }
}
/**
 * Fetch all URLs once, without following redirects.
 *
 * Responses are stored in $this->requests keyed by the original URL. Anything
 * needing a follow-up request (a redirect, a GET after a wrongly guessed HEAD,
 * or an AJAX-crawling "ugly" URL) is placed in $this->redirectQueue for
 * fetchAll() to process.
 *
 * @param array $urls       URLs to fetch. When $isRedirect is true the keys are the
 *                          original URLs and the values the URLs to request now.
 * @param bool  $isRedirect Whether this is a follow-up round.
 * @return false|null false when the HttpRequestPool branch catches an HttpException.
 */
public function fetchAllOnce(array $urls, $isRedirect=false) {
    if (!$isRedirect) $urls = array_unique($urls);
    if (empty($urls)) return;

    //////////////////////////////////////////////////////
    // parallel (HttpRequestPool)
    if ($this->method == self::METHOD_REQUEST_POOL) {
        $this->debug('Starting parallel fetch (HttpRequestPool)');
        try {
            while (count($urls) > 0) {
                $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
                $subset = array_splice($urls, 0, $this->maxParallelRequests);
                $pool = new HttpRequestPool();
                foreach ($subset as $orig => $url) {
                    if (!$isRedirect) $orig = $url;
                    unset($this->redirectQueue[$orig]);
                    $this->debug("...$url");
                    if (!$isRedirect && isset($this->requests[$url])) {
                        $this->debug("......in memory");
                    /*
                    } elseif ($this->isCached($url)) {
                        $this->debug("......is cached");
                        if (!$this->minimiseMemoryUse) {
                            $this->requests[$url] = $this->getCached($url);
                        }
                    */
                    } else {
                        $this->debug("......adding to pool");
                        $req_url = $this->rewriteUrls($url);
                        $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
                        $req_url = $this->removeFragment($req_url);
                        // Probe with HEAD first when the extension hints at an unwanted type,
                        // unless a previous HEAD for this URL already proved the guess wrong.
                        if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
                            $_meth = HttpRequest::METH_HEAD;
                        } else {
                            $_meth = HttpRequest::METH_GET;
                            unset($this->requests[$orig]['wrongGuess']);
                        }
                        $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
                        // send cookies, if we have any
                        if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                            $this->debug("......sending cookies: $cookies");
                            $httpRequest->addHeaders(array('Cookie' => $cookies));
                        }
                        //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
                        $httpRequest->addHeaders($this->getUserAgent($req_url, true));
                        // add referer for picky sites
                        $httpRequest->addheaders(array('Referer' => $this->referer));
                        $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                        $this->requests[$orig]['original_url'] = $orig;
                        $pool->attach($httpRequest);
                    }
                }
                // did we get anything into the pool?
                if (count($pool) > 0) {
                    $this->debug('Sending request...');
                    try {
                        $pool->send();
                    } catch (HttpRequestPoolException $e) {
                        // do nothing
                    }
                    $this->debug('Received responses');
                    foreach($subset as $orig => $url) {
                        if (!$isRedirect) $orig = $url;
                        // URLs served from memory were never attached to the pool; skip them.
                        if (!isset($this->requests[$orig]['httpRequest'])) continue;
                        $request = $this->requests[$orig]['httpRequest'];
                        //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
                        // getResponseHeader() doesn't return status line, so, for consistency...
                        $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
                        // check content type
                        // TODO: use getResponseHeader('content-type') or getResponseInfo()
                        if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                            $this->requests[$orig]['body'] = '';
                            $_header_only_type = true;
                            $this->debug('Header only type returned');
                        } else {
                            $this->requests[$orig]['body'] = $request->getResponseBody();
                            $_header_only_type = false;
                        }
                        $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
                        $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
                        // is redirect?
                        if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
                            $redirectURL = $request->getResponseHeader('location');
                            if (!preg_match('!^https?://!i', $redirectURL)) {
                                $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
                            }
                            if ($this->validateURL($redirectURL)) {
                                $this->debug('Redirect detected. Valid URL: '.$redirectURL);
                                // store any cookies
                                $cookies = $request->getResponseHeader('set-cookie');
                                if ($cookies && !is_array($cookies)) $cookies = array($cookies);
                                if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
                                $this->redirectQueue[$orig] = $redirectURL;
                            } else {
                                $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
                            }
                        } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
                            // the response content-type did not match our 'header only' types,
                            // but we issued a HEAD request because we assumed it would. So
                            // let's queue a proper GET request for this item...
                            $this->debug('Wrong guess at content-type, queing GET request');
                            $this->requests[$orig]['wrongGuess'] = true;
                            $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                        } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
                            // check for <meta name='fragment' content='!'/>
                            // for AJAX sites, e.g. Blogger with its dynamic views templates.
                            // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
                            if (isset($this->requests[$orig]['body'])) {
                                $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
                                if ($redirectURL) {
                                    $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
                                    $this->redirectQueue[$orig] = $redirectURL;
                                }
                            }
                        }
                        //die($url.' -multi- '.$request->getResponseInfo('effective_url'));
                        $pool->detach($request);
                        unset($this->requests[$orig]['httpRequest'], $request);
                        /*
                        if ($this->minimiseMemoryUse) {
                            if ($this->cache($url)) {
                                unset($this->requests[$url]);
                            }
                        }
                        */
                    }
                }
            }
        } catch (HttpException $e) {
            $this->debug($e);
            return false;
        }
    }

    //////////////////////////////////////////////////////////
    // parallel (curl_multi_*)
    elseif ($this->method == self::METHOD_CURL_MULTI) {
        $this->debug('Starting parallel fetch (curl_multi_*)');
        while (count($urls) > 0) {
            $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
            $subset = array_splice($urls, 0, $this->maxParallelRequests);
            $pool = new RollingCurl(array($this, 'handleCurlResponse'));
            $pool->window_size = count($subset);

            foreach ($subset as $orig => $url) {
                if (!$isRedirect) $orig = $url;
                unset($this->redirectQueue[$orig]);
                $this->debug("...$url");
                if (!$isRedirect && isset($this->requests[$url])) {
                    $this->debug("......in memory");
                /*
                } elseif ($this->isCached($url)) {
                    $this->debug("......is cached");
                    if (!$this->minimiseMemoryUse) {
                        $this->requests[$url] = $this->getCached($url);
                    }
                */
                } else {
                    $this->debug("......adding to pool");
                    $req_url = $this->rewriteUrls($url);
                    $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
                    $req_url = $this->removeFragment($req_url);
                    if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
                        $_meth = 'HEAD';
                    } else {
                        $_meth = 'GET';
                        unset($this->requests[$orig]['wrongGuess']);
                    }
                    $headers = array();
                    //$headers[] = 'User-Agent: '.$this->userAgent;
                    $headers[] = $this->getUserAgent($req_url);
                    // add referer for picky sites
                    $headers[] = 'Referer: '.$this->referer;
                    // send cookies, if we have any
                    if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                        $this->debug("......sending cookies: $cookies");
                        $headers[] = 'Cookie: '.$cookies;
                    }
                    $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
                        CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
                        CURLOPT_TIMEOUT => $this->requestOptions['timeout']
                    ));
                    $httpRequest->set_original_url($orig);
                    $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                    $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
                    $pool->add($httpRequest);
                }
            }
            // did we get anything into the pool?
            if (count($pool) > 0) {
                $this->debug('Sending request...');
                $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
                $this->debug('Received responses');
                foreach($subset as $orig => $url) {
                    if (!$isRedirect) $orig = $url;
                    // URLs served from memory were never added to the pool; skip them.
                    if (!isset($this->requests[$orig]['httpRequest'])) continue;
                    // $this->requests[$orig]['headers']
                    // $this->requests[$orig]['body']
                    // $this->requests[$orig]['effective_url']
                    // check content type
                    if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                        $this->requests[$orig]['body'] = '';
                        $_header_only_type = true;
                        $this->debug('Header only type returned');
                    } else {
                        $_header_only_type = false;
                    }
                    $status_code = $this->requests[$orig]['status_code'];
                    if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
                        $redirectURL = $this->requests[$orig]['location'];
                        if (!preg_match('!^https?://!i', $redirectURL)) {
                            $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
                        }
                        if ($this->validateURL($redirectURL)) {
                            $this->debug('Redirect detected. Valid URL: '.$redirectURL);
                            // store any cookies
                            $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                            if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                            $this->redirectQueue[$orig] = $redirectURL;
                        } else {
                            $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
                        }
                    } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
                        // the response content-type did not match our 'header only' types,
                        // but we issued a HEAD request because we assumed it would. So
                        // let's queue a proper GET request for this item...
                        $this->debug('Wrong guess at content-type, queing GET request');
                        $this->requests[$orig]['wrongGuess'] = true;
                        $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                    } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
                        // check for <meta name='fragment' content='!'/>
                        // for AJAX sites, e.g. Blogger with its dynamic views templates.
                        // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
                        if (isset($this->requests[$orig]['body'])) {
                            $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
                            if ($redirectURL) {
                                $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
                                $this->redirectQueue[$orig] = $redirectURL;
                            }
                        }
                    }
                    // die($url.' -multi- '.$request->getResponseInfo('effective_url'));
                    unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
                }
            }
        }
    }

    //////////////////////////////////////////////////////
    // sequential (file_get_contents)
    else {
        $this->debug('Starting sequential fetch (file_get_contents)');
        $this->debug('Processing set of '.count($urls));
        foreach ($urls as $orig => $url) {
            if (!$isRedirect) $orig = $url;
            unset($this->redirectQueue[$orig]);
            $this->debug("...$url");
            if (!$isRedirect && isset($this->requests[$url])) {
                $this->debug("......in memory");
            /*
            } elseif ($this->isCached($url)) {
                $this->debug("......is cached");
                if (!$this->minimiseMemoryUse) {
                    $this->requests[$url] = $this->getCached($url);
                }
            */
            } else {
                $this->debug("Sending request for $url");
                $this->requests[$orig]['original_url'] = $orig;
                $req_url = $this->rewriteUrls($url);
                $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
                $req_url = $this->removeFragment($req_url);
                // work on a copy so per-request headers do not accumulate in $this->httpContext
                $httpContext = $this->httpContext;
                $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
                // add referer for picky sites
                $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
                // send cookies, if we have any
                if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                    $this->debug("......sending cookies: $cookies");
                    $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
                }
                if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
                    $this->debug('Received response');
                    // get status code ($http_response_header is populated by file_get_contents())
                    if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
                        $this->debug('Error: no status code found');
                        // TODO: handle error - no status code
                    } else {
                        $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
                        // check content type
                        if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                            $this->requests[$orig]['body'] = '';
                        } else {
                            $this->requests[$orig]['body'] = $html;
                        }
                        $this->requests[$orig]['effective_url'] = $req_url;
                        $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
                        // handle redirect
                        if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
                            $this->requests[$orig]['location'] = trim($match[1]);
                        }
                        if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
                            $redirectURL = $this->requests[$orig]['location'];
                            if (!preg_match('!^https?://!i', $redirectURL)) {
                                $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
                            }
                            if ($this->validateURL($redirectURL)) {
                                $this->debug('Redirect detected. Valid URL: '.$redirectURL);
                                // store any cookies
                                $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                                if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                                $this->redirectQueue[$orig] = $redirectURL;
                            } else {
                                $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
                            }
                        } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
                            // check for <meta name='fragment' content='!'/>
                            // for AJAX sites, e.g. Blogger with its dynamic views templates.
                            // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
                            if (isset($this->requests[$orig]['body'])) {
                                $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
                                if ($redirectURL) {
                                    $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
                                    $this->redirectQueue[$orig] = $redirectURL;
                                }
                            }
                        }
                    }
                } else {
                    $this->debug('Error retrieving URL');
                    //print_r($http_response_header);
                    // TODO: handle error - failed to retrieve URL
                }
            }
        }
    }
}
/**
 * RollingCurl callback: record one completed response in $this->requests.
 *
 * The entry is keyed by the request's original URL. The raw response is split
 * into headers and body at $info['header_size'].
 *
 * @param string $response Raw response (headers followed by body).
 * @param array  $info     Transfer info; 'header_size', 'url' and 'http_code' are read.
 * @param object $request  The request object; its url_original and method properties are read.
 */
public function handleCurlResponse($response, $info, $request) {
    $orig = $request->url_original;
    $headerSize = $info['header_size'];
    $this->requests[$orig]['headers'] = substr($response, 0, $headerSize);
    $this->requests[$orig]['body'] = substr($response, $headerSize);
    $this->requests[$orig]['method'] = $request->method;
    $this->requests[$orig]['effective_url'] = $info['url'];
    $this->requests[$orig]['status_code'] = (int)$info['http_code'];
    if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
        $this->requests[$orig]['location'] = trim($match[1]);
    }
}
/**
 * Flatten response headers into a newline-separated string.
 *
 * @param array $headers     Either a list of raw header lines, or a name => value map
 *                           (a value may be an array for repeated headers).
 * @param bool  $associative True when $headers is a name => value map.
 * @return string
 */
protected function headersToString(array $headers, $associative=true) {
    if (!$associative) {
        return implode("\n", $headers);
    }
    $str = '';
    foreach ($headers as $key => $val) {
        if (is_array($val)) {
            foreach ($val as $v) $str .= "$key: $v\n";
        } else {
            $str .= "$key: $val\n";
        }
    }
    // drop the trailing newline so both modes return the same shape
    return rtrim($str);
}
/**
 * Return the response for a URL, fetching it first if it is not in memory.
 *
 * @param string $url
 * @param bool   $remove   Drop the response from memory after returning it.
 * @param bool   $gzdecode Decode the body when the response headers declare gzip encoding.
 * @return array|false Response array ('headers', 'body', ...), or false when the request failed.
 */
public function get($url, $remove=false, $gzdecode=true) {
    $url = "$url";
    if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
        $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
        $response = $this->requests[$url];
    /*
    } elseif ($this->isCached($url)) {
        $this->debug("URL already fetched - in disk cache ($url)");
        $response = $this->getCached($url);
        $this->requests[$url] = $response;
    */
    } else {
        $this->debug("Fetching URL ($url)");
        $this->fetchAll(array($url));
        if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
            $response = $this->requests[$url];
        } else {
            $this->debug("Request failed");
            $response = false;
        }
    }
    /*
    if ($this->minimiseMemoryUse && $response) {
        unset($this->requests[$url]);
    }
    */
    if ($remove && $response) unset($this->requests[$url]);
    // Only inspect headers when there is a response; compare stripos() against false
    // so a match at offset 0 still counts.
    if ($gzdecode && $response && isset($response['headers']) && stripos($response['headers'], 'Content-Encoding: gzip') !== false) {
        if ($html = gzdecode($response['body'])) {
            $response['body'] = $html;
        }
    }
    return $response;
}
/**
 * Report whether a parallel transport (HttpRequestPool or curl_multi) is available.
 *
 * @return bool
 */
public function parallelSupport() {
    return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
}
/**
 * Tell whether the response Content-Type is listed in $headerOnlyTypes.
 *
 * Both the full mime type (e.g. image/jpeg) and the bare type (e.g. image)
 * are checked, lower-cased.
 *
 * @param string|null $headers Raw response headers.
 * @return bool
 */
private function headerOnlyType($headers) {
    // headers may still be null for a request that never produced a response
    if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', (string)$headers, $match)) {
        // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
        $fullType = strtolower(trim($match[1]));
        $mainType = strtolower(trim($match[2]));
        foreach (array($fullType, $mainType) as $mime) {
            if (in_array($mime, $this->headerOnlyTypes)) return true;
        }
    }
    return false;
}
/**
 * Guess from the URL's file extension whether the resource may be an unwanted type.
 *
 * @param string $url
 * @return bool True when the path's extension (lower-cased) is listed in $headerOnlyClues.
 */
private function possibleUnsupportedType($url) {
    $path = @parse_url($url, PHP_URL_PATH);
    if ($path && strpos($path, '.') !== false) {
        $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
        return in_array($ext, $this->headerOnlyClues, true);
    }
    return false;
}
665 // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
666 if (!function_exists('gzdecode')) {
667 function gzdecode($data,&$filename='',&$error='',$maxlength=null)
669 $len = strlen($data);
670 if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
671 $error = "Not in GZIP format.";
672 return null; // Not GZIP format (See RFC 1952)
674 $method = ord(substr($data,2,1)); // Compression method
675 $flags = ord(substr($data,3,1)); // Flags
676 if ($flags & 31 != $flags) {
677 $error = "Reserved bits not allowed.";
680 // NOTE: $mtime may be negative (PHP integer limitations)
681 $mtime = unpack("V", substr($data,4,4));
683 $xfl = substr($data,8,1);
684 $os = substr($data,8,1);
689 // 2-byte length prefixed EXTRA data in header
690 if ($len - $headerlen - 2 < 8) {
691 return false; // invalid
693 $extralen = unpack("v",substr($data,8,2));
694 $extralen = $extralen[1];
695 if ($len - $headerlen - 2 - $extralen < 8) {
696 return false; // invalid
698 $extra = substr($data,10,$extralen);
699 $headerlen +
= 2 +
$extralen;
705 if ($len - $headerlen - 1 < 8) {
706 return false; // invalid
708 $filenamelen = strpos(substr($data,$headerlen),chr(0));
709 if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
710 return false; // invalid
712 $filename = substr($data,$headerlen,$filenamelen);
713 $headerlen +
= $filenamelen +
1;
718 // C-style string COMMENT data in header
719 if ($len - $headerlen - 1 < 8) {
720 return false; // invalid
722 $commentlen = strpos(substr($data,$headerlen),chr(0));
723 if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
724 return false; // Invalid header format
726 $comment = substr($data,$headerlen,$commentlen);
727 $headerlen +
= $commentlen +
1;
731 // 2-bytes (lowest order) of CRC32 on header present
732 if ($len - $headerlen - 2 < 8) {
733 return false; // invalid
735 $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
736 $headercrc = unpack("v", substr($data,$headerlen,2));
737 $headercrc = $headercrc[1];
738 if ($headercrc != $calccrc) {
739 $error = "Header checksum failed.";
740 return false; // Bad header CRC
745 $datacrc = unpack("V",substr($data,-8,4));
746 $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
747 $isize = unpack("V",substr($data,-4));
750 $bodylen = $len-$headerlen-8;
752 // IMPLEMENTATION BUG!
755 $body = substr($data,$headerlen,$bodylen);
760 // Currently the only supported compression method:
761 $data = gzinflate($body,$maxlength);
764 $error = "Unknown compression method.";
767 } // zero-byte body content is allowed
769 $crc = sprintf("%u",crc32($data));
770 $crcOK = $crc == $datacrc;
771 $lenOK = $isize == strlen($data);
772 if (!$lenOK || !$crcOK) {
773 $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');