-<?php\r
-/**\r
- * Humble HTTP Agent\r
- * \r
- * This class is designed to take advantage of parallel HTTP requests\r
- * offered by PHP's PECL HTTP extension or the curl_multi_* functions. \r
- * For environments which do not have these options, it reverts to standard sequential \r
- * requests (using file_get_contents())\r
- * \r
- * @version 1.1\r
- * @date 2012-08-20\r
- * @see http://php.net/HttpRequestPool\r
- * @author Keyvan Minoukadeh\r
- * @copyright 2011-2012 Keyvan Minoukadeh\r
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r
- */\r
-\r
-class HumbleHttpAgent\r
-{\r
- const METHOD_REQUEST_POOL = 1;\r
- const METHOD_CURL_MULTI = 2;\r
- const METHOD_FILE_GET_CONTENTS = 4;\r
- //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';\r
- const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';\r
- const UA_PHP = 'PHP/5.2';\r
- const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';\r
- \r
- protected $requests = array();\r
- protected $redirectQueue = array();\r
- protected $requestOptions;\r
- protected $maxParallelRequests = 5;\r
- protected $cache = null; //TODO\r
- protected $httpContext;\r
- protected $minimiseMemoryUse = false; //TODO\r
- protected $method;\r
- protected $cookieJar;\r
- public $debug = false;\r
- public $debugVerbose = false;\r
- public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html\r
- public $maxRedirects = 5;\r
- public $userAgentMap = array();\r
- public $rewriteUrls = array();\r
- public $userAgentDefault;\r
- public $referer;\r
- //public $userAgent = 'Mozilla/5.0';\r
- \r
- // Prevent certain file/mime types\r
- // HTTP responses which match these content types will\r
- // be returned without body.\r
- public $headerOnlyTypes = array();\r
- // URLs ending with one of these extensions will\r
- // prompt Humble HTTP Agent to send a HEAD request first\r
- // to see if returned content type matches $headerOnlyTypes.\r
- public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');\r
- // AJAX triggers to search for.\r
- // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
- public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');\r
- \r
- //TODO: set max file size\r
- //TODO: normalise headers\r
- \r
- function __construct($requestOptions=null, $method=null) {\r
- $this->userAgentDefault = self::UA_BROWSER;\r
- $this->referer = self::REF_GOOGLE;\r
- // set the request method\r
- if (in_array($method, array(1,2,4))) {\r
- $this->method = $method;\r
- } else {\r
- if (class_exists('HttpRequestPool')) {\r
- $this->method = self::METHOD_REQUEST_POOL;\r
- } elseif (function_exists('curl_multi_init')) {\r
- $this->method = self::METHOD_CURL_MULTI;\r
- } else {\r
- $this->method = self::METHOD_FILE_GET_CONTENTS;\r
- }\r
- }\r
- if ($this->method == self::METHOD_CURL_MULTI) {\r
- require_once(dirname(__FILE__).'/RollingCurl.php');\r
- }\r
- // create cookie jar\r
- $this->cookieJar = new CookieJar();\r
- // set request options (redirect must be 0)\r
- $this->requestOptions = array(\r
- 'timeout' => 15,\r
- 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web\r
- // TODO: test onprogress?\r
- );\r
- if (is_array($requestOptions)) {\r
- $this->requestOptions = array_merge($this->requestOptions, $requestOptions);\r
- }\r
- $this->httpContext = array(\r
- 'http' => array(\r
- 'ignore_errors' => true,\r
- 'timeout' => $this->requestOptions['timeout'],\r
- 'max_redirects' => $this->requestOptions['redirect'],\r
- 'header' => "Accept: */*\r\n"\r
- )\r
- );\r
- }\r
- \r
- protected function debug($msg) {\r
- if ($this->debug) {\r
- $mem = round(memory_get_usage()/1024, 2);\r
- $memPeak = round(memory_get_peak_usage()/1024, 2);\r
- echo '* ',$msg;\r
- if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";\r
- echo "\n";\r
- ob_flush();\r
- flush();\r
- }\r
- }\r
- \r
- protected function getUserAgent($url, $asArray=false) {\r
- $host = @parse_url($url, PHP_URL_HOST);\r
- if (strtolower(substr($host, 0, 4)) == 'www.') {\r
- $host = substr($host, 4);\r
- }\r
- if ($host) {\r
- $try = array($host);\r
- $split = explode('.', $host);\r
- if (count($split) > 1) {\r
- array_shift($split);\r
- $try[] = '.'.implode('.', $split);\r
- }\r
- foreach ($try as $h) {\r
- if (isset($this->userAgentMap[$h])) {\r
- $ua = $this->userAgentMap[$h];\r
- break;\r
- }\r
- }\r
- }\r
- if (!isset($ua)) $ua = $this->userAgentDefault;\r
- if ($asArray) {\r
- return array('User-Agent' => $ua);\r
- } else {\r
- return 'User-Agent: '.$ua;\r
- }\r
- }\r
- \r
- public function rewriteHashbangFragment($url) {\r
- // return $url if there's no '#!'\r
- if (strpos($url, '#!') === false) return $url;\r
- // split $url and rewrite\r
- // TODO: is SimplePie_IRI included?\r
- $iri = new SimplePie_IRI($url);\r
- $fragment = substr($iri->fragment, 1); // strip '!'\r
- $iri->fragment = null;\r
- if (isset($iri->query)) {\r
- parse_str($iri->query, $query);\r
- } else {\r
- $query = array();\r
- }\r
- $query['_escaped_fragment_'] = (string)$fragment;\r
- $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites\r
- return $iri->get_iri();\r
- }\r
- \r
- public function getUglyURL($url, $html) {\r
- if ($html == '') return false;\r
- $found = false;\r
- foreach ($this->ajaxTriggers as $string) {\r
- if (stripos($html, $string)) {\r
- $found = true;\r
- break;\r
- }\r
- }\r
- if (!$found) return false;\r
- $iri = new SimplePie_IRI($url);\r
- if (isset($iri->query)) {\r
- parse_str($iri->query, $query);\r
- } else {\r
- $query = array();\r
- }\r
- $query['_escaped_fragment_'] = '';\r
- $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites\r
- return $iri->get_iri();\r
- }\r
- \r
- public function removeFragment($url) {\r
- $pos = strpos($url, '#');\r
- if ($pos === false) {\r
- return $url;\r
- } else {\r
- return substr($url, 0, $pos);\r
- }\r
- }\r
- \r
- public function rewriteUrls($url) {\r
- foreach ($this->rewriteUrls as $find => $action) {\r
- if (strpos($url, $find) !== false) {\r
- if (is_array($action)) {\r
- return strtr($url, $action);\r
- }\r
- }\r
- }\r
- return $url;\r
- }\r
- \r
- public function enableDebug($bool=true) {\r
- $this->debug = (bool)$bool;\r
- }\r
- \r
- public function minimiseMemoryUse($bool = true) {\r
- $this->minimiseMemoryUse = $bool;\r
- }\r
- \r
- public function setMaxParallelRequests($max) {\r
- $this->maxParallelRequests = $max;\r
- }\r
- \r
- public function validateUrl($url) {\r
- $url = filter_var($url, FILTER_SANITIZE_URL);\r
- $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);\r
- // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)\r
- if ($test === false) {\r
- $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);\r
- }\r
- if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {\r
- return $url;\r
- } else {\r
- return false;\r
- }\r
- }\r
- \r
- public function fetchAll(array $urls) {\r
- $this->fetchAllOnce($urls, $isRedirect=false);\r
- $redirects = 0;\r
- while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {\r
- $this->debug("Following redirects #$redirects...");\r
- $this->fetchAllOnce($this->redirectQueue, $isRedirect=true);\r
- }\r
- }\r
- \r
- // fetch all URLs without following redirects\r
- public function fetchAllOnce(array $urls, $isRedirect=false) {\r
- if (!$isRedirect) $urls = array_unique($urls);\r
- if (empty($urls)) return;\r
- \r
- //////////////////////////////////////////////////////\r
- // parallel (HttpRequestPool)\r
- if ($this->method == self::METHOD_REQUEST_POOL) {\r
- $this->debug('Starting parallel fetch (HttpRequestPool)');\r
- try {\r
- while (count($urls) > 0) {\r
- $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));\r
- $subset = array_splice($urls, 0, $this->maxParallelRequests);\r
- $pool = new HttpRequestPool();\r
- foreach ($subset as $orig => $url) {\r
- if (!$isRedirect) $orig = $url;\r
- unset($this->redirectQueue[$orig]);\r
- $this->debug("...$url");\r
- if (!$isRedirect && isset($this->requests[$url])) {\r
- $this->debug("......in memory");\r
- /*\r
- } elseif ($this->isCached($url)) {\r
- $this->debug("......is cached");\r
- if (!$this->minimiseMemoryUse) {\r
- $this->requests[$url] = $this->getCached($url);\r
- }\r
- */\r
- } else {\r
- $this->debug("......adding to pool");\r
- $req_url = $this->rewriteUrls($url);\r
- $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;\r
- $req_url = $this->removeFragment($req_url);\r
- if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {\r
- $_meth = HttpRequest::METH_HEAD;\r
- } else {\r
- $_meth = HttpRequest::METH_GET;\r
- unset($this->requests[$orig]['wrongGuess']);\r
- }\r
- $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);\r
- // send cookies, if we have any\r
- if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {\r
- $this->debug("......sending cookies: $cookies");\r
- $httpRequest->addHeaders(array('Cookie' => $cookies));\r
- }\r
- //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));\r
- $httpRequest->addHeaders($this->getUserAgent($req_url, true));\r
- // add referer for picky sites\r
- $httpRequest->addheaders(array('Referer' => $this->referer));\r
- $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);\r
- $this->requests[$orig]['original_url'] = $orig;\r
- $pool->attach($httpRequest);\r
- }\r
- }\r
- // did we get anything into the pool?\r
- if (count($pool) > 0) {\r
- $this->debug('Sending request...');\r
- try {\r
- $pool->send();\r
- } catch (HttpRequestPoolException $e) {\r
- // do nothing\r
- }\r
- $this->debug('Received responses');\r
- foreach($subset as $orig => $url) {\r
- if (!$isRedirect) $orig = $url;\r
- $request = $this->requests[$orig]['httpRequest'];\r
- //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());\r
- // getResponseHeader() doesn't return status line, so, for consistency...\r
- $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));\r
- // check content type\r
- // TODO: use getResponseHeader('content-type') or getResponseInfo()\r
- if ($this->headerOnlyType($this->requests[$orig]['headers'])) {\r
- $this->requests[$orig]['body'] = '';\r
- $_header_only_type = true;\r
- $this->debug('Header only type returned');\r
- } else {\r
- $this->requests[$orig]['body'] = $request->getResponseBody();\r
- $_header_only_type = false;\r
- }\r
- $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');\r
- $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();\r
- // is redirect?\r
- if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {\r
- $redirectURL = $request->getResponseHeader('location');\r
- if (!preg_match('!^https?://!i', $redirectURL)) {\r
- $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);\r
- }\r
- if ($this->validateURL($redirectURL)) {\r
- $this->debug('Redirect detected. Valid URL: '.$redirectURL);\r
- // store any cookies\r
- $cookies = $request->getResponseHeader('set-cookie');\r
- if ($cookies && !is_array($cookies)) $cookies = array($cookies);\r
- if ($cookies) $this->cookieJar->storeCookies($url, $cookies);\r
- $this->redirectQueue[$orig] = $redirectURL;\r
- } else {\r
- $this->debug('Redirect detected. Invalid URL: '.$redirectURL);\r
- }\r
- } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {\r
- // the response content-type did not match our 'header only' types, \r
- // but we'd issues a HEAD request because we assumed it would. So\r
- // let's queue a proper GET request for this item...\r
- $this->debug('Wrong guess at content-type, queing GET request');\r
- $this->requests[$orig]['wrongGuess'] = true;\r
- $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];\r
- } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {\r
- // check for <meta name='fragment' content='!'/>\r
- // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
- // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification\r
- if (isset($this->requests[$orig]['body'])) {\r
- $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));\r
- if ($redirectURL) {\r
- $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);\r
- $this->redirectQueue[$orig] = $redirectURL;\r
- }\r
- }\r
- }\r
- //die($url.' -multi- '.$request->getResponseInfo('effective_url'));\r
- $pool->detach($request);\r
- unset($this->requests[$orig]['httpRequest'], $request);\r
- /*\r
- if ($this->minimiseMemoryUse) {\r
- if ($this->cache($url)) {\r
- unset($this->requests[$url]);\r
- }\r
- }\r
- */\r
- }\r
- }\r
- }\r
- } catch (HttpException $e) {\r
- $this->debug($e);\r
- return false;\r
- }\r
- }\r
- \r
- //////////////////////////////////////////////////////////\r
- // parallel (curl_multi_*)\r
- elseif ($this->method == self::METHOD_CURL_MULTI) {\r
- $this->debug('Starting parallel fetch (curl_multi_*)');\r
- while (count($urls) > 0) {\r
- $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));\r
- $subset = array_splice($urls, 0, $this->maxParallelRequests);\r
- $pool = new RollingCurl(array($this, 'handleCurlResponse'));\r
- $pool->window_size = count($subset); \r
- \r
- foreach ($subset as $orig => $url) {\r
- if (!$isRedirect) $orig = $url;\r
- unset($this->redirectQueue[$orig]);\r
- $this->debug("...$url");\r
- if (!$isRedirect && isset($this->requests[$url])) {\r
- $this->debug("......in memory");\r
- /*\r
- } elseif ($this->isCached($url)) {\r
- $this->debug("......is cached");\r
- if (!$this->minimiseMemoryUse) {\r
- $this->requests[$url] = $this->getCached($url);\r
- }\r
- */\r
- } else {\r
- $this->debug("......adding to pool");\r
- $req_url = $this->rewriteUrls($url);\r
- $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;\r
- $req_url = $this->removeFragment($req_url);\r
- if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {\r
- $_meth = 'HEAD';\r
- } else {\r
- $_meth = 'GET';\r
- unset($this->requests[$orig]['wrongGuess']);\r
- } \r
- $headers = array();\r
- //$headers[] = 'User-Agent: '.$this->userAgent;\r
- $headers[] = $this->getUserAgent($req_url);\r
- // add referer for picky sites\r
- $headers[] = 'Referer: '.$this->referer;\r
- // send cookies, if we have any\r
- if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {\r
- $this->debug("......sending cookies: $cookies");\r
- $headers[] = 'Cookie: '.$cookies;\r
- }\r
- $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(\r
- CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],\r
- CURLOPT_TIMEOUT => $this->requestOptions['timeout']\r
- ));\r
- $httpRequest->set_original_url($orig);\r
- $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);\r
- $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?\r
- $pool->add($httpRequest);\r
- }\r
- }\r
- // did we get anything into the pool?\r
- if (count($pool) > 0) {\r
- $this->debug('Sending request...');\r
- $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]\r
- $this->debug('Received responses');\r
- foreach($subset as $orig => $url) {\r
- if (!$isRedirect) $orig = $url;\r
- // $this->requests[$orig]['headers']\r
- // $this->requests[$orig]['body']\r
- // $this->requests[$orig]['effective_url']\r
- // check content type\r
- if ($this->headerOnlyType($this->requests[$orig]['headers'])) {\r
- $this->requests[$orig]['body'] = '';\r
- $_header_only_type = true;\r
- $this->debug('Header only type returned');\r
- } else {\r
- $_header_only_type = false;\r
- }\r
- $status_code = $this->requests[$orig]['status_code'];\r
- if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {\r
- $redirectURL = $this->requests[$orig]['location'];\r
- if (!preg_match('!^https?://!i', $redirectURL)) {\r
- $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);\r
- }\r
- if ($this->validateURL($redirectURL)) {\r
- $this->debug('Redirect detected. Valid URL: '.$redirectURL);\r
- // store any cookies\r
- $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);\r
- if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); \r
- $this->redirectQueue[$orig] = $redirectURL;\r
- } else {\r
- $this->debug('Redirect detected. Invalid URL: '.$redirectURL);\r
- }\r
- } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {\r
- // the response content-type did not match our 'header only' types, \r
- // but we'd issues a HEAD request because we assumed it would. So\r
- // let's queue a proper GET request for this item...\r
- $this->debug('Wrong guess at content-type, queing GET request');\r
- $this->requests[$orig]['wrongGuess'] = true;\r
- $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];\r
- } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {\r
- // check for <meta name='fragment' content='!'/>\r
- // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
- // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification\r
- if (isset($this->requests[$orig]['body'])) {\r
- $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));\r
- if ($redirectURL) {\r
- $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);\r
- $this->redirectQueue[$orig] = $redirectURL;\r
- }\r
- }\r
- }\r
- // die($url.' -multi- '.$request->getResponseInfo('effective_url'));\r
- unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);\r
- }\r
- }\r
- }\r
- }\r
-\r
- //////////////////////////////////////////////////////\r
- // sequential (file_get_contents)\r
- else {\r
- $this->debug('Starting sequential fetch (file_get_contents)');\r
- $this->debug('Processing set of '.count($urls));\r
- foreach ($urls as $orig => $url) {\r
- if (!$isRedirect) $orig = $url;\r
- unset($this->redirectQueue[$orig]);\r
- $this->debug("...$url");\r
- if (!$isRedirect && isset($this->requests[$url])) {\r
- $this->debug("......in memory");\r
- /*\r
- } elseif ($this->isCached($url)) {\r
- $this->debug("......is cached");\r
- if (!$this->minimiseMemoryUse) {\r
- $this->requests[$url] = $this->getCached($url);\r
- }\r
- */\r
- } else {\r
- $this->debug("Sending request for $url");\r
- $this->requests[$orig]['original_url'] = $orig;\r
- $req_url = $this->rewriteUrls($url);\r
- $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;\r
- $req_url = $this->removeFragment($req_url);\r
- // send cookies, if we have any\r
- $httpContext = $this->httpContext;\r
- $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";\r
- // add referer for picky sites\r
- $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";\r
- if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {\r
- $this->debug("......sending cookies: $cookies");\r
- $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";\r
- }\r
- if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {\r
- $this->debug('Received response');\r
- // get status code\r
- if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {\r
- $this->debug('Error: no status code found');\r
- // TODO: handle error - no status code\r
- } else {\r
- $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);\r
- // check content type\r
- if ($this->headerOnlyType($this->requests[$orig]['headers'])) {\r
- $this->requests[$orig]['body'] = '';\r
- } else {\r
- $this->requests[$orig]['body'] = $html;\r
- }\r
- $this->requests[$orig]['effective_url'] = $req_url;\r
- $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];\r
- unset($match);\r
- // handle redirect\r
- if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {\r
- $this->requests[$orig]['location'] = trim($match[1]);\r
- }\r
- if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {\r
- $redirectURL = $this->requests[$orig]['location'];\r
- if (!preg_match('!^https?://!i', $redirectURL)) {\r
- $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);\r
- }\r
- if ($this->validateURL($redirectURL)) {\r
- $this->debug('Redirect detected. Valid URL: '.$redirectURL);\r
- // store any cookies\r
- $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);\r
- if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);\r
- $this->redirectQueue[$orig] = $redirectURL;\r
- } else {\r
- $this->debug('Redirect detected. Invalid URL: '.$redirectURL);\r
- }\r
- } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {\r
- // check for <meta name='fragment' content='!'/>\r
- // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
- // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification\r
- if (isset($this->requests[$orig]['body'])) {\r
- $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));\r
- if ($redirectURL) {\r
- $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);\r
- $this->redirectQueue[$orig] = $redirectURL;\r
- }\r
- }\r
- }\r
- }\r
- } else {\r
- $this->debug('Error retrieving URL');\r
- //print_r($req_url);\r
- //print_r($http_response_header);\r
- //print_r($html);\r
- \r
- // TODO: handle error - failed to retrieve URL\r
- }\r
- }\r
- }\r
- }\r
- }\r
- \r
- public function handleCurlResponse($response, $info, $request) {\r
- $orig = $request->url_original;\r
- $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);\r
- $this->requests[$orig]['body'] = substr($response, $info['header_size']);\r
- $this->requests[$orig]['method'] = $request->method;\r
- $this->requests[$orig]['effective_url'] = $info['url'];\r
- $this->requests[$orig]['status_code'] = (int)$info['http_code'];\r
- if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {\r
- $this->requests[$orig]['location'] = trim($match[1]);\r
- }\r
- }\r
- \r
- protected function headersToString(array $headers, $associative=true) {\r
- if (!$associative) {\r
- return implode("\n", $headers);\r
- } else {\r
- $str = '';\r
- foreach ($headers as $key => $val) {\r
- if (is_array($val)) {\r
- foreach ($val as $v) $str .= "$key: $v\n";\r
- } else {\r
- $str .= "$key: $val\n";\r
- }\r
- }\r
- return rtrim($str);\r
- }\r
- }\r
- \r
- public function get($url, $remove=false, $gzdecode=true) {\r
- $url = "$url";\r
- if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {\r
- $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");\r
- $response = $this->requests[$url];\r
- /*\r
- } elseif ($this->isCached($url)) {\r
- $this->debug("URL already fetched - in disk cache ($url)");\r
- $response = $this->getCached($url);\r
- $this->requests[$url] = $response;\r
- */\r
- } else {\r
- $this->debug("Fetching URL ($url)");\r
- $this->fetchAll(array($url));\r
- if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {\r
- $response = $this->requests[$url];\r
- } else {\r
- $this->debug("Request failed");\r
- $response = false;\r
- }\r
- }\r
- /*\r
- if ($this->minimiseMemoryUse && $response) {\r
- $this->cache($url);\r
- unset($this->requests[$url]);\r
- }\r
- */\r
- if ($remove && $response) unset($this->requests[$url]);\r
- if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {\r
- if ($html = gzdecode($response['body'])) {\r
- $response['body'] = $html;\r
- }\r
- }\r
- return $response;\r
- }\r
- \r
- public function parallelSupport() {\r
- return class_exists('HttpRequestPool') || function_exists('curl_multi_init');\r
- }\r
- \r
- private function headerOnlyType($headers) {\r
- if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {\r
- // look for full mime type (e.g. image/jpeg) or just type (e.g. image)\r
- $match[1] = strtolower(trim($match[1]));\r
- $match[2] = strtolower(trim($match[2]));\r
- foreach (array($match[1], $match[2]) as $mime) {\r
- if (in_array($mime, $this->headerOnlyTypes)) return true;\r
- }\r
- }\r
- return false;\r
- }\r
- \r
- private function possibleUnsupportedType($url) {\r
- $path = @parse_url($url, PHP_URL_PATH);\r
- if ($path && strpos($path, '.') !== false) {\r
- $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));\r
- return in_array($ext, $this->headerOnlyClues);\r
- }\r
- return false;\r
- }\r
-}\r
-\r
-// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930\r
-if (!function_exists('gzdecode')) {\r
- function gzdecode($data,&$filename='',&$error='',$maxlength=null) \r
- {\r
- $len = strlen($data);\r
- if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {\r
- $error = "Not in GZIP format.";\r
- return null; // Not GZIP format (See RFC 1952)\r
- }\r
- $method = ord(substr($data,2,1)); // Compression method\r
- $flags = ord(substr($data,3,1)); // Flags\r
- if ($flags & 31 != $flags) {\r
- $error = "Reserved bits not allowed.";\r
- return null;\r
- }\r
- // NOTE: $mtime may be negative (PHP integer limitations)\r
- $mtime = unpack("V", substr($data,4,4));\r
- $mtime = $mtime[1];\r
- $xfl = substr($data,8,1);\r
- $os = substr($data,8,1);\r
- $headerlen = 10;\r
- $extralen = 0;\r
- $extra = "";\r
- if ($flags & 4) {\r
- // 2-byte length prefixed EXTRA data in header\r
- if ($len - $headerlen - 2 < 8) {\r
- return false; // invalid\r
- }\r
- $extralen = unpack("v",substr($data,8,2));\r
- $extralen = $extralen[1];\r
- if ($len - $headerlen - 2 - $extralen < 8) {\r
- return false; // invalid\r
- }\r
- $extra = substr($data,10,$extralen);\r
- $headerlen += 2 + $extralen;\r
- }\r
- $filenamelen = 0;\r
- $filename = "";\r
- if ($flags & 8) {\r
- // C-style string\r
- if ($len - $headerlen - 1 < 8) {\r
- return false; // invalid\r
- }\r
- $filenamelen = strpos(substr($data,$headerlen),chr(0));\r
- if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {\r
- return false; // invalid\r
- }\r
- $filename = substr($data,$headerlen,$filenamelen);\r
- $headerlen += $filenamelen + 1;\r
- }\r
- $commentlen = 0;\r
- $comment = "";\r
- if ($flags & 16) {\r
- // C-style string COMMENT data in header\r
- if ($len - $headerlen - 1 < 8) {\r
- return false; // invalid\r
- }\r
- $commentlen = strpos(substr($data,$headerlen),chr(0));\r
- if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {\r
- return false; // Invalid header format\r
- }\r
- $comment = substr($data,$headerlen,$commentlen);\r
- $headerlen += $commentlen + 1;\r
- }\r
- $headercrc = "";\r
- if ($flags & 2) {\r
- // 2-bytes (lowest order) of CRC32 on header present\r
- if ($len - $headerlen - 2 < 8) {\r
- return false; // invalid\r
- }\r
- $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;\r
- $headercrc = unpack("v", substr($data,$headerlen,2));\r
- $headercrc = $headercrc[1];\r
- if ($headercrc != $calccrc) {\r
- $error = "Header checksum failed.";\r
- return false; // Bad header CRC\r
- }\r
- $headerlen += 2;\r
- }\r
- // GZIP FOOTER\r
- $datacrc = unpack("V",substr($data,-8,4));\r
- $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);\r
- $isize = unpack("V",substr($data,-4));\r
- $isize = $isize[1];\r
- // decompression:\r
- $bodylen = $len-$headerlen-8;\r
- if ($bodylen < 1) {\r
- // IMPLEMENTATION BUG!\r
- return null;\r
- }\r
- $body = substr($data,$headerlen,$bodylen);\r
- $data = "";\r
- if ($bodylen > 0) {\r
- switch ($method) {\r
- case 8:\r
- // Currently the only supported compression method:\r
- $data = gzinflate($body,$maxlength);\r
- break;\r
- default:\r
- $error = "Unknown compression method.";\r
- return false;\r
- }\r
- } // zero-byte body content is allowed\r
- // Verifiy CRC32\r
- $crc = sprintf("%u",crc32($data));\r
- $crcOK = $crc == $datacrc;\r
- $lenOK = $isize == strlen($data);\r
- if (!$lenOK || !$crcOK) {\r
- $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');\r
- return false;\r
- }\r
- return $data;\r
- }\r
-}\r
-?>
\ No newline at end of file
+<?php
+/**
+ * Humble HTTP Agent
+ *
+ * This class is designed to take advantage of parallel HTTP requests
+ * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
+ * For environments which do not have these options, it reverts to standard sequential
+ * requests (using file_get_contents())
+ *
+ * @version 1.4
+ * @date 2013-05-10
+ * @see http://php.net/HttpRequestPool
+ * @author Keyvan Minoukadeh
+ * @copyright 2011-2013 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class HumbleHttpAgent
+{
+ const METHOD_REQUEST_POOL = 1;
+ const METHOD_CURL_MULTI = 2;
+ const METHOD_FILE_GET_CONTENTS = 4;
+ //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
+ const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
+ const UA_PHP = 'PHP/5.4';
+ const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
+
+ protected $requests = array();
+ protected $redirectQueue = array();
+ protected $requestOptions;
+ protected $maxParallelRequests = 5;
+ protected $cache = null; //TODO
+ protected $httpContext;
+ protected $minimiseMemoryUse = false; //TODO
+ protected $method;
+ protected $cookieJar;
+ public $debug = false;
+ public $debugVerbose = false;
+ public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
+ public $maxRedirects = 5;
+ public $userAgentMap = array();
+ public $rewriteUrls = array();
+ public $userAgentDefault;
+ public $referer;
+ //public $userAgent = 'Mozilla/5.0';
+
+ // Prevent certain file/mime types
+ // HTTP responses which match these content types will
+ // be returned without body.
+ public $headerOnlyTypes = array();
+ // URLs ending with one of these extensions will
+ // prompt Humble HTTP Agent to send a HEAD request first
+ // to see if returned content type matches $headerOnlyTypes.
+ public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
+ // AJAX triggers to search for.
+ // for AJAX sites, e.g. Blogger with its dynamic views templates.
+ public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
+
+ //TODO: set max file size
+ //TODO: normalise headers
+
+ /**
+  * Set up defaults, pick the transport, and build the request/stream options.
+  *
+  * @param array|null $requestOptions overrides merged over the default options
+  *                                   ('timeout', 'connecttimeout', 'dns_cache_timeout', 'redirect')
+  * @param int|null   $method         1, 2 or 4 (the METHOD_* values) to force a transport;
+  *                                   anything else triggers auto-detection
+  */
+ function __construct($requestOptions=null, $method=null) {
+     $this->userAgentDefault = self::UA_BROWSER;
+     $this->referer = self::REF_GOOGLE;
+
+     // transport: explicit choice first, then the best one available
+     if (in_array($method, array(1,2,4))) {
+         $transport = $method;
+     } elseif (class_exists('HttpRequestPool')) {
+         $transport = self::METHOD_REQUEST_POOL;
+     } elseif (function_exists('curl_multi_init')) {
+         $transport = self::METHOD_CURL_MULTI;
+     } else {
+         $transport = self::METHOD_FILE_GET_CONTENTS;
+     }
+     $this->method = $transport;
+     if ($this->method == self::METHOD_CURL_MULTI) {
+         require_once(dirname(__FILE__).'/RollingCurl.php');
+     }
+
+     $this->cookieJar = new CookieJar();
+
+     // 'redirect' stays 0 by default: redirects are followed manually so that
+     // hashbang URLs can be rewritten along the way
+     // TODO: test onprogress?
+     $defaults = array(
+         'timeout' => 15,
+         'connecttimeout' => 15,
+         'dns_cache_timeout' => 300,
+         'redirect' => 0
+     );
+     $this->requestOptions = is_array($requestOptions)
+         ? array_merge($defaults, $requestOptions)
+         : $defaults;
+
+     // options for the file_get_contents() transport
+     $httpOptions = array(
+         'ignore_errors' => true,
+         'timeout' => $this->requestOptions['timeout'],
+         'max_redirects' => $this->requestOptions['redirect'],
+         'header' => "Accept: */*\r\n"
+     );
+     $this->httpContext = array('http' => $httpOptions);
+ }
+
+ /**
+  * Echo a debug line ("* message") when $this->debug is enabled.
+  *
+  * With $this->debugVerbose the line also carries current and peak memory
+  * usage in KB. Output is pushed out immediately afterwards.
+  *
+  * @param mixed $msg anything echo can print (callers also pass exceptions)
+  */
+ protected function debug($msg) {
+     if (!$this->debug) {
+         return;
+     }
+     // measure before printing, as the message itself may allocate
+     $mem = round(memory_get_usage()/1024, 2);
+     $memPeak = round(memory_get_peak_usage()/1024, 2);
+     echo '* ',$msg;
+     if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
+     echo "\n";
+     // ob_flush() raises a notice when no output buffer is active
+     if (ob_get_level() > 0) {
+         ob_flush();
+     }
+     flush();
+ }
+
+ /**
+  * Pick the User-Agent for a URL from $this->userAgentMap.
+  *
+  * A leading "www." is dropped from the host, then the map is checked for the
+  * host itself and for ".parent.domain" (host minus its first label). The host
+  * is tried as written first and then lower-cased, because host names are
+  * case-insensitive. Falls back to $this->userAgentDefault.
+  *
+  * @param string $url
+  * @param bool   $asArray true: array('User-Agent' => ...); false: "User-Agent: ..." header line
+  * @return array|string
+  */
+ protected function getUserAgent($url, $asArray=false) {
+     $ua = $this->userAgentDefault;
+     $host = @parse_url($url, PHP_URL_HOST);
+     // parse_url() gives null/false when there is no usable host
+     if (is_string($host) && strtolower(substr($host, 0, 4)) == 'www.') {
+         $host = (string)substr($host, 4);
+     }
+     if (is_string($host) && $host !== '') {
+         $candidates = array();
+         foreach (array_unique(array($host, strtolower($host))) as $name) {
+             $candidates[] = $name;
+             $labels = explode('.', $name);
+             if (count($labels) > 1) {
+                 array_shift($labels);
+                 $candidates[] = '.'.implode('.', $labels);
+             }
+         }
+         foreach ($candidates as $candidate) {
+             if (isset($this->userAgentMap[$candidate])) {
+                 $ua = $this->userAgentMap[$candidate];
+                 break;
+             }
+         }
+     }
+     if ($asArray) {
+         return array('User-Agent' => $ua);
+     }
+     return 'User-Agent: '.$ua;
+ }
+
+ /**
+  * Rewrite a "#!" (hashbang) URL into its "_escaped_fragment_" form.
+  *
+  * URLs without "#!" are returned untouched. Otherwise the fragment (minus
+  * its first character) is moved into the query string as the
+  * "_escaped_fragment_" parameter and the fragment is removed.
+  *
+  * @param string $url
+  * @return string
+  */
+ public function rewriteHashbangFragment($url) {
+     if (strpos($url, '#!') === false) {
+         return $url;
+     }
+     // TODO: is SimplePie_IRI included?
+     $iri = new SimplePie_IRI($url);
+     $escaped = substr($iri->fragment, 1); // strip '!'
+     $iri->fragment = null;
+     $params = array();
+     if (isset($iri->query)) {
+         parse_str($iri->query, $params);
+     }
+     $params['_escaped_fragment_'] = (string)$escaped;
+     // keep slashes literal - needed for some sites
+     $encoded = http_build_query($params);
+     $iri->query = str_replace('%2F', '/', $encoded);
+     return $iri->get_iri();
+ }
+
+ /**
+  * Look for a redirect target inside an HTML document.
+  *
+  * A meta refresh URL wins; otherwise the AJAX-crawling ("ugly") URL is tried.
+  *
+  * @param string $url  URL the HTML was fetched from
+  * @param string $html HTML (or the start of it)
+  * @return string|false redirect URL, or false when neither lookup finds one
+  */
+ public function getRedirectURLfromHTML($url, $html) {
+     $target = $this->getMetaRefreshURL($url, $html);
+     if ($target) {
+         return $target;
+     }
+     return $this->getUglyURL($url, $html);
+ }
+
+ /**
+  * Extract the target of a <meta http-equiv="refresh"> tag.
+  *
+  * Example of a matching tag:
+  * <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
+  * Self-closing tags (... " />) are matched as well. Relative targets are
+  * resolved against $url.
+  *
+  * NOTE(review): only single-digit delays ("0;" .. "9;") match; presumably
+  * intentional so that slow page auto-reloads are not followed - confirm.
+  *
+  * @param string $url  URL the HTML was fetched from (base for relative targets)
+  * @param string $html HTML (or the start of it)
+  * @return string|false absolute redirect URL, or false when none is found
+  */
+ public function getMetaRefreshURL($url, $html) {
+     if ($html == '') return false;
+     // "\s*/?" before ">" lets XHTML-style self-closing tags match too
+     if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*\s*/?>!i', $html, $match)) {
+         return false;
+     }
+     // an unquoted URL can pick up whitespace before the closing ">"
+     $redirect_url = trim($match[1]);
+     if (preg_match('!^https?://!i', $redirect_url)) {
+         // already absolute
+         $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
+         return $redirect_url;
+     }
+     // absolutize redirect URL
+     $base = new SimplePie_IRI($url);
+     // remove '//' in URL path (causes URLs not to resolve properly)
+     if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
+     if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
+         $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
+         return $absolute;
+     }
+     return false;
+ }
+
+ /**
+  * Build the AJAX-crawling ("ugly") URL for a page that opts in.
+  *
+  * When $html contains one of $this->ajaxTriggers (a
+  * <meta name="fragment" content="!"> variant), an empty
+  * "_escaped_fragment_" parameter is added to the query string of $url.
+  *
+  * @param string $url
+  * @param string $html HTML (or the start of it)
+  * @return string|false rewritten URL, or false when no trigger is present
+  */
+ public function getUglyURL($url, $html) {
+     if ($html == '') return false;
+     $found = false;
+     foreach ($this->ajaxTriggers as $string) {
+         // compare against false: a trigger at offset 0 is still a match
+         if (stripos($html, $string) !== false) {
+             $found = true;
+             break;
+         }
+     }
+     if (!$found) return false;
+     $iri = new SimplePie_IRI($url);
+     if (isset($iri->query)) {
+         parse_str($iri->query, $query);
+     } else {
+         $query = array();
+     }
+     $query['_escaped_fragment_'] = '';
+     $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
+     $ugly_url = $iri->get_iri();
+     $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
+     return $ugly_url;
+ }
+
+ /**
+  * Drop everything from the first '#' onwards.
+  *
+  * @param string $url
+  * @return string $url without its fragment (unchanged when there is no '#')
+  */
+ public function removeFragment($url) {
+     $hashAt = strpos($url, '#');
+     return ($hashAt === false) ? $url : substr($url, 0, $hashAt);
+ }
+
+ /**
+  * Apply the first matching rule from $this->rewriteUrls to a URL.
+  *
+  * Each rule maps a substring to an array of strtr() replacement pairs. The
+  * first rule whose substring occurs in $url and whose action is an array is
+  * applied; rules with a non-array action are passed over.
+  *
+  * @param string $url
+  * @return string rewritten URL, or $url when no rule applies
+  */
+ public function rewriteUrls($url) {
+     foreach ($this->rewriteUrls as $needle => $replacements) {
+         if (strpos($url, $needle) === false) {
+             continue;
+         }
+         if (is_array($replacements)) {
+             return strtr($url, $replacements);
+         }
+     }
+     return $url;
+ }
+
+ /**
+  * Switch debug output on or off.
+  *
+  * @param mixed $bool truthy enables, falsy disables
+  */
+ public function enableDebug($bool=true) {
+     $this->debug = $bool ? true : false;
+ }
+
+ /**
+  * Set the minimise-memory flag (stored as a boolean, like enableDebug()).
+  *
+  * @param mixed $bool truthy enables, falsy disables
+  */
+ public function minimiseMemoryUse($bool = true) {
+     $this->minimiseMemoryUse = (bool)$bool;
+ }
+
+ /**
+  * Set how many URLs the parallel transports request per batch.
+  *
+  * Values below 1 are raised to 1: fetchAllOnce() takes batches with
+  * array_splice($urls, 0, $max) inside a while (count($urls) > 0) loop, so a
+  * batch size of zero or less would never drain the list.
+  *
+  * @param int $max
+  */
+ public function setMaxParallelRequests($max) {
+     $this->maxParallelRequests = max(1, (int)$max);
+ }
+
+ /**
+  * Sanitise a URL and check that it is a valid http/https URL.
+  *
+  * FILTER_VALIDATE_URL already requires a scheme, so the
+  * FILTER_FLAG_SCHEME_REQUIRED flag (deprecated in PHP 7.3, removed in 8.0)
+  * is not passed.
+  *
+  * @param string $url
+  * @return string|false the sanitised URL, or false when it is not acceptable
+  */
+ public function validateUrl($url) {
+     $url = filter_var($url, FILTER_SANITIZE_URL);
+     $test = filter_var($url, FILTER_VALIDATE_URL);
+     // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
+     if ($test === false) {
+         $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
+     }
+     if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
+         return $url;
+     } else {
+         return false;
+     }
+ }
+
+ /**
+  * Fetch all URLs, then keep following queued redirects.
+  *
+  * After the first pass, the redirect queue is re-fetched until it is empty
+  * or $this->maxRedirects rounds have been made.
+  *
+  * @param array $urls
+  */
+ public function fetchAll(array $urls) {
+     $this->fetchAllOnce($urls, false);
+     for ($round = 1; !empty($this->redirectQueue) && $round <= $this->maxRedirects; $round++) {
+         $this->debug("Following redirects #$round...");
+         $this->fetchAllOnce($this->redirectQueue, true);
+     }
+ }
+
+ /**
+  * Fetch all URLs once, without following redirects.
+  *
+  * Results are stored in $this->requests, keyed by the original URL. Any
+  * follow-up request that turns out to be needed (HTTP redirect, meta
+  * refresh / AJAX-crawling URL found in the first 4000 bytes of the body, or
+  * a HEAD request whose content type turned out to be wanted) is stored in
+  * $this->redirectQueue as original URL => next URL; fetchAll() drains it.
+  *
+  * The transport is chosen by $this->method: HttpRequestPool, curl_multi_*
+  * (RollingCurl) or sequential file_get_contents(). The two parallel
+  * transports work in batches of $this->maxParallelRequests.
+  *
+  * URLs already held in $this->requests are not requested again (first pass
+  * only) and, in all three transports, are left out of response processing.
+  *
+  * @param array $urls       list of URLs; on a redirect pass, original URL => URL to request
+  * @param bool  $isRedirect true when $urls is the redirect queue
+  * @return null|false false when the HttpRequestPool transport throws an HttpException
+  */
+ public function fetchAllOnce(array $urls, $isRedirect=false) {
+     if (!$isRedirect) $urls = array_unique($urls);
+     if (empty($urls)) return;
+
+     //////////////////////////////////////////////////////
+     // parallel (HttpRequestPool)
+     if ($this->method == self::METHOD_REQUEST_POOL) {
+         $this->debug('Starting parallel fetch (HttpRequestPool)');
+         try {
+             while (count($urls) > 0) {
+                 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
+                 $subset = array_splice($urls, 0, $this->maxParallelRequests);
+                 $pool = new HttpRequestPool();
+                 foreach ($subset as $orig => $url) {
+                     if (!$isRedirect) $orig = $url;
+                     unset($this->redirectQueue[$orig]);
+                     $this->debug("...$url");
+                     if (!$isRedirect && isset($this->requests[$url])) {
+                         $this->debug("......in memory");
+                     /*
+                     } elseif ($this->isCached($url)) {
+                         $this->debug("......is cached");
+                         if (!$this->minimiseMemoryUse) {
+                             $this->requests[$url] = $this->getCached($url);
+                         }
+                     */
+                     } else {
+                         $this->debug("......adding to pool");
+                         $req_url = $this->rewriteUrls($url);
+                         $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+                         $req_url = $this->removeFragment($req_url);
+                         // HEAD first when the extension hints at an unwanted type,
+                         // unless a previous HEAD already proved that guess wrong
+                         if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
+                             $_meth = HttpRequest::METH_HEAD;
+                         } else {
+                             $_meth = HttpRequest::METH_GET;
+                             unset($this->requests[$orig]['wrongGuess']);
+                         }
+                         $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
+                         // send cookies, if we have any
+                         if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+                             $this->debug("......sending cookies: $cookies");
+                             $httpRequest->addHeaders(array('Cookie' => $cookies));
+                         }
+                         //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
+                         $httpRequest->addHeaders($this->getUserAgent($req_url, true));
+                         // add referer for picky sites
+                         $httpRequest->addheaders(array('Referer' => $this->referer));
+                         $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
+                         $this->requests[$orig]['original_url'] = $orig;
+                         $pool->attach($httpRequest);
+                     }
+                 }
+                 // did we get anything into the pool?
+                 if (count($pool) > 0) {
+                     $this->debug('Sending request...');
+                     try {
+                         $pool->send();
+                     } catch (HttpRequestPoolException $e) {
+                         // do nothing
+                     }
+                     $this->debug('Received responses');
+                     foreach($subset as $orig => $url) {
+                         if (!$isRedirect) $orig = $url;
+                         // URLs served from memory were never attached to this pool
+                         if (!isset($this->requests[$orig]['httpRequest'])) continue;
+                         $request = $this->requests[$orig]['httpRequest'];
+                         //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
+                         // getResponseHeader() doesn't return status line, so, for consistency...
+                         $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
+                         // check content type
+                         // TODO: use getResponseHeader('content-type') or getResponseInfo()
+                         if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+                             $this->requests[$orig]['body'] = '';
+                             $_header_only_type = true;
+                             $this->debug('Header only type returned');
+                         } else {
+                             $this->requests[$orig]['body'] = $request->getResponseBody();
+                             $_header_only_type = false;
+                         }
+                         $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
+                         $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
+                         // is redirect?
+                         if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
+                             $redirectURL = $request->getResponseHeader('location');
+                             if (!preg_match('!^https?://!i', $redirectURL)) {
+                                 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+                             }
+                             if ($this->validateURL($redirectURL)) {
+                                 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
+                                 // store any cookies
+                                 $cookies = $request->getResponseHeader('set-cookie');
+                                 if ($cookies && !is_array($cookies)) $cookies = array($cookies);
+                                 if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
+                                 $this->redirectQueue[$orig] = $redirectURL;
+                             } else {
+                                 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+                             }
+                         } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
+                             // the response content-type did not match our 'header only' types,
+                             // but we'd issued a HEAD request because we assumed it would. So
+                             // let's queue a proper GET request for this item...
+                             $this->debug('Wrong guess at content-type, queing GET request');
+                             $this->requests[$orig]['wrongGuess'] = true;
+                             $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
+                         } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+                             // check for <meta name='fragment' content='!'/>
+                             // for AJAX sites, e.g. Blogger with its dynamic views templates.
+                             // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+                             if (isset($this->requests[$orig]['body'])) {
+                                 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+                                 if ($redirectURL) {
+                                     $this->redirectQueue[$orig] = $redirectURL;
+                                 }
+                             }
+                         }
+                         //die($url.' -multi- '.$request->getResponseInfo('effective_url'));
+                         $pool->detach($request);
+                         unset($this->requests[$orig]['httpRequest'], $request);
+                         /*
+                         if ($this->minimiseMemoryUse) {
+                             if ($this->cache($url)) {
+                                 unset($this->requests[$url]);
+                             }
+                         }
+                         */
+                     }
+                 }
+             }
+         } catch (HttpException $e) {
+             $this->debug($e);
+             return false;
+         }
+     }
+
+     //////////////////////////////////////////////////////////
+     // parallel (curl_multi_*)
+     elseif ($this->method == self::METHOD_CURL_MULTI) {
+         $this->debug('Starting parallel fetch (curl_multi_*)');
+         while (count($urls) > 0) {
+             $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
+             $subset = array_splice($urls, 0, $this->maxParallelRequests);
+             $pool = new RollingCurl(array($this, 'handleCurlResponse'));
+             $pool->window_size = count($subset);
+
+             foreach ($subset as $orig => $url) {
+                 if (!$isRedirect) $orig = $url;
+                 unset($this->redirectQueue[$orig]);
+                 $this->debug("...$url");
+                 if (!$isRedirect && isset($this->requests[$url])) {
+                     $this->debug("......in memory");
+                 /*
+                 } elseif ($this->isCached($url)) {
+                     $this->debug("......is cached");
+                     if (!$this->minimiseMemoryUse) {
+                         $this->requests[$url] = $this->getCached($url);
+                     }
+                 */
+                 } else {
+                     $this->debug("......adding to pool");
+                     $req_url = $this->rewriteUrls($url);
+                     $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+                     $req_url = $this->removeFragment($req_url);
+                     // HEAD first when the extension hints at an unwanted type,
+                     // unless a previous HEAD already proved that guess wrong
+                     if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
+                         $_meth = 'HEAD';
+                     } else {
+                         $_meth = 'GET';
+                         unset($this->requests[$orig]['wrongGuess']);
+                     }
+                     $headers = array();
+                     //$headers[] = 'User-Agent: '.$this->userAgent;
+                     $headers[] = $this->getUserAgent($req_url);
+                     // add referer for picky sites
+                     $headers[] = 'Referer: '.$this->referer;
+                     // send cookies, if we have any
+                     if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+                         $this->debug("......sending cookies: $cookies");
+                         $headers[] = 'Cookie: '.$cookies;
+                     }
+                     // the connect timeout has its own request option
+                     $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
+                         CURLOPT_CONNECTTIMEOUT => $this->requestOptions['connecttimeout'],
+                         CURLOPT_TIMEOUT => $this->requestOptions['timeout']
+                     ));
+                     $httpRequest->set_original_url($orig);
+                     $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
+                     $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
+                     $pool->add($httpRequest);
+                 }
+             }
+             // did we get anything into the pool?
+             if (count($pool) > 0) {
+                 $this->debug('Sending request...');
+                 $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
+                 $this->debug('Received responses');
+                 foreach($subset as $orig => $url) {
+                     if (!$isRedirect) $orig = $url;
+                     // URLs served from memory were never added to this pool
+                     if (!isset($this->requests[$orig]['httpRequest'])) continue;
+                     // $this->requests[$orig]['headers']
+                     // $this->requests[$orig]['body']
+                     // $this->requests[$orig]['effective_url']
+                     // check content type
+                     if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+                         $this->requests[$orig]['body'] = '';
+                         $_header_only_type = true;
+                         $this->debug('Header only type returned');
+                     } else {
+                         $_header_only_type = false;
+                     }
+                     $status_code = $this->requests[$orig]['status_code'];
+                     if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
+                         $redirectURL = $this->requests[$orig]['location'];
+                         if (!preg_match('!^https?://!i', $redirectURL)) {
+                             $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+                         }
+                         if ($this->validateURL($redirectURL)) {
+                             $this->debug('Redirect detected. Valid URL: '.$redirectURL);
+                             // store any cookies
+                             $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
+                             if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
+                             $this->redirectQueue[$orig] = $redirectURL;
+                         } else {
+                             $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+                         }
+                     } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
+                         // the response content-type did not match our 'header only' types,
+                         // but we'd issued a HEAD request because we assumed it would. So
+                         // let's queue a proper GET request for this item...
+                         $this->debug('Wrong guess at content-type, queing GET request');
+                         $this->requests[$orig]['wrongGuess'] = true;
+                         $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
+                     } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+                         // check for <meta name='fragment' content='!'/>
+                         // for AJAX sites, e.g. Blogger with its dynamic views templates.
+                         // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+                         if (isset($this->requests[$orig]['body'])) {
+                             $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+                             if ($redirectURL) {
+                                 $this->redirectQueue[$orig] = $redirectURL;
+                             }
+                         }
+                     }
+                     // die($url.' -multi- '.$request->getResponseInfo('effective_url'));
+                     unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
+                 }
+             }
+         }
+     }
+
+     //////////////////////////////////////////////////////
+     // sequential (file_get_contents)
+     else {
+         $this->debug('Starting sequential fetch (file_get_contents)');
+         $this->debug('Processing set of '.count($urls));
+         foreach ($urls as $orig => $url) {
+             if (!$isRedirect) $orig = $url;
+             unset($this->redirectQueue[$orig]);
+             $this->debug("...$url");
+             if (!$isRedirect && isset($this->requests[$url])) {
+                 $this->debug("......in memory");
+             /*
+             } elseif ($this->isCached($url)) {
+                 $this->debug("......is cached");
+                 if (!$this->minimiseMemoryUse) {
+                     $this->requests[$url] = $this->getCached($url);
+                 }
+             */
+             } else {
+                 $this->debug("Sending request for $url");
+                 $this->requests[$orig]['original_url'] = $orig;
+                 $req_url = $this->rewriteUrls($url);
+                 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+                 $req_url = $this->removeFragment($req_url);
+                 // per-request copy of the stream context; headers are appended below
+                 $httpContext = $this->httpContext;
+                 $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
+                 // add referer for picky sites
+                 $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
+                 // send cookies, if we have any
+                 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+                     $this->debug("......sending cookies: $cookies");
+                     $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
+                 }
+                 if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
+                     $this->debug('Received response');
+                     // get status code
+                     if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
+                         $this->debug('Error: no status code found');
+                         // TODO: handle error - no status code
+                     } else {
+                         $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
+                         // check content type
+                         if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+                             $this->requests[$orig]['body'] = '';
+                         } else {
+                             $this->requests[$orig]['body'] = $html;
+                         }
+                         $this->requests[$orig]['effective_url'] = $req_url;
+                         $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
+                         unset($match);
+                         // handle redirect
+                         if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
+                             $this->requests[$orig]['location'] = trim($match[1]);
+                         }
+                         if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
+                             $redirectURL = $this->requests[$orig]['location'];
+                             if (!preg_match('!^https?://!i', $redirectURL)) {
+                                 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+                             }
+                             if ($this->validateURL($redirectURL)) {
+                                 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
+                                 // store any cookies
+                                 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
+                                 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
+                                 $this->redirectQueue[$orig] = $redirectURL;
+                             } else {
+                                 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+                             }
+                         } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+                             // check for <meta name='fragment' content='!'/>
+                             // for AJAX sites, e.g. Blogger with its dynamic views templates.
+                             // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+                             if (isset($this->requests[$orig]['body'])) {
+                                 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+                                 if ($redirectURL) {
+                                     $this->redirectQueue[$orig] = $redirectURL;
+                                 }
+                             }
+                         }
+                     }
+                 } else {
+                     $this->debug('Error retrieving URL');
+                     //print_r($req_url);
+                     //print_r($http_response_header);
+                     //print_r($html);
+
+                     // TODO: handle error - failed to retrieve URL
+                 }
+             }
+         }
+     }
+ }
+
+ /**
+  * RollingCurl callback: store one finished response in $this->requests.
+  *
+  * The entry is keyed by the request's original URL and receives 'headers',
+  * 'body', 'method', 'effective_url', 'status_code' and, when a Location
+  * header is present, 'location'.
+  *
+  * @param string $response raw response (headers followed by body)
+  * @param array  $info     transfer info; 'header_size', 'url' and 'http_code' are read
+  * @param object $request  request object; 'url_original' and 'method' are read
+  */
+ public function handleCurlResponse($response, $info, $request) {
+     $orig = $request->url_original;
+     $headerSize = $info['header_size'];
+     $entry = isset($this->requests[$orig]) ? $this->requests[$orig] : array();
+
+     $entry['headers'] = substr($response, 0, $headerSize);
+     $entry['body'] = substr($response, $headerSize);
+     $entry['method'] = $request->method;
+     $entry['effective_url'] = $info['url'];
+     $entry['status_code'] = (int)$info['http_code'];
+     if (preg_match('/^Location:(.*?)$/mi', $entry['headers'], $found)) {
+         $entry['location'] = trim($found[1]);
+     }
+
+     $this->requests[$orig] = $entry;
+ }
+
+ /**
+  * Turn a header array into a newline-separated string.
+  *
+  * @param array $headers     list of raw header lines, or name => value(s) map
+  * @param bool  $associative true: $headers is name => value (a value may be an
+  *                           array of values, giving one line each);
+  *                           false: $headers is already a list of lines
+  * @return string
+  */
+ protected function headersToString(array $headers, $associative=true) {
+     if (!$associative) {
+         return implode("\n", $headers);
+     }
+     $lines = array();
+     foreach ($headers as $name => $value) {
+         $values = is_array($value) ? $value : array($value);
+         foreach ($values as $single) {
+             $lines[] = "$name: $single";
+         }
+     }
+     return rtrim(implode("\n", $lines));
+ }
+
+ /**
+  * Return the response for a URL, fetching it first if it is not in memory.
+  *
+  * @param string $url
+  * @param bool   $remove   drop the stored response from memory after reading it
+  * @param bool   $gzdecode decode the body when the headers say "Content-Encoding: gzip"
+  * @return array|false the stored response array ('headers', 'body', 'effective_url',
+  *                     'status_code', ...), or false when the request failed
+  */
+ public function get($url, $remove=false, $gzdecode=true) {
+     $url = "$url";
+     if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
+         $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
+         $response = $this->requests[$url];
+     /*
+     } elseif ($this->isCached($url)) {
+         $this->debug("URL already fetched - in disk cache ($url)");
+         $response = $this->getCached($url);
+         $this->requests[$url] = $response;
+     */
+     } else {
+         $this->debug("Fetching URL ($url)");
+         $this->fetchAll(array($url));
+         if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
+             $response = $this->requests[$url];
+         } else {
+             $this->debug("Request failed");
+             $response = false;
+         }
+     }
+     /*
+     if ($this->minimiseMemoryUse && $response) {
+         $this->cache($url);
+         unset($this->requests[$url]);
+     }
+     */
+     if ($remove && $response) unset($this->requests[$url]);
+     // a failed request leaves $response === false: there is nothing to decode
+     if ($gzdecode && $response && isset($response['headers']) && stripos($response['headers'], 'Content-Encoding: gzip')) {
+         if ($html = gzdecode($response['body'])) {
+             $response['body'] = $html;
+         }
+     }
+     return $response;
+ }
+
+ /**
+  * Tell whether a parallel transport (HttpRequestPool or curl_multi_*) is available.
+  *
+  * @return bool
+  */
+ public function parallelSupport() {
+     if (class_exists('HttpRequestPool')) {
+         return true;
+     }
+     return function_exists('curl_multi_init');
+ }
+
+ /**
+  * Check the Content-Type header against $this->headerOnlyTypes.
+  *
+  * Both the full mime type (e.g. image/jpeg) and the bare type (e.g. image)
+  * are looked up, lower-cased.
+  *
+  * @param string $headers raw response headers
+  * @return bool true when the body of this response should be discarded
+  */
+ private function headerOnlyType($headers) {
+     if (!preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $parts)) {
+         return false;
+     }
+     $fullMime = strtolower(trim($parts[1]));
+     $mainType = strtolower(trim($parts[2]));
+     return in_array($fullMime, $this->headerOnlyTypes)
+         || in_array($mainType, $this->headerOnlyTypes);
+ }
+
+ /**
+  * Guess from the file extension whether a URL may point at an unwanted type.
+  *
+  * @param string $url
+  * @return bool true when the path's extension is listed in $this->headerOnlyClues
+  */
+ private function possibleUnsupportedType($url) {
+     $path = @parse_url($url, PHP_URL_PATH);
+     if (!$path || strpos($path, '.') === false) {
+         return false;
+     }
+     $extension = pathinfo($path, PATHINFO_EXTENSION);
+     return in_array(strtolower(trim($extension)), $this->headerOnlyClues);
+ }
+}
+
+ // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
+ // Only defined when PHP does not provide gzdecode() itself.
+ if (!function_exists('gzdecode')) {
+     /**
+      * Decode a GZIP stream (RFC 1952).
+      *
+      * @param string   $data      GZIP-encoded data
+      * @param string   $filename  receives the original file name stored in the header, if any
+      * @param string   $error     receives a message for some failure cases
+      * @param int|null $maxlength maximum decoded length handed to gzinflate() (null/0: no limit)
+      * @return string|false|null decoded data; null when the data is not acceptable GZIP
+      *                           (bad magic, reserved flag bits, empty body); false when the
+      *                           header is truncated/invalid or a checksum/length check fails
+      */
+     function gzdecode($data,&$filename='',&$error='',$maxlength=null)
+     {
+         $len = strlen($data);
+         if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
+             $error = "Not in GZIP format.";
+             return null; // Not GZIP format (See RFC 1952)
+         }
+         $method = ord(substr($data,2,1)); // Compression method
+         $flags = ord(substr($data,3,1)); // Flags
+         // parentheses matter: != binds tighter than &
+         if (($flags & 31) != $flags) {
+             $error = "Reserved bits not allowed.";
+             return null;
+         }
+         // bytes 4-9 (MTIME, XFL, OS) are not needed for decoding
+         $headerlen = 10;
+         if ($flags & 4) {
+             // 2-byte length prefixed EXTRA data follows the fixed 10-byte header
+             if ($len - $headerlen - 2 < 8) {
+                 return false; // invalid
+             }
+             $extralen = unpack("v",substr($data,$headerlen,2));
+             $extralen = $extralen[1];
+             if ($len - $headerlen - 2 - $extralen < 8) {
+                 return false; // invalid
+             }
+             $headerlen += 2 + $extralen;
+         }
+         $filename = "";
+         if ($flags & 8) {
+             // C-style string: original file name
+             if ($len - $headerlen - 1 < 8) {
+                 return false; // invalid
+             }
+             $filenamelen = strpos(substr($data,$headerlen),chr(0));
+             if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
+                 return false; // invalid
+             }
+             $filename = substr($data,$headerlen,$filenamelen);
+             $headerlen += $filenamelen + 1;
+         }
+         if ($flags & 16) {
+             // C-style string COMMENT data in header (skipped)
+             if ($len - $headerlen - 1 < 8) {
+                 return false; // invalid
+             }
+             $commentlen = strpos(substr($data,$headerlen),chr(0));
+             if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
+                 return false; // Invalid header format
+             }
+             $headerlen += $commentlen + 1;
+         }
+         if ($flags & 2) {
+             // 2-bytes (lowest order) of CRC32 on header present
+             if ($len - $headerlen - 2 < 8) {
+                 return false; // invalid
+             }
+             $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
+             $headercrc = unpack("v", substr($data,$headerlen,2));
+             $headercrc = $headercrc[1];
+             if ($headercrc != $calccrc) {
+                 $error = "Header checksum failed.";
+                 return false; // Bad header CRC
+             }
+             $headerlen += 2;
+         }
+         // GZIP FOOTER: CRC32 and length of the uncompressed data
+         $datacrc = unpack("V",substr($data,-8,4));
+         $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
+         $isize = unpack("V",substr($data,-4));
+         $isize = $isize[1];
+         // decompression:
+         $bodylen = $len-$headerlen-8;
+         if ($bodylen < 1) {
+             // IMPLEMENTATION BUG!
+             return null;
+         }
+         $body = substr($data,$headerlen,$bodylen);
+         if ($method != 8) {
+             // deflate (8) is the only supported compression method
+             $error = "Unknown compression method.";
+             return false;
+         }
+         $data = gzinflate($body,(int)$maxlength);
+         // Verify CRC32 and length
+         $crc = sprintf("%u",crc32($data));
+         $crcOK = $crc == $datacrc;
+         $lenOK = $isize == strlen($data);
+         if (!$lenOK || !$crcOK) {
+             $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
+             return false;
+         }
+         return $data;
+     }
+ }
\ No newline at end of file