]>
Commit | Line | Data |
---|---|---|
1 | <?php\r | |
2 | /**\r | |
3 | * Humble HTTP Agent\r | |
4 | * \r | |
5 | * This class is designed to take advantage of parallel HTTP requests\r | |
6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. \r | |
7 | * For environments which do not have these options, it reverts to standard sequential \r | |
8 | * requests (using file_get_contents())\r | |
9 | * \r | |
10 | * @version 1.1\r | |
11 | * @date 2012-08-20\r | |
12 | * @see http://php.net/HttpRequestPool\r | |
13 | * @author Keyvan Minoukadeh\r | |
14 | * @copyright 2011-2012 Keyvan Minoukadeh\r | |
15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r | |
16 | */\r | |
17 | \r | |
/**
 * HTTP client that fetches batches of URLs and follows redirects itself.
 *
 * Three transports are supported: PECL HttpRequestPool, curl_multi (through
 * RollingCurl) and sequential file_get_contents(). Responses are kept in
 * memory, keyed by the URL originally requested, as arrays holding
 * 'headers', 'body', 'effective_url', 'status_code' and 'original_url'.
 */
class HumbleHttpAgent
{
    const METHOD_REQUEST_POOL = 1;
    const METHOD_CURL_MULTI = 2;
    const METHOD_FILE_GET_CONTENTS = 4;
    //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
    const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
    const UA_PHP = 'PHP/5.2';
    const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';

    // responses (and in-flight request objects) keyed by original URL
    protected $requests = array();
    // original URL => URL to fetch in the next redirect round
    protected $redirectQueue = array();
    protected $requestOptions;
    protected $maxParallelRequests = 5;
    protected $cache = null; //TODO
    protected $httpContext;
    protected $minimiseMemoryUse = false; //TODO
    protected $method;
    protected $cookieJar;
    public $debug = false;
    public $debugVerbose = false;
    public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
    public $maxRedirects = 5;
    public $userAgentMap = array();
    public $rewriteUrls = array();
    public $userAgentDefault;
    public $referer;

    // Prevent certain file/mime types
    // HTTP responses which match these content types will
    // be returned without body.
    public $headerOnlyTypes = array();
    // URLs ending with one of these extensions will
    // prompt Humble HTTP Agent to send a HEAD request first
    // to see if returned content type matches $headerOnlyTypes.
    public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
    // AJAX triggers to search for.
    // for AJAX sites, e.g. Blogger with its dynamic views templates.
    public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');

    //TODO: set max file size
    //TODO: normalise headers
    //TODO: disk cache (isCached()/getCached()/cache()) honouring $minimiseMemoryUse

    /**
     * @param array|null $requestOptions Overrides for the default options ('timeout', 'redirect').
     * @param int|null   $method         One of the METHOD_* constants; any other value
     *                                   auto-detects the best available transport.
     */
    public function __construct($requestOptions=null, $method=null) {
        $this->userAgentDefault = self::UA_BROWSER;
        $this->referer = self::REF_GOOGLE;
        // set the request method
        $supported = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
        // is_numeric() keeps numeric strings working but stops `true` from loosely matching 1
        if (is_numeric($method) && in_array($method, $supported)) {
            $this->method = (int)$method;
        } elseif (class_exists('HttpRequestPool')) {
            $this->method = self::METHOD_REQUEST_POOL;
        } elseif (function_exists('curl_multi_init')) {
            $this->method = self::METHOD_CURL_MULTI;
        } else {
            $this->method = self::METHOD_FILE_GET_CONTENTS;
        }
        if ($this->method == self::METHOD_CURL_MULTI) {
            require_once(dirname(__FILE__).'/RollingCurl.php');
        }
        // create cookie jar
        $this->cookieJar = new CookieJar();
        // set request options (redirect must be 0)
        $this->requestOptions = array(
            'timeout' => 15,
            'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
            // TODO: test onprogress?
        );
        if (is_array($requestOptions)) {
            $this->requestOptions = array_merge($this->requestOptions, $requestOptions);
        }
        $this->httpContext = array(
            'http' => array(
                'ignore_errors' => true,
                'timeout' => $this->requestOptions['timeout'],
                'max_redirects' => $this->requestOptions['redirect'],
                'header' => "Accept: */*\r\n"
            )
        );
    }

    /**
     * Echo a debug line when $debug is enabled (with memory figures when
     * $debugVerbose is also enabled) and push it to the client immediately.
     */
    protected function debug($msg) {
        if (!$this->debug) return;
        echo '* ',$msg;
        if ($this->debugVerbose) {
            $mem = round(memory_get_usage()/1024, 2);
            $memPeak = round(memory_get_peak_usage()/1024, 2);
            echo ' - mem used: ',$mem," (peak: $memPeak)";
        }
        echo "\n";
        // ob_flush() raises a notice when no output buffer is active
        if (ob_get_level() > 0) ob_flush();
        flush();
    }

    /**
     * Pick the User-Agent for a URL: an exact host match in $userAgentMap
     * (leading 'www.' ignored), then a '.parent.domain' match, then the default.
     *
     * @param string $url
     * @param bool   $asArray Return array('User-Agent' => ...) instead of a header line.
     * @return array|string
     */
    protected function getUserAgent($url, $asArray=false) {
        $ua = $this->userAgentDefault;
        $host = @parse_url($url, PHP_URL_HOST);
        if (is_string($host) && strtolower(substr($host, 0, 4)) == 'www.') {
            $host = (string)substr($host, 4);
        }
        if (is_string($host) && $host !== '') {
            $try = array($host);
            $split = explode('.', $host);
            if (count($split) > 1) {
                array_shift($split);
                $try[] = '.'.implode('.', $split);
            }
            foreach ($try as $h) {
                if (isset($this->userAgentMap[$h])) {
                    $ua = $this->userAgentMap[$h];
                    break;
                }
            }
        }
        if ($asArray) {
            return array('User-Agent' => $ua);
        }
        return 'User-Agent: '.$ua;
    }

    /**
     * Rewrite a '#!fragment' URL to its '?_escaped_fragment_=fragment' form.
     * URLs without '#!' are returned untouched.
     */
    public function rewriteHashbangFragment($url) {
        // return $url if there's no '#!'
        if (strpos($url, '#!') === false) return $url;
        // split $url and rewrite
        // TODO: is SimplePie_IRI included?
        $iri = new SimplePie_IRI($url);
        $fragment = substr($iri->fragment, 1); // strip '!'
        $iri->fragment = null;
        return $this->withEscapedFragment($iri, $fragment);
    }

    /**
     * If the HTML contains one of $ajaxTriggers, return $url with an empty
     * '_escaped_fragment_' query parameter added; otherwise return false.
     *
     * @return string|false
     */
    public function getUglyURL($url, $html) {
        if ($html == '') return false;
        $found = false;
        foreach ($this->ajaxTriggers as $string) {
            // compare against false: a trigger at offset 0 is still a match
            if (stripos($html, $string) !== false) {
                $found = true;
                break;
            }
        }
        if (!$found) return false;
        return $this->withEscapedFragment(new SimplePie_IRI($url), '');
    }

    /**
     * Set the '_escaped_fragment_' query parameter on an IRI and return it as a string.
     */
    private function withEscapedFragment($iri, $fragment) {
        if (isset($iri->query)) {
            parse_str($iri->query, $query);
        } else {
            $query = array();
        }
        $query['_escaped_fragment_'] = (string)$fragment;
        $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
        return $iri->get_iri();
    }

    /** Return $url without its '#...' part. */
    public function removeFragment($url) {
        $pos = strpos($url, '#');
        return ($pos === false) ? $url : substr($url, 0, $pos);
    }

    /**
     * Apply the first $rewriteUrls rule whose key occurs in $url and whose
     * action is a strtr() replacement map; non-array actions are ignored.
     */
    public function rewriteUrls($url) {
        foreach ($this->rewriteUrls as $find => $action) {
            if (strpos($url, $find) !== false && is_array($action)) {
                return strtr($url, $action);
            }
        }
        return $url;
    }

    public function enableDebug($bool=true) {
        $this->debug = (bool)$bool;
    }

    public function minimiseMemoryUse($bool = true) {
        $this->minimiseMemoryUse = $bool;
    }

    /**
     * Set the batch size for parallel fetches. Values below 1 are raised to 1:
     * a batch size of 0 would make the batching loops spin forever.
     */
    public function setMaxParallelRequests($max) {
        $this->maxParallelRequests = max(1, (int)$max);
    }

    /**
     * Sanitise and validate an http(s) URL.
     *
     * @return string|false The sanitised URL, or false when it is not a valid http/https URL.
     */
    public function validateUrl($url) {
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // FILTER_VALIDATE_URL already requires a scheme; the explicit
        // FILTER_FLAG_SCHEME_REQUIRED flag was removed in PHP 8.
        $test = filter_var($url, FILTER_VALIDATE_URL);
        // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
        if ($test === false) {
            $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
        }
        if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
            return $url;
        }
        return false;
    }

    /**
     * Fetch all URLs, then keep fetching whatever lands in the redirect queue
     * for at most $maxRedirects further rounds.
     */
    public function fetchAll(array $urls) {
        $this->fetchAllOnce($urls, false);
        $redirects = 0;
        while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
            $this->debug("Following redirects #$redirects...");
            $this->fetchAllOnce($this->redirectQueue, true);
        }
    }

    /**
     * Fetch all URLs without following redirects.
     *
     * @param array $urls       List of URLs, or (when $isRedirect) a map of
     *                          original URL => URL to fetch now.
     * @param bool  $isRedirect Whether this is a follow-up round.
     * @return false|null false when the HttpRequestPool transport throws an HttpException.
     */
    public function fetchAllOnce(array $urls, $isRedirect=false) {
        if (!$isRedirect) $urls = array_unique($urls);
        if (empty($urls)) return;

        if ($this->method == self::METHOD_REQUEST_POOL) {
            return $this->fetchWithRequestPool($urls, $isRedirect);
        } elseif ($this->method == self::METHOD_CURL_MULTI) {
            $this->fetchWithCurlMulti($urls, $isRedirect);
        } else {
            $this->fetchSequentially($urls, $isRedirect);
        }
    }

    /** Apply URL rewrites, the hashbang rewrite (if enabled) and fragment removal. */
    protected function prepareRequestUrl($url) {
        $reqUrl = $this->rewriteUrls($url);
        if ($this->rewriteHashbangFragment) {
            $reqUrl = $this->rewriteHashbangFragment($reqUrl);
        }
        return $this->removeFragment($reqUrl);
    }

    /** Whether to probe with HEAD first (extension hints at a header-only type, no earlier wrong guess). */
    protected function shouldSendHead($orig, $reqUrl) {
        return !empty($this->headerOnlyTypes)
            && !isset($this->requests[$orig]['wrongGuess'])
            && $this->possibleUnsupportedType($reqUrl);
    }

    /** Status codes treated as redirects: 300-303, 307 and 308-399. */
    protected function isRedirectStatus($statusCode) {
        return in_array($statusCode, array(300, 301, 302, 303, 307)) || ($statusCode > 307 && $statusCode < 400);
    }

    /**
     * Make a Location value absolute (relative to $baseUrl) and validate it.
     *
     * @return string|false The absolute redirect URL, or false when invalid.
     */
    protected function resolveRedirect($location, $baseUrl) {
        if (!preg_match('!^https?://!i', $location)) {
            $location = SimplePie_Misc::absolutize_url($location, $baseUrl);
        }
        if ($this->validateUrl($location)) {
            $this->debug('Redirect detected. Valid URL: '.$location);
            return $location;
        }
        $this->debug('Redirect detected. Invalid URL: '.$location);
        return false;
    }

    /**
     * Queue the '_escaped_fragment_' variant of a fetched page when its first
     * 4000 bytes contain an AJAX trigger (<meta name='fragment' content='!'/>).
     * Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
     */
    protected function queueAjaxRedirect($orig) {
        if (!isset($this->requests[$orig]['effective_url'], $this->requests[$orig]['body'])) return;
        $effectiveUrl = $this->requests[$orig]['effective_url'];
        if (strpos($effectiveUrl, '_escaped_fragment_') !== false) return;
        $redirectURL = $this->getUglyURL($effectiveUrl, substr($this->requests[$orig]['body'], 0, 4000));
        if ($redirectURL) {
            $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
            $this->redirectQueue[$orig] = $redirectURL;
        }
    }

    /** Queue a GET for an item whose HEAD probe did not return a header-only type. */
    protected function queueWrongGuess($orig) {
        $this->debug('Wrong guess at content-type, queing GET request');
        $this->requests[$orig]['wrongGuess'] = true;
        $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
    }

    /**
     * Parallel fetch using PECL HttpRequestPool, in batches of $maxParallelRequests.
     *
     * @return false|null false when an HttpException is thrown.
     */
    protected function fetchWithRequestPool(array $urls, $isRedirect) {
        $this->debug('Starting parallel fetch (HttpRequestPool)');
        try {
            while (count($urls) > 0) {
                $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
                $subset = array_splice($urls, 0, $this->maxParallelRequests);
                $pool = new HttpRequestPool();
                // only URLs actually sent in this batch; "in memory" ones have no request object
                $dispatched = array();
                foreach ($subset as $orig => $url) {
                    if (!$isRedirect) $orig = $url;
                    unset($this->redirectQueue[$orig]);
                    $this->debug("...$url");
                    if (!$isRedirect && isset($this->requests[$url])) {
                        $this->debug("......in memory");
                        continue;
                    }
                    $this->debug("......adding to pool");
                    $req_url = $this->prepareRequestUrl($url);
                    $_meth = $this->shouldSendHead($orig, $req_url) ? HttpRequest::METH_HEAD : HttpRequest::METH_GET;
                    $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
                    // send cookies, if we have any
                    if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                        $this->debug("......sending cookies: $cookies");
                        $httpRequest->addHeaders(array('Cookie' => $cookies));
                    }
                    $httpRequest->addHeaders($this->getUserAgent($req_url, true));
                    // add referer for picky sites
                    $httpRequest->addHeaders(array('Referer' => $this->referer));
                    $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                    $this->requests[$orig]['original_url'] = $orig;
                    $pool->attach($httpRequest);
                    $dispatched[$orig] = $url;
                }
                // did we get anything into the pool?
                if (empty($dispatched)) continue;

                $this->debug('Sending request...');
                try {
                    $pool->send();
                } catch (HttpRequestPoolException $e) {
                    // best effort: individual responses are still inspected below
                    $this->debug('Request pool error: '.$e->getMessage());
                }
                $this->debug('Received responses');
                foreach ($dispatched as $orig => $url) {
                    $request = $this->requests[$orig]['httpRequest'];
                    // getResponseHeader() doesn't return status line, so, for consistency...
                    $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
                    // check content type
                    // TODO: use getResponseHeader('content-type') or getResponseInfo()
                    if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                        $this->requests[$orig]['body'] = '';
                        $_header_only_type = true;
                        $this->debug('Header only type returned');
                    } else {
                        $this->requests[$orig]['body'] = $request->getResponseBody();
                        $_header_only_type = false;
                    }
                    $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
                    $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
                    // is redirect?
                    if ($this->isRedirectStatus($status_code) && ($location = $request->getResponseHeader('location'))) {
                        $redirectURL = $this->resolveRedirect($location, $url);
                        if ($redirectURL) {
                            // store any cookies
                            $cookies = $request->getResponseHeader('set-cookie');
                            if ($cookies && !is_array($cookies)) $cookies = array($cookies);
                            if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
                            $this->redirectQueue[$orig] = $redirectURL;
                        }
                    } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
                        // the response content-type did not match our 'header only' types,
                        // but we'd issued a HEAD request because we assumed it would. So
                        // let's queue a proper GET request for this item...
                        $this->queueWrongGuess($orig);
                    } else {
                        $this->queueAjaxRedirect($orig);
                    }
                    $pool->detach($request);
                    unset($this->requests[$orig]['httpRequest'], $request);
                }
            }
        } catch (HttpException $e) {
            $this->debug($e);
            return false;
        }
    }

    /**
     * Parallel fetch using curl_multi via RollingCurl, in batches of
     * $maxParallelRequests. handleCurlResponse() stores each raw response.
     */
    protected function fetchWithCurlMulti(array $urls, $isRedirect) {
        $this->debug('Starting parallel fetch (curl_multi_*)');
        while (count($urls) > 0) {
            $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
            $subset = array_splice($urls, 0, $this->maxParallelRequests);
            $pool = new RollingCurl(array($this, 'handleCurlResponse'));
            $pool->window_size = count($subset);
            // only URLs actually sent in this batch; avoids count() on the
            // RollingCurl object and re-processing "in memory" entries
            $dispatched = array();

            foreach ($subset as $orig => $url) {
                if (!$isRedirect) $orig = $url;
                unset($this->redirectQueue[$orig]);
                $this->debug("...$url");
                if (!$isRedirect && isset($this->requests[$url])) {
                    $this->debug("......in memory");
                    continue;
                }
                $this->debug("......adding to pool");
                $req_url = $this->prepareRequestUrl($url);
                $_meth = $this->shouldSendHead($orig, $req_url) ? 'HEAD' : 'GET';
                $headers = array();
                $headers[] = $this->getUserAgent($req_url);
                // add referer for picky sites
                $headers[] = 'Referer: '.$this->referer;
                // send cookies, if we have any
                if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                    $this->debug("......sending cookies: $cookies");
                    $headers[] = 'Cookie: '.$cookies;
                }
                $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
                    CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
                    CURLOPT_TIMEOUT => $this->requestOptions['timeout']
                ));
                $httpRequest->set_original_url($orig);
                $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
                $pool->add($httpRequest);
                $dispatched[$orig] = $url;
            }
            // did we get anything into the pool?
            if (empty($dispatched)) continue;

            $this->debug('Sending request...');
            $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
            $this->debug('Received responses');
            foreach ($dispatched as $orig => $url) {
                // check content type
                if ($this->headerOnlyType((string)$this->requests[$orig]['headers'])) {
                    $this->requests[$orig]['body'] = '';
                    $_header_only_type = true;
                    $this->debug('Header only type returned');
                } else {
                    $_header_only_type = false;
                }
                // NOTE(review): assumes RollingCurl invokes the callback for every request; guarded in case it does not
                $status_code = isset($this->requests[$orig]['status_code']) ? $this->requests[$orig]['status_code'] : 0;
                $sentHead = isset($this->requests[$orig]['method']) && $this->requests[$orig]['method'] == 'HEAD';
                if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                    $redirectURL = $this->resolveRedirect($this->requests[$orig]['location'], $url);
                    if ($redirectURL) {
                        // store any cookies
                        $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                        if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                        $this->redirectQueue[$orig] = $redirectURL;
                    }
                } elseif (!$_header_only_type && $sentHead) {
                    // the response content-type did not match our 'header only' types,
                    // but we'd issued a HEAD request because we assumed it would. So
                    // let's queue a proper GET request for this item...
                    $this->queueWrongGuess($orig);
                } else {
                    $this->queueAjaxRedirect($orig);
                }
                unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
            }
        }
    }

    /** Sequential fetch using file_get_contents() (always GET; no HEAD probing). */
    protected function fetchSequentially(array $urls, $isRedirect) {
        $this->debug('Starting sequential fetch (file_get_contents)');
        $this->debug('Processing set of '.count($urls));
        foreach ($urls as $orig => $url) {
            if (!$isRedirect) $orig = $url;
            unset($this->redirectQueue[$orig]);
            $this->debug("...$url");
            if (!$isRedirect && isset($this->requests[$url])) {
                $this->debug("......in memory");
                continue;
            }
            $this->debug("Sending request for $url");
            $this->requests[$orig]['original_url'] = $orig;
            // a Location left over from the previous hop must not be mistaken for this response's
            unset($this->requests[$orig]['location']);
            $req_url = $this->prepareRequestUrl($url);
            $httpContext = $this->httpContext;
            $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
            // add referer for picky sites
            $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
            // send cookies, if we have any
            if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                $this->debug("......sending cookies: $cookies");
                $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
            }
            // $http_response_header is populated by PHP in this scope by the call below
            $html = @file_get_contents($req_url, false, stream_context_create($httpContext));
            if ($html === false) {
                $this->debug('Error retrieving URL');
                // TODO: handle error - failed to retrieve URL
                continue;
            }
            $this->debug('Received response');
            // get status code
            if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
                $this->debug('Error: no status code found');
                // TODO: handle error - no status code
                continue;
            }
            $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
            // check content type
            if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                $this->requests[$orig]['body'] = '';
            } else {
                $this->requests[$orig]['body'] = $html;
            }
            $this->requests[$orig]['effective_url'] = $req_url;
            $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
            unset($match);
            // handle redirect
            if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
                $this->requests[$orig]['location'] = trim($match[1]);
            }
            if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                $redirectURL = $this->resolveRedirect($this->requests[$orig]['location'], $url);
                if ($redirectURL) {
                    // store any cookies
                    $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                    if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                    $this->redirectQueue[$orig] = $redirectURL;
                }
            } else {
                $this->queueAjaxRedirect($orig);
            }
        }
    }

    /**
     * RollingCurl callback: split the raw response into headers and body and
     * record method, effective URL, status code and any Location header.
     */
    public function handleCurlResponse($response, $info, $request) {
        $orig = $request->url_original;
        $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
        $this->requests[$orig]['body'] = substr($response, $info['header_size']);
        $this->requests[$orig]['method'] = $request->method;
        $this->requests[$orig]['effective_url'] = $info['url'];
        $this->requests[$orig]['status_code'] = (int)$info['http_code'];
        if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
            $this->requests[$orig]['location'] = trim($match[1]);
        }
    }

    /**
     * Join headers into a newline-separated string.
     *
     * @param array $headers     List of raw header lines, or a name => value(s) map.
     * @param bool  $associative Whether $headers is a name => value(s) map.
     */
    protected function headersToString(array $headers, $associative=true) {
        if (!$associative) {
            return implode("\n", $headers);
        }
        $str = '';
        foreach ($headers as $key => $val) {
            foreach ((array)$val as $v) {
                $str .= "$key: $v\n";
            }
        }
        return rtrim($str);
    }

    /**
     * Return the stored response for $url, fetching it first if necessary.
     *
     * @param string $url
     * @param bool   $remove   Drop the response from memory after returning it.
     * @param bool   $gzdecode Decode the body when the response is gzip-encoded.
     * @return array|false Response array, or false when the request failed.
     */
    public function get($url, $remove=false, $gzdecode=true) {
        $url = "$url";
        if (isset($this->requests[$url]['body'])) {
            $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
        } else {
            $this->debug("Fetching URL ($url)");
            $this->fetchAll(array($url));
            if (!isset($this->requests[$url]['body'])) {
                $this->debug("Request failed");
                return false;
            }
        }
        $response = $this->requests[$url];
        if ($remove) unset($this->requests[$url]);
        // anchored, case-insensitive header match; a match at offset 0 counts too
        if ($gzdecode && preg_match('/^Content-Encoding:\s*gzip\b/mi', (string)$response['headers'])) {
            $html = gzdecode($response['body']);
            // keep the raw body only when decoding failed (an empty decoded body is valid)
            if ($html !== false && $html !== null) {
                $response['body'] = $html;
            }
        }
        return $response;
    }

    /** Whether a parallel transport (HttpRequestPool or curl_multi) is available. */
    public function parallelSupport() {
        return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
    }

    /** Whether the Content-Type header (full mime type, or just its type part) is listed in $headerOnlyTypes. */
    private function headerOnlyType($headers) {
        if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {
            // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
            $candidates = array(strtolower(trim($match[1])), strtolower(trim($match[2])));
            foreach ($candidates as $mime) {
                if (in_array($mime, $this->headerOnlyTypes)) return true;
            }
        }
        return false;
    }

    /** Whether the URL path ends in one of the $headerOnlyClues extensions. */
    private function possibleUnsupportedType($url) {
        $path = @parse_url($url, PHP_URL_PATH);
        if ($path && strpos($path, '.') !== false) {
            $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
            return in_array($ext, $this->headerOnlyClues);
        }
        return false;
    }
}
664 | \r | |
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
// Polyfill, only defined when PHP does not provide a native gzdecode().
if (!function_exists('gzdecode')) {
    /**
     * Decode a gzip (RFC 1952) member.
     *
     * @param string   $data      Complete gzip data (header, deflate body, 8-byte footer).
     * @param string   $filename  Receives the original file name stored in the header, if any.
     * @param string   $error     Receives a description for some of the failure cases.
     * @param int|null $maxlength Maximum decoded length passed to gzinflate(); null/0 for no limit.
     * @return string|false|null Decoded data; null when the data is not gzip, has reserved
     *                           flag bits set or has no body; false for other malformed
     *                           input or failed checks.
     */
    function gzdecode($data, &$filename='', &$error='', $maxlength=null)
    {
        $len = strlen($data);
        if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
            $error = "Not in GZIP format.";
            return null; // Not GZIP format (See RFC 1952)
        }
        $method = ord(substr($data,2,1)); // Compression method
        $flags = ord(substr($data,3,1)); // Flags
        // parentheses matter: != binds tighter than &
        if (($flags & 31) != $flags) {
            $error = "Reserved bits not allowed.";
            return null;
        }
        // bytes 4-7 (mtime), 8 (xfl) and 9 (os) complete the fixed 10-byte header; they are not needed here
        $headerlen = 10;
        if ($flags & 4) {
            // 2-byte length prefixed EXTRA data in header
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            // the length field starts right after the fixed header
            $extralen = unpack("v", substr($data, $headerlen, 2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false; // invalid
            }
            $headerlen += 2 + $extralen;
        }
        $filename = "";
        if ($flags & 8) {
            // C-style string
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $filenamelen = strpos(substr($data,$headerlen),chr(0));
            if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
                return false; // invalid
            }
            $filename = substr($data,$headerlen,$filenamelen);
            $headerlen += $filenamelen + 1;
        }
        if ($flags & 16) {
            // C-style string COMMENT data in header
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $commentlen = strpos(substr($data,$headerlen),chr(0));
            if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
                return false; // Invalid header format
            }
            $headerlen += $commentlen + 1;
        }
        if ($flags & 2) {
            // 2-bytes (lowest order) of CRC32 on header present
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
            $headercrc = unpack("v", substr($data,$headerlen,2));
            $headercrc = $headercrc[1];
            if ($headercrc != $calccrc) {
                $error = "Header checksum failed.";
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }
        // GZIP FOOTER
        $datacrc = unpack("V",substr($data,-8,4));
        $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
        $isize = unpack("V",substr($data,-4));
        $isize = $isize[1];
        // decompression:
        $bodylen = $len-$headerlen-8;
        if ($bodylen < 1) {
            // IMPLEMENTATION BUG!
            return null;
        }
        if ($method != 8) {
            // deflate (8) is currently the only supported compression method
            $error = "Unknown compression method.";
            return false;
        }
        $decoded = gzinflate(substr($data,$headerlen,$bodylen), (int)$maxlength);
        if ($decoded === false) {
            $error = "Decompression failed.";
            return false;
        }
        // Verify CRC32 and length against the footer
        $crcOK = sprintf("%u",crc32($decoded)) == $datacrc;
        $lenOK = $isize == strlen($decoded);
        if (!$lenOK || !$crcOK) {
            $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
            return false;
        }
        return $decoded;
    }
}
779 | ?> |