]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
[change] we now use Full-Text RSS 3.1, thank you so much @fivefilters
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / humble-http-agent / HumbleHttpAgent.php
CommitLineData
42c80841
NL
1<?php\r
2/**\r
3 * Humble HTTP Agent\r
4 * \r
5 * This class is designed to take advantage of parallel HTTP requests\r
6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions. \r
7 * For environments which do not have these options, it reverts to standard sequential \r
8 * requests (using file_get_contents())\r
9 * \r
10 * @version 1.1\r
11 * @date 2012-08-20\r
12 * @see http://php.net/HttpRequestPool\r
13 * @author Keyvan Minoukadeh\r
14 * @copyright 2011-2012 Keyvan Minoukadeh\r
15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r
16 */\r
17\r
18class HumbleHttpAgent\r
19{\r
20 const METHOD_REQUEST_POOL = 1;\r
21 const METHOD_CURL_MULTI = 2;\r
22 const METHOD_FILE_GET_CONTENTS = 4;\r
23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';\r
24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';\r
25 const UA_PHP = 'PHP/5.2';\r
26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';\r
27 \r
28 protected $requests = array();\r
29 protected $redirectQueue = array();\r
30 protected $requestOptions;\r
31 protected $maxParallelRequests = 5;\r
32 protected $cache = null; //TODO\r
33 protected $httpContext;\r
34 protected $minimiseMemoryUse = false; //TODO\r
35 protected $method;\r
36 protected $cookieJar;\r
37 public $debug = false;\r
38 public $debugVerbose = false;\r
39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html\r
40 public $maxRedirects = 5;\r
41 public $userAgentMap = array();\r
42 public $rewriteUrls = array();\r
43 public $userAgentDefault;\r
44 public $referer;\r
45 //public $userAgent = 'Mozilla/5.0';\r
46 \r
47 // Prevent certain file/mime types\r
48 // HTTP responses which match these content types will\r
49 // be returned without body.\r
50 public $headerOnlyTypes = array();\r
51 // URLs ending with one of these extensions will\r
52 // prompt Humble HTTP Agent to send a HEAD request first\r
53 // to see if returned content type matches $headerOnlyTypes.\r
54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');\r
55 // AJAX triggers to search for.\r
56 // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');\r
58 \r
59 //TODO: set max file size\r
60 //TODO: normalise headers\r
61 \r
/**
 * Set default user agent/referer, choose the HTTP transport and build
 * the request options and stream context used by the fetch methods.
 *
 * @param array|null $requestOptions options merged over the defaults ('timeout' => 15, 'redirect' => 0)
 * @param int|null   $method         one of the METHOD_* constants; any other value triggers auto-detection
 */
function __construct($requestOptions=null, $method=null) {
    $this->userAgentDefault = self::UA_BROWSER;
    $this->referer = self::REF_GOOGLE;

    // honour an explicitly requested transport, otherwise pick the best one available
    $knownMethods = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
    if (in_array($method, $knownMethods)) {
        $this->method = $method;
    } elseif (class_exists('HttpRequestPool')) {
        $this->method = self::METHOD_REQUEST_POOL;
    } elseif (function_exists('curl_multi_init')) {
        $this->method = self::METHOD_CURL_MULTI;
    } else {
        $this->method = self::METHOD_FILE_GET_CONTENTS;
    }

    // the curl_multi transport relies on the bundled RollingCurl classes
    if ($this->method == self::METHOD_CURL_MULTI) {
        require_once(dirname(__FILE__).'/RollingCurl.php');
    }

    // create cookie jar
    $this->cookieJar = new CookieJar();

    // 'redirect' must stay 0: redirects are followed manually so that
    // hashbang URLs can be rewritten on every hop
    $defaults = array(
        'timeout' => 15,
        'redirect' => 0
    );
    $this->requestOptions = is_array($requestOptions) ? array_merge($defaults, $requestOptions) : $defaults;

    // stream context template for the file_get_contents() transport
    $httpSettings = array(
        'ignore_errors' => true,
        'timeout' => $this->requestOptions['timeout'],
        'max_redirects' => $this->requestOptions['redirect'],
        'header' => "Accept: */*\r\n"
    );
    $this->httpContext = array('http' => $httpSettings);
}
100 \r
/**
 * Echo a debug line ("* message") when debugging is enabled.
 *
 * In verbose mode the current and peak memory use (in KB) are appended.
 * Output is flushed immediately so progress is visible while requests run.
 *
 * @param mixed $msg message to print (anything echo can render)
 * @return void
 */
protected function debug($msg) {
    if (!$this->debug) {
        return;
    }
    echo '* ',$msg;
    if ($this->debugVerbose) {
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo ' - mem used: ',$mem," (peak: $memPeak)";
    }
    echo "\n";
    // ob_flush() raises a notice when no output buffer is active
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}
112 \r
/**
 * Pick the User-Agent to send for a URL.
 *
 * $userAgentMap is consulted first with the host name (leading "www."
 * removed) and then with the parent domain prefixed by a dot
 * (e.g. ".example.org"). $userAgentDefault is used when neither matches.
 *
 * @param string $url     URL about to be requested
 * @param bool   $asArray true to get array('User-Agent' => ...), false for a header line
 * @return array|string
 */
protected function getUserAgent($url, $asArray=false) {
    $hostname = @parse_url($url, PHP_URL_HOST);
    if (strtolower(substr($hostname, 0, 4)) == 'www.') {
        $hostname = substr($hostname, 4);
    }

    $agent = $this->userAgentDefault;
    if ($hostname) {
        // candidates: exact host, then ".parent.domain"
        $candidates = array($hostname);
        $labels = explode('.', $hostname);
        if (count($labels) > 1) {
            array_shift($labels);
            $candidates[] = '.'.implode('.', $labels);
        }
        foreach ($candidates as $candidate) {
            if (isset($this->userAgentMap[$candidate])) {
                $agent = $this->userAgentMap[$candidate];
                break;
            }
        }
    }

    return $asArray ? array('User-Agent' => $agent) : 'User-Agent: '.$agent;
}
139 \r
/**
 * Rewrite a "#!" (hashbang) URL so the fragment travels in the
 * "_escaped_fragment_" query parameter instead.
 *
 * URLs without "#!" are returned untouched.
 *
 * @param string $url
 * @return string
 */
public function rewriteHashbangFragment($url) {
    if (strpos($url, '#!') === false) {
        return $url;
    }
    // TODO: is SimplePie_IRI included?
    $iri = new SimplePie_IRI($url);
    $escaped = substr($iri->fragment, 1); // drop the leading '!'
    $iri->fragment = null;

    $params = array();
    if (isset($iri->query)) {
        parse_str($iri->query, $params);
    }
    $params['_escaped_fragment_'] = (string)$escaped;

    // keep slashes readable - needed for some sites
    $iri->query = str_replace('%2F', '/', http_build_query($params));
    return $iri->get_iri();
}
157 \r
/**
 * Build the "_escaped_fragment_" variant of a URL when the page opts
 * in to AJAX crawling through a <meta name="fragment" content="!"> tag.
 *
 * @param string $url  URL the HTML was fetched from
 * @param string $html (start of the) response body to scan for $ajaxTriggers
 * @return string|false rewritten URL, or false when the HTML is empty or has no trigger
 */
public function getUglyURL($url, $html) {
    if ($html == '') return false;
    $found = false;
    foreach ($this->ajaxTriggers as $string) {
        // compare against false: a trigger at offset 0 is still a match
        if (stripos($html, $string) !== false) {
            $found = true;
            break;
        }
    }
    if (!$found) return false;
    $iri = new SimplePie_IRI($url);
    if (isset($iri->query)) {
        parse_str($iri->query, $query);
    } else {
        $query = array();
    }
    $query['_escaped_fragment_'] = '';
    $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
    return $iri->get_iri();
}
178 \r
/**
 * Strip the fragment (everything from the first '#') from a URL.
 *
 * @param string $url
 * @return string the URL up to, but not including, the first '#'
 */
public function removeFragment($url) {
    $hashAt = strpos($url, '#');
    return ($hashAt === false) ? $url : substr($url, 0, $hashAt);
}
187 \r
/**
 * Apply the first matching rule from $rewriteUrls to a URL.
 *
 * Each rule maps a substring to look for onto an array of strtr()
 * replacements. Rules whose value is not an array are skipped.
 *
 * @param string $url
 * @return string rewritten URL, or the input when no rule applies
 */
public function rewriteUrls($url) {
    foreach ($this->rewriteUrls as $needle => $replacements) {
        if (strpos($url, $needle) === false || !is_array($replacements)) {
            continue;
        }
        return strtr($url, $replacements);
    }
    return $url;
}
198 \r
/**
 * Switch debug output on or off.
 *
 * @param mixed $bool truthy to enable (default), falsy to disable
 * @return void
 */
public function enableDebug($bool=true) {
    $this->debug = $bool ? true : false;
}
202 \r
/**
 * Store the "minimise memory use" preference.
 *
 * The value is kept as given; the code that would act on it is still
 * commented out elsewhere in this class (marked TODO).
 *
 * @param mixed $bool
 * @return void
 */
public function minimiseMemoryUse($bool = true) {
    $this->minimiseMemoryUse = $bool;
}
206 \r
/**
 * Set how many requests may be sent in one parallel batch.
 *
 * The value is clamped to at least 1: fetchAllOnce() removes this many
 * URLs from its work list per iteration, so 0 or a negative number
 * would never drain the list.
 *
 * @param int $max
 * @return void
 */
public function setMaxParallelRequests($max) {
    $this->maxParallelRequests = max(1, (int)$max);
}
210 \r
/**
 * Sanitise a URL and accept it only if it is a valid http(s) URL.
 *
 * FILTER_VALIDATE_URL always requires a scheme, so no extra flag is
 * passed (FILTER_FLAG_SCHEME_REQUIRED no longer exists in PHP 8).
 *
 * @param string $url
 * @return string|false the sanitised URL, or false when it is not a valid http/https URL
 */
public function validateUrl($url) {
    $url = filter_var($url, FILTER_SANITIZE_URL);
    $test = filter_var($url, FILTER_VALIDATE_URL);
    // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2):
    // retry with dashes swapped for underscores
    if ($test === false) {
        $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
    }
    if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
        return $url;
    }
    return false;
}
224 \r
/**
 * Fetch all URLs, then keep re-fetching whatever was placed on the
 * redirect queue, for at most $maxRedirects extra rounds.
 *
 * @param array $urls URLs to fetch
 * @return void
 */
public function fetchAll(array $urls) {
    $this->fetchAllOnce($urls, false);
    for ($redirects = 1; !empty($this->redirectQueue) && $redirects <= $this->maxRedirects; $redirects++) {
        $this->debug("Following redirects #$redirects...");
        $this->fetchAllOnce($this->redirectQueue, true);
    }
}
233 \r
/**
 * Fetch all URLs once, without following redirects.
 *
 * Responses are stored in $this->requests keyed by the original URL.
 * Anything that needs another round trip (HTTP redirects, a HEAD request
 * that guessed the content type wrongly, AJAX "_escaped_fragment_" pages)
 * is placed on $this->redirectQueue for fetchAll() to pick up.
 *
 * @param array $urls       URLs to fetch; when $isRedirect is true the array
 *                          keys are the original URLs and the values the URLs to request
 * @param bool  $isRedirect true when called for queued redirects
 * @return false|null false when the HttpRequestPool transport throws an HttpException
 */
public function fetchAllOnce(array $urls, $isRedirect=false) {
    if (!$isRedirect) $urls = array_unique($urls);
    if (empty($urls)) return;
    // a batch size below 1 would make array_splice() remove nothing and loop forever
    $batchSize = max(1, (int)$this->maxParallelRequests);

    //////////////////////////////////////////////////////
    // parallel (HttpRequestPool)
    if ($this->method == self::METHOD_REQUEST_POOL) {
        $this->debug('Starting parallel fetch (HttpRequestPool)');
        try {
            while (count($urls) > 0) {
                $this->debug('Processing set of '.min($batchSize, count($urls)));
                $subset = array_splice($urls, 0, $batchSize);
                $pool = new HttpRequestPool();
                // original URL => URL, for the requests actually attached to the pool
                $pooled = array();
                foreach ($subset as $orig => $url) {
                    if (!$isRedirect) $orig = $url;
                    unset($this->redirectQueue[$orig]);
                    $this->debug("...$url");
                    if (!$isRedirect && isset($this->requests[$url])) {
                        $this->debug("......in memory");
                        continue;
                    }
                    $this->debug("......adding to pool");
                    $req_url = $this->prepareRequestUrl($url);
                    $_meth = $this->useHeadRequest($orig, $req_url) ? HttpRequest::METH_HEAD : HttpRequest::METH_GET;
                    $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
                    // send cookies, if we have any
                    if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                        $this->debug("......sending cookies: $cookies");
                        $httpRequest->addHeaders(array('Cookie' => $cookies));
                    }
                    $httpRequest->addHeaders($this->getUserAgent($req_url, true));
                    // add referer for picky sites
                    $httpRequest->addHeaders(array('Referer' => $this->referer));
                    $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                    $this->requests[$orig]['original_url'] = $orig;
                    $pool->attach($httpRequest);
                    $pooled[$orig] = $url;
                }
                // did we get anything into the pool?
                if (count($pooled) > 0) {
                    $this->debug('Sending request...');
                    try {
                        $pool->send();
                    } catch (HttpRequestPoolException $e) {
                        // do nothing
                    }
                    $this->debug('Received responses');
                    // only look at requests sent in this batch; "in memory"
                    // entries no longer carry an httpRequest object
                    foreach ($pooled as $orig => $url) {
                        $request = $this->requests[$orig]['httpRequest'];
                        // getResponseHeader() doesn't return status line, so, for consistency...
                        $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
                        // check content type
                        // TODO: use getResponseHeader('content-type') or getResponseInfo()
                        if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                            $this->requests[$orig]['body'] = '';
                            $_header_only_type = true;
                            $this->debug('Header only type returned');
                        } else {
                            $this->requests[$orig]['body'] = $request->getResponseBody();
                            $_header_only_type = false;
                        }
                        $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
                        $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
                        // is redirect?
                        if ($this->isRedirectStatus($status_code) && $request->getResponseHeader('location')) {
                            $redirectURL = $this->absolutizeRedirect($request->getResponseHeader('location'), $url);
                            if ($this->validateUrl($redirectURL)) {
                                $this->debug('Redirect detected. Valid URL: '.$redirectURL);
                                // store any cookies
                                $cookies = $request->getResponseHeader('set-cookie');
                                if ($cookies && !is_array($cookies)) $cookies = array($cookies);
                                if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
                                $this->redirectQueue[$orig] = $redirectURL;
                            } else {
                                $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
                            }
                        } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
                            // the response content-type did not match our 'header only' types,
                            // but we'd issued a HEAD request because we assumed it would. So
                            // let's queue a proper GET request for this item...
                            $this->debug('Wrong guess at content-type, queing GET request');
                            $this->requests[$orig]['wrongGuess'] = true;
                            $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                        } else {
                            $this->queueAjaxRedirect($orig);
                        }
                        $pool->detach($request);
                        unset($this->requests[$orig]['httpRequest'], $request);
                    }
                }
            }
        } catch (HttpException $e) {
            $this->debug($e);
            return false;
        }
    }

    //////////////////////////////////////////////////////////
    // parallel (curl_multi_*)
    elseif ($this->method == self::METHOD_CURL_MULTI) {
        $this->debug('Starting parallel fetch (curl_multi_*)');
        while (count($urls) > 0) {
            $this->debug('Processing set of '.min($batchSize, count($urls)));
            $subset = array_splice($urls, 0, $batchSize);
            $pool = new RollingCurl(array($this, 'handleCurlResponse'));
            $pool->window_size = count($subset);
            // original URL => URL, for the requests actually added to the pool
            $pooled = array();

            foreach ($subset as $orig => $url) {
                if (!$isRedirect) $orig = $url;
                unset($this->redirectQueue[$orig]);
                $this->debug("...$url");
                if (!$isRedirect && isset($this->requests[$url])) {
                    $this->debug("......in memory");
                    continue;
                }
                $this->debug("......adding to pool");
                $req_url = $this->prepareRequestUrl($url);
                $_meth = $this->useHeadRequest($orig, $req_url) ? 'HEAD' : 'GET';
                $headers = array();
                $headers[] = $this->getUserAgent($req_url);
                // add referer for picky sites
                $headers[] = 'Referer: '.$this->referer;
                // send cookies, if we have any
                if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                    $this->debug("......sending cookies: $cookies");
                    $headers[] = 'Cookie: '.$cookies;
                }
                $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
                    CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
                    CURLOPT_TIMEOUT => $this->requestOptions['timeout']
                ));
                $httpRequest->set_original_url($orig);
                $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
                $pool->add($httpRequest);
                $pooled[$orig] = $url;
            }
            // did we get anything into the pool?
            if (count($pooled) > 0) {
                $this->debug('Sending request...');
                $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
                $this->debug('Received responses');
                // only look at requests sent in this batch
                foreach ($pooled as $orig => $url) {
                    // check content type
                    if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                        $this->requests[$orig]['body'] = '';
                        $_header_only_type = true;
                        $this->debug('Header only type returned');
                    } else {
                        $_header_only_type = false;
                    }
                    $status_code = $this->requests[$orig]['status_code'];
                    if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                        $this->queueLocationRedirect($orig, $url);
                    } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
                        // the response content-type did not match our 'header only' types,
                        // but we'd issued a HEAD request because we assumed it would. So
                        // let's queue a proper GET request for this item...
                        $this->debug('Wrong guess at content-type, queing GET request');
                        $this->requests[$orig]['wrongGuess'] = true;
                        $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                    } else {
                        $this->queueAjaxRedirect($orig);
                    }
                    unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
                }
            }
        }
    }

    //////////////////////////////////////////////////////
    // sequential (file_get_contents)
    else {
        $this->debug('Starting sequential fetch (file_get_contents)');
        $this->debug('Processing set of '.count($urls));
        foreach ($urls as $orig => $url) {
            if (!$isRedirect) $orig = $url;
            unset($this->redirectQueue[$orig]);
            $this->debug("...$url");
            if (!$isRedirect && isset($this->requests[$url])) {
                $this->debug("......in memory");
                continue;
            }
            $this->debug("Sending request for $url");
            $this->requests[$orig]['original_url'] = $orig;
            $req_url = $this->prepareRequestUrl($url);
            // start from the shared context and append the per-request headers
            $httpContext = $this->httpContext;
            $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
            // add referer for picky sites
            $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
            // send cookies, if we have any
            if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                $this->debug("......sending cookies: $cookies");
                $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
            }
            // note: file_get_contents() fills $http_response_header in THIS scope,
            // so the call has to stay inside this method
            if (false === ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
                $this->debug('Error retrieving URL');
                // TODO: handle error - failed to retrieve URL
                continue;
            }
            $this->debug('Received response');
            // get status code
            if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
                $this->debug('Error: no status code found');
                // TODO: handle error - no status code
                continue;
            }
            $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
            // check content type
            if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                $this->requests[$orig]['body'] = '';
            } else {
                $this->requests[$orig]['body'] = $html;
            }
            $this->requests[$orig]['effective_url'] = $req_url;
            $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
            unset($match);
            // handle redirect
            if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
                $this->requests[$orig]['location'] = trim($match[1]);
            }
            if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                $this->queueLocationRedirect($orig, $url);
            } else {
                $this->queueAjaxRedirect($orig);
            }
        }
    }
}

/**
 * Turn a URL into the one actually requested: apply the rewrite rules,
 * rewrite "#!" fragments (when enabled) and drop any remaining fragment.
 */
private function prepareRequestUrl($url) {
    $req_url = $this->rewriteUrls($url);
    if ($this->rewriteHashbangFragment) {
        $req_url = $this->rewriteHashbangFragment($req_url);
    }
    return $this->removeFragment($req_url);
}

/**
 * Decide whether to probe with HEAD first; when answering "no" (GET),
 * clear any earlier wrong-guess marker for this entry.
 */
private function useHeadRequest($orig, $req_url) {
    if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
        return true;
    }
    unset($this->requests[$orig]['wrongGuess']);
    return false;
}

/** True for 300-303 and 307-399 status codes. */
private function isRedirectStatus($status_code) {
    return in_array($status_code, array(300, 301, 302, 303, 307)) || ($status_code > 307 && $status_code < 400);
}

/** Resolve a Location value against the requested URL unless it is already an absolute http(s) URL. */
private function absolutizeRedirect($redirectURL, $url) {
    if (!preg_match('!^https?://!i', $redirectURL)) {
        $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
    }
    return $redirectURL;
}

/**
 * Queue the redirect stored in $this->requests[$orig]['location'] (if it
 * validates), storing any cookies found in the response headers first.
 */
private function queueLocationRedirect($orig, $url) {
    $redirectURL = $this->absolutizeRedirect($this->requests[$orig]['location'], $url);
    if ($this->validateUrl($redirectURL)) {
        $this->debug('Redirect detected. Valid URL: '.$redirectURL);
        // store any cookies
        $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
        if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
        $this->redirectQueue[$orig] = $redirectURL;
    } else {
        $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
    }
}

/**
 * Check for <meta name='fragment' content='!'/> in the first 4000 bytes
 * of the body (AJAX sites, e.g. Blogger with its dynamic views templates)
 * and queue the "_escaped_fragment_" URL when found.
 * Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
 */
private function queueAjaxRedirect($orig) {
    if (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') !== false) {
        return; // already the escaped variant
    }
    if (!isset($this->requests[$orig]['body'])) {
        return;
    }
    $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
    if ($redirectURL) {
        $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
        $this->redirectQueue[$orig] = $redirectURL;
    }
}
574 \r
/**
 * RollingCurl callback: record one finished response in $this->requests.
 *
 * @param string $response raw response (headers followed by body)
 * @param array  $info     transfer info; 'header_size', 'url' and 'http_code' are read
 * @param object $request  request object; its url_original and method properties are read
 * @return void
 */
public function handleCurlResponse($response, $info, $request) {
    $orig = $request->url_original;
    $headerSize = $info['header_size'];
    $fields = array(
        'headers' => substr($response, 0, $headerSize),
        'body' => substr($response, $headerSize),
        'method' => $request->method,
        'effective_url' => $info['url'],
        'status_code' => (int)$info['http_code'],
    );
    foreach ($fields as $key => $value) {
        $this->requests[$orig][$key] = $value;
    }
    // remember the redirect target, if the response names one
    if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $found)) {
        $this->requests[$orig]['location'] = trim($found[1]);
    }
}
586 \r
/**
 * Flatten a header array into a newline-separated string.
 *
 * @param array $headers     list of header lines, or name => value(s) map
 * @param bool  $associative true when $headers maps names to a value or an array of values
 * @return string
 */
protected function headersToString(array $headers, $associative=true) {
    if (!$associative) {
        return implode("\n", $headers);
    }
    $lines = array();
    foreach ($headers as $name => $value) {
        if (is_array($value)) {
            // repeated header: one line per value
            foreach ($value as $single) {
                $lines[] = "$name: $single";
            }
        } else {
            $lines[] = "$name: $value";
        }
    }
    return rtrim(implode("\n", $lines));
}
602 \r
/**
 * Return the stored response for a URL, fetching it first if needed.
 *
 * @param string $url      URL to look up / fetch
 * @param bool   $remove   true to drop the response from memory once returned
 * @param bool   $gzdecode true to decompress the body when the response
 *                         declares "Content-Encoding: gzip"
 * @return array|false response array (keys such as 'headers', 'body',
 *                     'effective_url'), or false when the request failed
 */
public function get($url, $remove=false, $gzdecode=true) {
    $url = "$url";
    if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
        $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
        $response = $this->requests[$url];
    } else {
        $this->debug("Fetching URL ($url)");
        $this->fetchAll(array($url));
        if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
            $response = $this->requests[$url];
        } else {
            $this->debug("Request failed");
            // nothing to remove or decode - do not index into false below
            return false;
        }
    }
    if ($remove) unset($this->requests[$url]);
    // compare against false: the header may sit at offset 0 of the string
    if ($gzdecode && isset($response['headers']) && stripos($response['headers'], 'Content-Encoding: gzip') !== false) {
        // only decode bodies that still carry the gzip magic bytes; the
        // body may already have been decompressed before it reached us
        if (strncmp($response['body'], "\x1f\x8b", 2) === 0) {
            if ($html = gzdecode($response['body'])) {
                $response['body'] = $html;
            }
        }
    }
    return $response;
}
638 \r
/**
 * Report whether a parallel transport is available in this environment
 * (the HttpRequestPool class or the curl_multi_* functions).
 *
 * @return bool
 */
public function parallelSupport() {
    if (class_exists('HttpRequestPool')) {
        return true;
    }
    return function_exists('curl_multi_init');
}
642 \r
/**
 * Tell whether the Content-Type found in a header string is listed in
 * $headerOnlyTypes, either as a full mime type (e.g. image/jpeg) or as
 * just the main type (e.g. image).
 *
 * @param string $headers raw response headers
 * @return bool
 */
private function headerOnlyType($headers) {
    if (!preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {
        return false;
    }
    $fullType = strtolower(trim($match[1]));
    $mainType = strtolower(trim($match[2]));
    return in_array($fullType, $this->headerOnlyTypes) || in_array($mainType, $this->headerOnlyTypes);
}
654 \r
/**
 * Guess from the file extension in the URL path whether the resource
 * might be one of the types in $headerOnlyClues.
 *
 * @param string $url
 * @return bool true when the path has an extension listed in $headerOnlyClues
 */
private function possibleUnsupportedType($url) {
    $path = @parse_url($url, PHP_URL_PATH);
    if (!$path || strpos($path, '.') === false) {
        return false;
    }
    $extension = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
    return in_array($extension, $this->headerOnlyClues);
}
663}\r
664\r
665// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930\r
if (!function_exists('gzdecode')) {
    /**
     * Fallback gzdecode() for PHP builds that lack the native function.
     * Adapted from http://www.php.net/manual/en/function.gzdecode.php#82930
     *
     * Parses the gzip member header (RFC 1952), inflates the deflate
     * stream and verifies the trailing CRC32 and length.
     *
     * @param string   $data      gzip-encoded data
     * @param string   $filename  receives the original file name stored in the header, if any
     * @param string   $error     receives an error description for some failures
     * @param int|null $maxlength maximum length of decoded data (null/0 = no limit)
     * @return string|false|null decoded data; null when the input is not gzip or
     *                           has no body, false when the data is invalid
     */
    function gzdecode($data,&$filename='',&$error='',$maxlength=null)
    {
        $len = strlen($data);
        if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
            $error = "Not in GZIP format.";
            return null; // Not GZIP format (See RFC 1952)
        }
        $method = ord(substr($data,2,1)); // Compression method
        $flags = ord(substr($data,3,1)); // Flags
        // parenthesised on purpose: '!=' binds tighter than '&'
        if (($flags & 31) != $flags) {
            $error = "Reserved bits not allowed.";
            return null;
        }
        // fixed part of the header: ID1 ID2 CM FLG MTIME(4) XFL OS
        $headerlen = 10;
        if ($flags & 4) {
            // FEXTRA: 2-byte length prefixed EXTRA data, right after the fixed header
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $extralen = unpack("v",substr($data,$headerlen,2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false; // invalid
            }
            $headerlen += 2 + $extralen;
        }
        $filename = "";
        if ($flags & 8) {
            // FNAME: C-style string
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $filenamelen = strpos(substr($data,$headerlen),chr(0));
            if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
                return false; // invalid
            }
            $filename = substr($data,$headerlen,$filenamelen);
            $headerlen += $filenamelen + 1;
        }
        if ($flags & 16) {
            // FCOMMENT: C-style string COMMENT data in header
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $commentlen = strpos(substr($data,$headerlen),chr(0));
            if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
                return false; // Invalid header format
            }
            $headerlen += $commentlen + 1;
        }
        if ($flags & 2) {
            // FHCRC: 2-bytes (lowest order) of CRC32 on header present
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
            $headercrc = unpack("v", substr($data,$headerlen,2));
            $headercrc = $headercrc[1];
            if ($headercrc != $calccrc) {
                $error = "Header checksum failed.";
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }
        // GZIP FOOTER: CRC32 and length of the uncompressed data
        $datacrc = unpack("V",substr($data,-8,4));
        $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
        $isize = unpack("V",substr($data,-4));
        $isize = $isize[1];
        // decompression:
        $bodylen = $len-$headerlen-8;
        if ($bodylen < 1) {
            return null;
        }
        $body = substr($data,$headerlen,$bodylen);
        if ($method != 8) {
            // deflate (8) is the only supported compression method
            $error = "Unknown compression method.";
            return false;
        }
        $data = gzinflate($body,(int)$maxlength);
        if ($data === false) {
            $error = "Inflate failed.";
            return false;
        }
        // Verify CRC32 and length
        $crc = sprintf("%u",crc32($data));
        $crcOK = $crc == $datacrc;
        $lenOK = $isize == strlen($data);
        if (!$lenOK || !$crcOK) {
            $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
            return false;
        }
        return $data;
    }
}
ec397236 779?>