aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/humble-http-agent/HumbleHttpAgent.php
diff options
context:
space:
mode:
authorNicolas Lœuillet <nicolas.loeuillet@gmail.com>2013-08-25 12:12:53 -0700
committerNicolas Lœuillet <nicolas.loeuillet@gmail.com>2013-08-25 12:12:53 -0700
commitc51be6b697da573cdcf0788eb8617130ce5517a4 (patch)
tree642eaf70afb134dee5f274c84bf15b8aab00c117 /inc/3rdparty/humble-http-agent/HumbleHttpAgent.php
parent7ba37bd91a43321196e6d867caf9e298e82c6d6c (diff)
parent063fc1a7baaf6f7e1fb08eced058962a6140a471 (diff)
downloadwallabag-c51be6b697da573cdcf0788eb8617130ce5517a4.tar.gz
wallabag-c51be6b697da573cdcf0788eb8617130ce5517a4.tar.zst
wallabag-c51be6b697da573cdcf0788eb8617130ce5517a4.zip
Merge pull request #181 from inthepoche/dev
beta4
Diffstat (limited to 'inc/3rdparty/humble-http-agent/HumbleHttpAgent.php')
-rw-r--r--inc/3rdparty/humble-http-agent/HumbleHttpAgent.php720
1 files changed, 720 insertions, 0 deletions
diff --git a/inc/3rdparty/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/humble-http-agent/HumbleHttpAgent.php
new file mode 100644
index 00000000..7e5834ab
--- /dev/null
+++ b/inc/3rdparty/humble-http-agent/HumbleHttpAgent.php
@@ -0,0 +1,720 @@
1<?php
2/**
3 * Humble HTTP Agent
4 *
5 * This class is designed to take advantage of parallel HTTP requests
6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
7 * For environments which do not have these options, it reverts to standard sequential
8 * requests (using file_get_contents())
9 *
10 * @version 1.0
11 * @date 2012-02-09
12 * @see http://php.net/HttpRequestPool
13 * @author Keyvan Minoukadeh
14 * @copyright 2011-2012 Keyvan Minoukadeh
15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
16 */
17
/**
 * HTTP client that fetches one or many URLs and keeps the responses in memory.
 *
 * Three transports are supported, selected in the constructor:
 *  - METHOD_REQUEST_POOL: PECL HTTP's HttpRequestPool (parallel)
 *  - METHOD_CURL_MULTI: RollingCurl on top of curl_multi_* (parallel)
 *  - METHOD_FILE_GET_CONTENTS: file_get_contents() (sequential)
 *
 * Redirects are never followed by the transport itself; they are queued and
 * re-fetched by fetchAll() so that URL rewriting (hashbang, custom rewrites)
 * can be applied to every hop.
 *
 * Each entry of $requests is keyed by the originally requested URL and may
 * contain: original_url, headers, body, effective_url, status_code, location.
 */
class HumbleHttpAgent
{
    const METHOD_REQUEST_POOL = 1;
    const METHOD_CURL_MULTI = 2;
    const METHOD_FILE_GET_CONTENTS = 4;
    const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
    const UA_PHP = 'PHP/5.2';
    const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';

    // responses, keyed by the originally requested URL
    protected $requests = array();
    // original URL => next URL to fetch (redirect target, or a GET retry after a HEAD)
    protected $redirectQueue = array();
    protected $requestOptions;
    protected $maxParallelRequests = 5;
    protected $cache = null; //TODO
    protected $httpContext;
    protected $minimiseMemoryUse = false; //TODO
    protected $debug = false;
    protected $method;
    protected $cookieJar;
    public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
    public $maxRedirects = 5;
    // host (or '.parent.domain') => User-Agent string to use for that host
    public $userAgentMap = array();
    // substring to look for in the URL => array of strtr() replacements
    public $rewriteUrls = array();
    public $userAgentDefault;
    public $referer;

    // Prevent certain file/mime types
    // HTTP responses which match these content types will
    // be returned without body.
    public $headerOnlyTypes = array();
    // URLs ending with one of these extensions will
    // prompt Humble HTTP Agent to send a HEAD request first
    // to see if returned content type matches $headerOnlyTypes.
    public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');

    //TODO: set max file size
    //TODO: normalise headers

    /**
     * @param array|null $requestOptions options merged over the defaults ('timeout' => 15, 'redirect' => 0)
     * @param int|null   $method         one of the METHOD_* constants; anything else triggers auto-detection
     */
    public function __construct($requestOptions=null, $method=null) {
        $this->userAgentDefault = self::UA_BROWSER;
        $this->referer = self::REF_GOOGLE;
        // set the request method
        $supported = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
        // only ints / numeric strings are considered, so that e.g. boolean true
        // is not loosely matched against the constants
        if ((is_int($method) || is_string($method)) && in_array($method, $supported)) {
            $this->method = (int)$method;
        } elseif (class_exists('HttpRequestPool')) {
            $this->method = self::METHOD_REQUEST_POOL;
        } elseif (function_exists('curl_multi_init')) {
            $this->method = self::METHOD_CURL_MULTI;
        } else {
            $this->method = self::METHOD_FILE_GET_CONTENTS;
        }
        if ($this->method == self::METHOD_CURL_MULTI) {
            require_once(dirname(__FILE__).'/RollingCurl.php');
        }
        // create cookie jar
        $this->cookieJar = new CookieJar();
        // set request options (redirect must be 0)
        $this->requestOptions = array(
            'timeout' => 15,
            'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
            // TODO: test onprogress?
        );
        if (is_array($requestOptions)) {
            $this->requestOptions = array_merge($this->requestOptions, $requestOptions);
        }
        // stream context used by the sequential (file_get_contents) transport
        $this->httpContext = array(
            'http' => array(
                'ignore_errors' => true,
                'timeout' => $this->requestOptions['timeout'],
                'max_redirects' => $this->requestOptions['redirect'],
                'header' => "Accept: */*\r\n"
            )
        );
    }

    /**
     * Echo a debug line with current/peak memory usage when debugging is enabled.
     *
     * @param mixed $msg message (anything echo can print)
     */
    protected function debug($msg) {
        if (!$this->debug) return;
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo '* ',$msg;
        echo ' - mem used: ',$mem," (peak: $memPeak)\n";
        // ob_flush() raises a notice when no output buffer is active
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }

    /**
     * Pick the User-Agent for a URL: an exact host match in $userAgentMap
     * (leading 'www.' ignored), then '.parent.domain', then $userAgentDefault.
     *
     * @param string $url
     * @param bool   $asArray true: array('User-Agent' => ...); false: 'User-Agent: ...' header line
     * @return array|string
     */
    protected function getUserAgent($url, $asArray=false) {
        $host = @parse_url($url, PHP_URL_HOST);
        if ($host) {
            if (strtolower(substr($host, 0, 4)) == 'www.') {
                $host = substr($host, 4);
            }
        }
        if ($host) {
            $try = array($host);
            $split = explode('.', $host);
            if (count($split) > 1) {
                // also try the parent domain, written with a leading dot
                array_shift($split);
                $try[] = '.'.implode('.', $split);
            }
            foreach ($try as $h) {
                if (isset($this->userAgentMap[$h])) {
                    $ua = $this->userAgentMap[$h];
                    break;
                }
            }
        }
        if (!isset($ua)) $ua = $this->userAgentDefault;
        if ($asArray) {
            return array('User-Agent' => $ua);
        }
        return 'User-Agent: '.$ua;
    }

    /**
     * Rewrite a '#!' URL into its '_escaped_fragment_' query-string form.
     * URLs without '#!' are returned untouched. Requires SimplePie_IRI.
     *
     * @param string $url
     * @return string
     */
    public function rewriteHashbangFragment($url) {
        // return $url if there's no '#!'
        if (strpos($url, '#!') === false) return $url;
        // split $url and rewrite
        // TODO: is SimplePie_IRI included?
        $iri = new SimplePie_IRI($url);
        $fragment = substr($iri->fragment, 1); // strip '!'
        $iri->fragment = null;
        if (isset($iri->query)) {
            parse_str($iri->query, $query);
        } else {
            $query = array();
        }
        $query['_escaped_fragment_'] = (string)$fragment;
        $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
        return $iri->get_iri();
    }

    /**
     * Strip everything from the first '#' onwards.
     *
     * @param string $url
     * @return string
     */
    public function removeFragment($url) {
        $pos = strpos($url, '#');
        if ($pos === false) {
            return $url;
        }
        return substr($url, 0, $pos);
    }

    /**
     * Apply the first matching entry of $rewriteUrls to the URL.
     * An entry matches when its key occurs in the URL and its value is an
     * array, which is then used as the strtr() replacement map.
     *
     * @param string $url
     * @return string
     */
    public function rewriteUrls($url) {
        foreach ($this->rewriteUrls as $find => $action) {
            if (strpos($url, $find) !== false && is_array($action)) {
                return strtr($url, $action);
            }
        }
        return $url;
    }

    public function enableDebug($bool=true) {
        $this->debug = (bool)$bool;
    }

    public function minimiseMemoryUse($bool = true) {
        $this->minimiseMemoryUse = (bool)$bool;
    }

    /**
     * @param int $max number of requests sent per batch; values below 1 are raised to 1,
     *                 because a batch size of 0 would never consume the URL list
     */
    public function setMaxParallelRequests($max) {
        $this->maxParallelRequests = max(1, (int)$max);
    }

    /**
     * Sanitise and validate an http(s) URL.
     *
     * @param string $url
     * @return string|false the sanitised URL, or false if it is not a valid http/https URL
     */
    public function validateUrl($url) {
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // FILTER_VALIDATE_URL always requires a scheme; the explicit
        // FILTER_FLAG_SCHEME_REQUIRED flag no longer exists in PHP 8
        $test = filter_var($url, FILTER_VALIDATE_URL);
        // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
        if ($test === false) {
            $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
        }
        if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
            return $url;
        }
        return false;
    }

    /**
     * Fetch all URLs, then keep fetching whatever lands in the redirect queue
     * until it is empty or $maxRedirects rounds have been made.
     *
     * @param array $urls
     */
    public function fetchAll(array $urls) {
        $this->fetchAllOnce($urls, false);
        $redirects = 0;
        while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
            $this->debug("Following redirects #$redirects...");
            $this->fetchAllOnce($this->redirectQueue, true);
        }
    }

    /**
     * Fetch all URLs without following redirects.
     *
     * @param array $urls       list of URLs, or (when $isRedirect) a map of original URL => URL to fetch
     * @param bool  $isRedirect whether $urls comes from the redirect queue
     * @return null|false false if the HttpRequestPool transport threw an HttpException
     */
    public function fetchAllOnce(array $urls, $isRedirect=false) {
        if (!$isRedirect) $urls = array_unique($urls);
        if (empty($urls)) return;

        if ($this->method == self::METHOD_REQUEST_POOL) {
            return $this->fetchWithRequestPool($urls, $isRedirect);
        } elseif ($this->method == self::METHOD_CURL_MULTI) {
            $this->fetchWithCurlMulti($urls, $isRedirect);
        } else {
            $this->fetchSequentially($urls, $isRedirect);
        }
    }

    // Apply custom rewrites, the hashbang rewrite (if enabled) and fragment removal to get the URL actually requested.
    private function prepareRequestUrl($url) {
        $req_url = $this->rewriteUrls($url);
        if ($this->rewriteHashbangFragment) {
            $req_url = $this->rewriteHashbangFragment($req_url);
        }
        return $this->removeFragment($req_url);
    }

    // Decide between HEAD (true) and GET (false); choosing GET clears a previous 'wrongGuess' marker.
    private function shouldSendHead($orig, $req_url) {
        if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
            return true;
        }
        unset($this->requests[$orig]['wrongGuess']);
        return false;
    }

    // Status codes treated as redirects: 300-303 and 307-399 (304-306 are excluded).
    private function isRedirectStatus($status_code) {
        $status_code = (int)$status_code;
        return ($status_code >= 300 && $status_code <= 303) || ($status_code >= 307 && $status_code < 400);
    }

    // Turn a Location value into an absolute URL; returns it if valid, false otherwise.
    private function resolveRedirectUrl($location, $baseUrl) {
        $redirectURL = $location;
        if (!preg_match('!^https?://!i', $redirectURL)) {
            $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $baseUrl);
        }
        if ($this->validateUrl($redirectURL)) {
            $this->debug('Redirect detected. Valid URL: '.$redirectURL);
            return $redirectURL;
        }
        $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
        return false;
    }

    // Parallel fetch using PECL HTTP's HttpRequestPool. Returns false on HttpException, null otherwise.
    private function fetchWithRequestPool(array $urls, $isRedirect) {
        $this->debug('Starting parallel fetch (HttpRequestPool)');
        try {
            while (count($urls) > 0) {
                $batchSize = max(1, (int)$this->maxParallelRequests);
                $this->debug('Processing set of '.min($batchSize, count($urls)));
                $subset = array_splice($urls, 0, $batchSize);
                $pool = new HttpRequestPool();
                // only requests actually attached to the pool are processed
                // afterwards; entries already in memory have no httpRequest
                $queued = array();
                foreach ($subset as $orig => $url) {
                    if (!$isRedirect) $orig = $url;
                    unset($this->redirectQueue[$orig]);
                    $this->debug("...$url");
                    if (!$isRedirect && isset($this->requests[$url])) {
                        $this->debug("......in memory");
                        continue;
                    }
                    $this->debug("......adding to pool");
                    $req_url = $this->prepareRequestUrl($url);
                    $_meth = $this->shouldSendHead($orig, $req_url) ? HttpRequest::METH_HEAD : HttpRequest::METH_GET;
                    $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
                    // send cookies, if we have any
                    if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                        $this->debug("......sending cookies: $cookies");
                        $httpRequest->addHeaders(array('Cookie' => $cookies));
                    }
                    $httpRequest->addHeaders($this->getUserAgent($req_url, true));
                    // add referer for picky sites
                    $httpRequest->addHeaders(array('Referer' => $this->referer));
                    $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                    $this->requests[$orig]['original_url'] = $orig;
                    $pool->attach($httpRequest);
                    $queued[$orig] = $url;
                }
                // did we get anything into the pool?
                if (count($queued) === 0) continue;

                $this->debug('Sending request...');
                try {
                    $pool->send();
                } catch (HttpRequestPoolException $e) {
                    // deliberately ignored: each request is inspected individually below
                }
                $this->debug('Received responses');
                foreach ($queued as $orig => $url) {
                    $request = $this->requests[$orig]['httpRequest'];
                    // getResponseHeader() doesn't return status line, so, for consistency...
                    $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
                    // check content type
                    // TODO: use getResponseHeader('content-type') or getResponseInfo()
                    if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                        $this->requests[$orig]['body'] = '';
                        $_header_only_type = true;
                        $this->debug('Header only type returned');
                    } else {
                        $this->requests[$orig]['body'] = $request->getResponseBody();
                        $_header_only_type = false;
                    }
                    $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
                    $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
                    // is redirect?
                    if ($this->isRedirectStatus($status_code) && $request->getResponseHeader('location')) {
                        $redirectURL = $this->resolveRedirectUrl($request->getResponseHeader('location'), $url);
                        if ($redirectURL !== false) {
                            // store any cookies
                            $cookies = $request->getResponseHeader('set-cookie');
                            if ($cookies && !is_array($cookies)) $cookies = array($cookies);
                            if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
                            $this->redirectQueue[$orig] = $redirectURL;
                        }
                    } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
                        // the response content-type did not match our 'header only' types,
                        // but we issued a HEAD request because we assumed it would. So
                        // let's queue a proper GET request for this item...
                        $this->debug('Wrong guess at content-type, queing GET request');
                        $this->requests[$orig]['wrongGuess'] = true;
                        $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                    }
                    $pool->detach($request);
                    unset($this->requests[$orig]['httpRequest'], $request);
                }
            }
        } catch (HttpException $e) {
            $this->debug($e);
            return false;
        }
        return null;
    }

    // Parallel fetch using RollingCurl (curl_multi_*); responses arrive through handleCurlResponse().
    private function fetchWithCurlMulti(array $urls, $isRedirect) {
        $this->debug('Starting parallel fetch (curl_multi_*)');
        while (count($urls) > 0) {
            $batchSize = max(1, (int)$this->maxParallelRequests);
            $this->debug('Processing set of '.min($batchSize, count($urls)));
            $subset = array_splice($urls, 0, $batchSize);
            $pool = new RollingCurl(array($this, 'handleCurlResponse'));
            $pool->window_size = count($subset);
            // track what was queued ourselves instead of calling count() on the
            // RollingCurl object, and so that in-memory entries are not re-processed
            $queued = array();

            foreach ($subset as $orig => $url) {
                if (!$isRedirect) $orig = $url;
                unset($this->redirectQueue[$orig]);
                $this->debug("...$url");
                if (!$isRedirect && isset($this->requests[$url])) {
                    $this->debug("......in memory");
                    continue;
                }
                $this->debug("......adding to pool");
                $req_url = $this->prepareRequestUrl($url);
                $_meth = $this->shouldSendHead($orig, $req_url) ? 'HEAD' : 'GET';
                $headers = array();
                $headers[] = $this->getUserAgent($req_url);
                // add referer for picky sites
                $headers[] = 'Referer: '.$this->referer;
                // send cookies, if we have any
                if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                    $this->debug("......sending cookies: $cookies");
                    $headers[] = 'Cookie: '.$cookies;
                }
                $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
                    CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
                    CURLOPT_TIMEOUT => $this->requestOptions['timeout']
                ));
                $httpRequest->set_original_url($orig);
                $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
                $pool->add($httpRequest);
                $queued[$orig] = $url;
            }
            // did we get anything into the pool?
            if (count($queued) === 0) continue;

            $this->debug('Sending request...');
            $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
            $this->debug('Received responses');
            foreach ($queued as $orig => $url) {
                // check content type
                if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                    $this->requests[$orig]['body'] = '';
                    $_header_only_type = true;
                    $this->debug('Header only type returned');
                } else {
                    $_header_only_type = false;
                }
                // status_code/method are only present once handleCurlResponse() ran for this request
                $status_code = isset($this->requests[$orig]['status_code']) ? $this->requests[$orig]['status_code'] : 0;
                $sentMethod = isset($this->requests[$orig]['method']) ? $this->requests[$orig]['method'] : null;
                if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                    $redirectURL = $this->resolveRedirectUrl($this->requests[$orig]['location'], $url);
                    if ($redirectURL !== false) {
                        // store any cookies
                        $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                        if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                        $this->redirectQueue[$orig] = $redirectURL;
                    }
                } elseif (!$_header_only_type && $sentMethod == 'HEAD') {
                    // the response content-type did not match our 'header only' types,
                    // but we issued a HEAD request because we assumed it would. So
                    // let's queue a proper GET request for this item...
                    $this->debug('Wrong guess at content-type, queing GET request');
                    $this->requests[$orig]['wrongGuess'] = true;
                    $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                }
                unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
            }
        }
    }

    // Sequential fetch using file_get_contents() with a per-request stream context.
    private function fetchSequentially(array $urls, $isRedirect) {
        $this->debug('Starting sequential fetch (file_get_contents)');
        $this->debug('Processing set of '.count($urls));
        foreach ($urls as $orig => $url) {
            if (!$isRedirect) $orig = $url;
            unset($this->redirectQueue[$orig]);
            $this->debug("...$url");
            if (!$isRedirect && isset($this->requests[$url])) {
                $this->debug("......in memory");
                continue;
            }
            $this->debug("Sending request for $url");
            $this->requests[$orig]['original_url'] = $orig;
            $req_url = $this->prepareRequestUrl($url);
            $httpContext = $this->httpContext;
            $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
            // add referer for picky sites
            $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
            // send cookies, if we have any
            if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                $this->debug("......sending cookies: $cookies");
                $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
            }
            $html = @file_get_contents($req_url, false, stream_context_create($httpContext));
            if ($html === false) {
                $this->debug('Error retrieving URL');
                // TODO: handle error - failed to retrieve URL
                continue;
            }
            $this->debug('Received response');
            // get status code ($http_response_header is populated by file_get_contents())
            if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
                $this->debug('Error: no status code found');
                // TODO: handle error - no status code
                continue;
            }
            $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
            // check content type
            if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                $this->requests[$orig]['body'] = '';
            } else {
                $this->requests[$orig]['body'] = $html;
            }
            $this->requests[$orig]['effective_url'] = $req_url;
            $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
            unset($match);
            // handle redirect; the entry is reused across hops, so drop the
            // Location remembered from the previous response first
            unset($this->requests[$orig]['location']);
            // header names are case-insensitive
            if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
                $this->requests[$orig]['location'] = trim($match[1]);
            }
            if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                $redirectURL = $this->resolveRedirectUrl($this->requests[$orig]['location'], $url);
                if ($redirectURL !== false) {
                    // store any cookies
                    $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                    if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                    $this->redirectQueue[$orig] = $redirectURL;
                }
            }
        }
    }

    /**
     * RollingCurl callback: store one curl response under its original URL.
     *
     * @param string $response raw response (headers followed by body)
     * @param array  $info     curl info array; header_size, url and http_code are read
     * @param object $request  the request object; url_original and method are read
     */
    public function handleCurlResponse($response, $info, $request) {
        $orig = $request->url_original;
        $this->requests[$orig]['headers'] = (string)substr($response, 0, $info['header_size']);
        // cast: older PHP versions return false from substr() when there is no body
        $this->requests[$orig]['body'] = (string)substr($response, $info['header_size']);
        $this->requests[$orig]['method'] = $request->method;
        $this->requests[$orig]['effective_url'] = $info['url'];
        $this->requests[$orig]['status_code'] = (int)$info['http_code'];
        // header names are case-insensitive
        if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
            $this->requests[$orig]['location'] = trim($match[1]);
        }
    }

    /**
     * Join headers into a newline-separated string.
     *
     * @param array $headers     list of header lines, or name => value(s) map
     * @param bool  $associative whether $headers is a name => value(s) map
     * @return string
     */
    protected function headersToString(array $headers, $associative=true) {
        if (!$associative) {
            return implode("\n", $headers);
        }
        $str = '';
        foreach ($headers as $key => $val) {
            if (is_array($val)) {
                foreach ($val as $v) $str .= "$key: $v\n";
            } else {
                $str .= "$key: $val\n";
            }
        }
        return rtrim($str);
    }

    /**
     * Return the stored response for a URL, fetching it first if necessary.
     *
     * @param string $url
     * @param bool   $remove   drop the response from memory after returning it
     * @param bool   $gzdecode decode the body when the response headers say it is gzip-encoded
     * @return array|false response array (see class description), or false if the request failed
     */
    public function get($url, $remove=false, $gzdecode=true) {
        $url = "$url";
        if (isset($this->requests[$url], $this->requests[$url]['body'])) {
            $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
            $response = $this->requests[$url];
        } else {
            $this->debug("Fetching URL ($url)");
            $this->fetchAll(array($url));
            if (isset($this->requests[$url], $this->requests[$url]['body'])) {
                $response = $this->requests[$url];
            } else {
                $this->debug("Request failed");
                $response = false;
            }
        }
        if ($remove && $response) unset($this->requests[$url]);
        // only inspect headers when there actually is a response
        if ($gzdecode && is_array($response) && isset($response['headers'])
            && stripos($response['headers'], 'Content-Encoding: gzip') !== false) {
            $body = (string)$response['body'];
            // the transport may already have decoded the body; only decode
            // data that really starts with the gzip magic bytes
            if (strncmp($body, "\x1f\x8b", 2) === 0) {
                if ($html = gzdecode($body)) {
                    $response['body'] = $html;
                }
            }
        }
        return $response;
    }

    /**
     * @return bool whether HttpRequestPool or curl_multi_* is available
     */
    public function parallelSupport() {
        return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
    }

    // True if the Content-Type in $headers (full mime type, or just its first part) is listed in $headerOnlyTypes.
    private function headerOnlyType($headers) {
        if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', (string)$headers, $match)) {
            // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
            $full = strtolower(trim($match[1]));
            $type = strtolower(trim($match[2]));
            foreach (array($full, $type) as $mime) {
                if (in_array($mime, $this->headerOnlyTypes)) return true;
            }
        }
        return false;
    }

    // True if the URL path ends with one of the extensions in $headerOnlyClues.
    private function possibleUnsupportedType($url) {
        $path = @parse_url($url, PHP_URL_PATH);
        if ($path && strpos($path, '.') !== false) {
            $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
            return in_array($ext, $this->headerOnlyClues);
        }
        return false;
    }
}
605
606// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
if (!function_exists('gzdecode')) {
    /**
     * Fallback for PHP builds without a native gzdecode(): decode a GZIP
     * (RFC 1952) member using gzinflate().
     * Based on http://www.php.net/manual/en/function.gzdecode.php#82930
     *
     * @param string   $data      raw GZIP data
     * @param string   $filename  receives the original file name stored in the header, if any
     * @param string   $error     receives an error message for some failures
     * @param int|null $maxlength maximum decoded length passed to gzinflate(); null for no limit
     * @return string|false|null decoded data; null if $data is not GZIP, uses reserved flag
     *                           bits or has no body; false for any other failure
     */
    function gzdecode($data, &$filename='', &$error='', $maxlength=null)
    {
        $len = strlen($data);
        if ($len < 18 || strcmp(substr($data, 0, 2), "\x1f\x8b")) {
            $error = "Not in GZIP format.";
            return null; // Not GZIP format (See RFC 1952)
        }
        $method = ord(substr($data, 2, 1)); // Compression method
        $flags = ord(substr($data, 3, 1)); // Flags
        // parentheses matter: '!=' binds tighter than '&'
        if (($flags & 31) != $flags) {
            $error = "Reserved bits not allowed.";
            return null;
        }
        // bytes 4-9 (MTIME, XFL, OS) are not needed for decoding;
        // optional fields start after the 10-byte fixed header
        $headerlen = 10;
        if ($flags & 4) {
            // 2-byte length prefixed EXTRA data in header
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $extralen = unpack("v", substr($data, $headerlen, 2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false; // invalid
            }
            $headerlen += 2 + $extralen;
        }
        $filename = "";
        if ($flags & 8) {
            // C-style string: original file name
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $filenamelen = strpos(substr($data, $headerlen), chr(0));
            if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
                return false; // invalid
            }
            $filename = substr($data, $headerlen, $filenamelen);
            $headerlen += $filenamelen + 1;
        }
        if ($flags & 16) {
            // C-style string COMMENT data in header (skipped)
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $commentlen = strpos(substr($data, $headerlen), chr(0));
            if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
                return false; // Invalid header format
            }
            $headerlen += $commentlen + 1;
        }
        if ($flags & 2) {
            // 2-bytes (lowest order) of CRC32 on header present
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
            $headercrc = unpack("v", substr($data, $headerlen, 2));
            $headercrc = $headercrc[1];
            if ($headercrc != $calccrc) {
                $error = "Header checksum failed.";
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }
        // GZIP FOOTER: CRC32 and length of the uncompressed data
        $datacrc = unpack("V", substr($data, -8, 4));
        $datacrc = sprintf('%u', $datacrc[1] & 0xFFFFFFFF);
        $isize = unpack("V", substr($data, -4));
        $isize = $isize[1];
        // decompression:
        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            // IMPLEMENTATION BUG!
            return null;
        }
        if ($method !== 8) {
            // deflate (8) is the only supported compression method
            $error = "Unknown compression method.";
            return false;
        }
        $body = substr($data, $headerlen, $bodylen);
        $inflated = ($maxlength === null) ? gzinflate($body) : gzinflate($body, $maxlength);
        if ($inflated === false) {
            $error = "Decompression failed.";
            return false;
        }
        // Verify CRC32 and length against the footer
        $crc = sprintf("%u", crc32($inflated));
        $crcOK = $crc == $datacrc;
        $lenOK = $isize == strlen($inflated);
        if (!$lenOK || !$crcOK) {
            $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
            return false;
        }
        return $inflated;
    }
}
720?> \ No newline at end of file