]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
htmlawed via composer
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / humble-http-agent / HumbleHttpAgent.php
CommitLineData
3ec62cf9
MR
1<?php
2/**
3 * Humble HTTP Agent
4 *
5 * This class is designed to take advantage of parallel HTTP requests
6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
7 * For environments which do not have these options, it reverts to standard sequential
8 * requests (using file_get_contents())
9 *
10 * @version 1.4
11 * @date 2013-05-10
12 * @see http://php.net/HttpRequestPool
13 * @author Keyvan Minoukadeh
14 * @copyright 2011-2013 Keyvan Minoukadeh
15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
16 */
17
18class HumbleHttpAgent
19{
20 const METHOD_REQUEST_POOL = 1;
21 const METHOD_CURL_MULTI = 2;
22 const METHOD_FILE_GET_CONTENTS = 4;
23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
25 const UA_PHP = 'PHP/5.4';
26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
27
28 protected $requests = array();
29 protected $redirectQueue = array();
30 protected $requestOptions;
31 protected $maxParallelRequests = 5;
32 protected $cache = null; //TODO
33 protected $httpContext;
34 protected $minimiseMemoryUse = false; //TODO
35 protected $method;
36 protected $cookieJar;
37 public $debug = false;
38 public $debugVerbose = false;
39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
40 public $maxRedirects = 5;
41 public $userAgentMap = array();
42 public $rewriteUrls = array();
43 public $userAgentDefault;
44 public $referer;
45 //public $userAgent = 'Mozilla/5.0';
46
47 // Prevent certain file/mime types
48 // HTTP responses which match these content types will
49 // be returned without body.
50 public $headerOnlyTypes = array();
51 // URLs ending with one of these extensions will
52 // prompt Humble HTTP Agent to send a HEAD request first
53 // to see if returned content type matches $headerOnlyTypes.
54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
55 // AJAX triggers to search for.
56 // for AJAX sites, e.g. Blogger with its dynamic views templates.
57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
58
59 //TODO: set max file size
60 //TODO: normalise headers
61
/**
 * Create an agent and choose the HTTP transport it will use.
 *
 * @param array|null $requestOptions Options merged over the defaults
 *                                   (timeout, connecttimeout, dns_cache_timeout, redirect).
 * @param int|null   $method         One of the METHOD_* constants. Any other value
 *                                   triggers auto-detection: HttpRequestPool, then
 *                                   curl_multi, then file_get_contents().
 */
public function __construct($requestOptions=null, $method=null) {
    $this->userAgentDefault = self::UA_BROWSER;
    $this->referer = self::REF_GOOGLE;
    // set the request method
    $supported = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
    // is_numeric() keeps accepting 2 or '2' but rejects booleans: with loose
    // comparison `true` equals every METHOD_* value and would silently select
    // a transport that may not be installed.
    if (is_numeric($method) && in_array($method, $supported)) {
        $this->method = (int)$method;
    } elseif (class_exists('HttpRequestPool')) {
        $this->method = self::METHOD_REQUEST_POOL;
    } elseif (function_exists('curl_multi_init')) {
        $this->method = self::METHOD_CURL_MULTI;
    } else {
        $this->method = self::METHOD_FILE_GET_CONTENTS;
    }
    if ($this->method == self::METHOD_CURL_MULTI) {
        require_once(dirname(__FILE__).'/RollingCurl.php');
    }
    // create cookie jar
    $this->cookieJar = new CookieJar();
    // set request options (redirect must be 0)
    $this->requestOptions = array(
        'timeout' => 15,
        'connecttimeout' => 15,
        'dns_cache_timeout' => 300,
        // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
        'redirect' => 0
        // TODO: test onprogress?
    );
    if (is_array($requestOptions)) {
        $this->requestOptions = array_merge($this->requestOptions, $requestOptions);
    }
    // stream context template used by the file_get_contents() transport
    $this->httpContext = array(
        'http' => array(
            'ignore_errors' => true,
            'timeout' => $this->requestOptions['timeout'],
            'max_redirects' => $this->requestOptions['redirect'],
            'header' => "Accept: */*\r\n"
        )
    );
}
102
/**
 * Print a debug line (prefixed with "* ") when debugging is enabled.
 *
 * When $debugVerbose is set, current and peak memory use in KiB are appended.
 *
 * @param mixed $msg Message to print; anything echo can output.
 */
protected function debug($msg) {
    if (!$this->debug) {
        return;
    }
    echo '* ',$msg;
    if ($this->debugVerbose) {
        // only measure memory when the figures are actually printed
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo ' - mem used: ',$mem," (peak: $memPeak)";
    }
    echo "\n";
    // ob_flush() raises a notice when no output buffer is active
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}
114
/**
 * Build the User-Agent header for a URL.
 *
 * The host (with a leading "www." removed) is looked up in $userAgentMap,
 * first as-is and then as ".parent.domain" with its first label dropped.
 * $userAgentDefault is used when neither key is present.
 *
 * @param string $url
 * @param bool   $asArray return array('User-Agent' => ...) instead of a header line
 * @return array|string
 */
protected function getUserAgent($url, $asArray=false) {
    $hostname = @parse_url($url, PHP_URL_HOST);
    if (strtolower(substr($hostname, 0, 4)) == 'www.') {
        $hostname = substr($hostname, 4);
    }
    $agent = null;
    if ($hostname) {
        $candidates = array($hostname);
        $labels = explode('.', $hostname);
        if (count($labels) > 1) {
            // wildcard form: drop the first label, keep a leading dot
            $candidates[] = '.'.implode('.', array_slice($labels, 1));
        }
        foreach ($candidates as $candidate) {
            if (isset($this->userAgentMap[$candidate])) {
                $agent = $this->userAgentMap[$candidate];
                break;
            }
        }
    }
    if ($agent === null) {
        $agent = $this->userAgentDefault;
    }
    return $asArray ? array('User-Agent' => $agent) : 'User-Agent: '.$agent;
}
141
/**
 * Rewrite a "#!" (hashbang) URL into its "_escaped_fragment_" query form.
 *
 * URLs without "#!" are returned untouched.
 *
 * @param string $url
 * @return string
 */
public function rewriteHashbangFragment($url) {
    if (strpos($url, '#!') === false) {
        return $url;
    }
    // TODO: is SimplePie_IRI included?
    $iri = new SimplePie_IRI($url);
    // drop the first character of the fragment (the '!')
    $escaped = substr($iri->fragment, 1);
    $iri->fragment = null;
    $params = array();
    if (isset($iri->query)) {
        parse_str($iri->query, $params);
    }
    $params['_escaped_fragment_'] = (string)$escaped;
    // un-escaping '/' is needed for some sites
    $iri->query = str_replace('%2F', '/', http_build_query($params));
    return $iri->get_iri();
}
159
/**
 * Look for a redirect target inside an HTML document.
 *
 * A meta refresh is tried first; when none is found, the AJAX "ugly URL"
 * check is used instead.
 *
 * @param string $url  URL the HTML was fetched from
 * @param string $html HTML to inspect
 * @return mixed redirect URL, or a falsy value when none is found
 */
public function getRedirectURLfromHTML($url, $html) {
    $target = $this->getMetaRefreshURL($url, $html);
    return $target ? $target : $this->getUglyURL($url, $html);
}
167
/**
 * Extract the target of a <meta http-equiv="refresh"> tag.
 *
 * Only single-digit delays are matched. Targets that do not start with
 * http:// or https:// are resolved against $url.
 *
 * @param string $url  URL the HTML was fetched from (base for relative targets)
 * @param string $html HTML to inspect
 * @return mixed the redirect URL, or false when there is none
 */
public function getMetaRefreshURL($url, $html) {
    if ($html == '') {
        return false;
    }
    // e.g. <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
    $pattern = '!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i';
    if (!preg_match($pattern, $html, $found)) {
        return false;
    }
    $target = $found[1];
    if (!preg_match('!^https?://!i', $target)) {
        // relative target: absolutize against the page URL
        $base = new SimplePie_IRI($url);
        // remove '//' in URL path (causes URLs not to resolve properly)
        if (isset($base->path)) {
            $base->path = preg_replace('!//+!', '/', $base->path);
        }
        $target = SimplePie_IRI::absolutize($base, $target);
        if (!$target) {
            return false;
        }
    }
    $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$target);
    return $target;
}
190
/**
 * Build the AJAX-crawling "ugly URL" for a page that opts in via
 * <meta name="fragment" content="!">.
 *
 * When $html contains one of $ajaxTriggers (case-insensitive), an empty
 * "_escaped_fragment_" query parameter is added to $url.
 *
 * @param string $url  URL the HTML was fetched from
 * @param string $html HTML to inspect
 * @return mixed the rewritten URL, or false when no trigger is present
 */
public function getUglyURL($url, $html) {
    if ($html == '') return false;
    $found = false;
    foreach ($this->ajaxTriggers as $string) {
        // compare against false explicitly: a trigger at offset 0 is still a match
        if (stripos($html, $string) !== false) {
            $found = true;
            break;
        }
    }
    if (!$found) return false;
    $iri = new SimplePie_IRI($url);
    if (isset($iri->query)) {
        parse_str($iri->query, $query);
    } else {
        $query = array();
    }
    $query['_escaped_fragment_'] = '';
    $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
    $ugly_url = $iri->get_iri();
    $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
    return $ugly_url;
}
213
/**
 * Strip the fragment (everything from the first '#') from a URL.
 *
 * @param string $url
 * @return string
 */
public function removeFragment($url) {
    $hashAt = strpos($url, '#');
    return ($hashAt === false) ? $url : substr($url, 0, $hashAt);
}
222
/**
 * Apply the first matching rule from $rewriteUrls to a URL.
 *
 * Each rule maps a substring to look for onto an array of strtr()
 * replacements. Rules whose action is not an array are ignored.
 *
 * @param string $url
 * @return string the rewritten URL, or $url unchanged when no rule applies
 */
public function rewriteUrls($url) {
    foreach ($this->rewriteUrls as $needle => $replacements) {
        $applies = strpos($url, $needle) !== false && is_array($replacements);
        if ($applies) {
            return strtr($url, $replacements);
        }
    }
    return $url;
}
233
/**
 * Turn debug output on or off.
 *
 * @param mixed $bool any value; stored as a strict boolean
 */
public function enableDebug($bool=true) {
    $this->debug = $bool ? true : false;
}
237
/**
 * Set the minimise-memory flag (the feature itself is still marked TODO in this class).
 *
 * The value is normalised to a boolean, matching enableDebug().
 *
 * @param mixed $bool
 */
public function minimiseMemoryUse($bool = true) {
    $this->minimiseMemoryUse = (bool)$bool;
}
241
/**
 * Set how many requests may be sent in one parallel batch.
 *
 * The value is cast to int and clamped to at least 1: fetchAllOnce() takes
 * this many URLs off its work list per iteration, so a value of 0 or less
 * would never shrink the list and the loop would not terminate.
 *
 * @param int $max
 */
public function setMaxParallelRequests($max) {
    $this->maxParallelRequests = max(1, (int)$max);
}
245
/**
 * Sanitise a URL and check that it is a valid http(s) URL.
 *
 * FILTER_FLAG_SCHEME_REQUIRED is deliberately not passed: FILTER_VALIDATE_URL
 * has always implied it, and the constant was deprecated in PHP 7.3 and
 * removed in PHP 8.0, where referencing it is an error.
 *
 * @param string $url
 * @return string|false the sanitised URL, or false when it is not a valid http(s) URL
 */
public function validateUrl($url) {
    $url = filter_var($url, FILTER_SANITIZE_URL);
    $test = filter_var($url, FILTER_VALIDATE_URL);
    // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
    if ($test === false) {
        $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
    }
    if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
        return $url;
    }
    return false;
}
259
/**
 * Fetch all URLs, then keep following queued redirects.
 *
 * Redirect rounds continue until the redirect queue is empty or
 * $maxRedirects rounds have been made.
 *
 * @param array $urls
 */
public function fetchAll(array $urls) {
    $this->fetchAllOnce($urls, false);
    for ($round = 1; !empty($this->redirectQueue) && $round <= $this->maxRedirects; $round++) {
        $this->debug("Following redirects #$round...");
        $this->fetchAllOnce($this->redirectQueue, true);
    }
}
268
/**
 * Fetch every URL once with the configured transport, without following redirects.
 *
 * Responses are stored in $this->requests keyed by the original URL. HTTP
 * redirects, meta refreshes, AJAX "ugly URLs" and wrongly guessed HEAD
 * requests are put on $this->redirectQueue (original URL => next URL) so
 * that fetchAll() can follow them in a later round.
 *
 * @param array $urls       list of URLs, or, when $isRedirect is true, a map
 *                          of original URL => URL to fetch now
 * @param bool  $isRedirect true for a follow-up (redirect) round
 * @return null|false false when the HttpRequestPool transport throws an
 *                    HttpException; no value otherwise
 */
public function fetchAllOnce(array $urls, $isRedirect=false) {
    if (!$isRedirect) $urls = array_unique($urls);
    if (empty($urls)) return;

    // A batch size below 1 would never shrink $urls and loop forever.
    $batchSize = max(1, (int)$this->maxParallelRequests);

    //////////////////////////////////////////////////////
    // parallel (HttpRequestPool)
    if ($this->method == self::METHOD_REQUEST_POOL) {
        $this->debug('Starting parallel fetch (HttpRequestPool)');
        try {
            while (count($urls) > 0) {
                $this->debug('Processing set of '.min($batchSize, count($urls)));
                $subset = array_splice($urls, 0, $batchSize);
                $pool = new HttpRequestPool();
                // Only requests really attached in this round are processed
                // afterwards; "in memory" entries have no httpRequest object.
                $sent = array();
                foreach ($subset as $orig => $url) {
                    if (!$isRedirect) $orig = $url;
                    unset($this->redirectQueue[$orig]);
                    $this->debug("...$url");
                    if (!$isRedirect && isset($this->requests[$url])) {
                        $this->debug("......in memory");
                        continue;
                    }
                    $this->debug("......adding to pool");
                    $req_url = $this->prepareRequestUrl($url);
                    $_meth = $this->shouldProbeWithHead($orig, $req_url) ? HttpRequest::METH_HEAD : HttpRequest::METH_GET;
                    $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
                    // send cookies, if we have any
                    if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                        $this->debug("......sending cookies: $cookies");
                        $httpRequest->addHeaders(array('Cookie' => $cookies));
                    }
                    $httpRequest->addHeaders($this->getUserAgent($req_url, true));
                    // add referer for picky sites
                    $httpRequest->addHeaders(array('Referer' => $this->referer));
                    $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                    $this->requests[$orig]['original_url'] = $orig;
                    $pool->attach($httpRequest);
                    $sent[$orig] = $url;
                }
                // did we get anything into the pool?
                if (count($sent) > 0) {
                    $this->debug('Sending request...');
                    try {
                        $pool->send();
                    } catch (HttpRequestPoolException $e) {
                        // deliberate best effort: individual responses are still inspected below
                    }
                    $this->debug('Received responses');
                    foreach ($sent as $orig => $url) {
                        $request = $this->requests[$orig]['httpRequest'];
                        // getResponseHeader() doesn't return status line, so, for consistency...
                        $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
                        // check content type
                        // TODO: use getResponseHeader('content-type') or getResponseInfo()
                        if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                            $this->requests[$orig]['body'] = '';
                            $_header_only_type = true;
                            $this->debug('Header only type returned');
                        } else {
                            $this->requests[$orig]['body'] = $request->getResponseBody();
                            $_header_only_type = false;
                        }
                        $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
                        $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
                        // is redirect?
                        if ($this->isRedirectStatus($status_code) && $request->getResponseHeader('location')) {
                            $redirectURL = $this->resolveRedirectTarget($request->getResponseHeader('location'), $url);
                            if ($redirectURL !== false) {
                                // store any cookies
                                $cookies = $request->getResponseHeader('set-cookie');
                                if ($cookies && !is_array($cookies)) $cookies = array($cookies);
                                if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
                                $this->redirectQueue[$orig] = $redirectURL;
                            }
                        } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
                            // the response content-type did not match our 'header only' types,
                            // but we'd issued a HEAD request because we assumed it would. So
                            // let's queue a proper GET request for this item...
                            $this->debug('Wrong guess at content-type, queing GET request');
                            $this->requests[$orig]['wrongGuess'] = true;
                            $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                        } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
                            $this->queueRedirectFromBody($orig);
                        }
                        $pool->detach($request);
                        unset($this->requests[$orig]['httpRequest'], $request);
                    }
                }
            }
        } catch (HttpException $e) {
            $this->debug($e);
            return false;
        }
    }

    //////////////////////////////////////////////////////////
    // parallel (curl_multi_*)
    elseif ($this->method == self::METHOD_CURL_MULTI) {
        $this->debug('Starting parallel fetch (curl_multi_*)');
        // fall back to the overall timeout if no connect timeout is configured
        $connectTimeout = isset($this->requestOptions['connecttimeout'])
            ? $this->requestOptions['connecttimeout']
            : $this->requestOptions['timeout'];
        while (count($urls) > 0) {
            $this->debug('Processing set of '.min($batchSize, count($urls)));
            $subset = array_splice($urls, 0, $batchSize);
            $pool = new RollingCurl(array($this, 'handleCurlResponse'));
            $pool->window_size = count($subset);
            // requests actually added in this round (original URL => URL)
            $sent = array();

            foreach ($subset as $orig => $url) {
                if (!$isRedirect) $orig = $url;
                unset($this->redirectQueue[$orig]);
                $this->debug("...$url");
                if (!$isRedirect && isset($this->requests[$url])) {
                    $this->debug("......in memory");
                    continue;
                }
                $this->debug("......adding to pool");
                $req_url = $this->prepareRequestUrl($url);
                $_meth = $this->shouldProbeWithHead($orig, $req_url) ? 'HEAD' : 'GET';
                $headers = array();
                $headers[] = $this->getUserAgent($req_url);
                // add referer for picky sites
                $headers[] = 'Referer: '.$this->referer;
                // send cookies, if we have any
                if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                    $this->debug("......sending cookies: $cookies");
                    $headers[] = 'Cookie: '.$cookies;
                }
                $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
                    CURLOPT_CONNECTTIMEOUT => $connectTimeout,
                    CURLOPT_TIMEOUT => $this->requestOptions['timeout']
                ));
                $httpRequest->set_original_url($orig);
                $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
                $pool->add($httpRequest);
                $sent[$orig] = $url;
            }
            // did we get anything into the pool?
            if (count($sent) > 0) {
                $this->debug('Sending request...');
                $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
                $this->debug('Received responses');
                foreach ($sent as $orig => $url) {
                    if (!isset($this->requests[$orig]['status_code'])) {
                        // handleCurlResponse() never ran for this request: nothing to inspect
                        unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
                        continue;
                    }
                    // check content type
                    if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                        $this->requests[$orig]['body'] = '';
                        $_header_only_type = true;
                        $this->debug('Header only type returned');
                    } else {
                        $_header_only_type = false;
                    }
                    $status_code = $this->requests[$orig]['status_code'];
                    if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                        $redirectURL = $this->resolveRedirectTarget($this->requests[$orig]['location'], $url);
                        if ($redirectURL !== false) {
                            // store any cookies
                            $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                            if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                            $this->redirectQueue[$orig] = $redirectURL;
                        }
                    } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
                        // the response content-type did not match our 'header only' types,
                        // but we'd issued a HEAD request because we assumed it would. So
                        // let's queue a proper GET request for this item...
                        $this->debug('Wrong guess at content-type, queing GET request');
                        $this->requests[$orig]['wrongGuess'] = true;
                        $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
                    } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
                        $this->queueRedirectFromBody($orig);
                    }
                    unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
                }
            }
        }
    }

    //////////////////////////////////////////////////////
    // sequential (file_get_contents)
    else {
        $this->debug('Starting sequential fetch (file_get_contents)');
        $this->debug('Processing set of '.count($urls));
        foreach ($urls as $orig => $url) {
            if (!$isRedirect) $orig = $url;
            unset($this->redirectQueue[$orig]);
            $this->debug("...$url");
            if (!$isRedirect && isset($this->requests[$url])) {
                $this->debug("......in memory");
                continue;
            }
            $this->debug("Sending request for $url");
            $this->requests[$orig]['original_url'] = $orig;
            $req_url = $this->prepareRequestUrl($url);
            $httpContext = $this->httpContext;
            $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
            // add referer for picky sites
            $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
            // send cookies, if we have any
            if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                $this->debug("......sending cookies: $cookies");
                $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
            }
            // NOTE: $http_response_header is populated by PHP in this local scope,
            // so the file_get_contents() call has to stay inside this method.
            $html = @file_get_contents($req_url, false, stream_context_create($httpContext));
            if ($html === false) {
                $this->debug('Error retrieving URL');
                // TODO: handle error - failed to retrieve URL
                continue;
            }
            $this->debug('Received response');
            // get status code
            if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
                $this->debug('Error: no status code found');
                // TODO: handle error - no status code
                continue;
            }
            $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
            // check content type
            if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
                $this->requests[$orig]['body'] = '';
            } else {
                $this->requests[$orig]['body'] = $html;
            }
            $this->requests[$orig]['effective_url'] = $req_url;
            $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
            unset($match);
            // handle redirect
            if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
                $this->requests[$orig]['location'] = trim($match[1]);
            }
            if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                $redirectURL = $this->resolveRedirectTarget($this->requests[$orig]['location'], $url);
                if ($redirectURL !== false) {
                    // store any cookies
                    $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
                    if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
                    $this->redirectQueue[$orig] = $redirectURL;
                }
            } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
                $this->queueRedirectFromBody($orig);
            }
        }
    }
}

/**
 * Turn a URL into the one actually requested: apply $rewriteUrls, rewrite a
 * hashbang fragment (when enabled) and strip any remaining fragment.
 */
private function prepareRequestUrl($url) {
    $req_url = $this->rewriteUrls($url);
    if ($this->rewriteHashbangFragment) {
        $req_url = $this->rewriteHashbangFragment($req_url);
    }
    return $this->removeFragment($req_url);
}

/**
 * Decide whether to send a HEAD request first because the URL looks like a
 * header-only type; clears a previous 'wrongGuess' marker when answering no.
 */
private function shouldProbeWithHead($orig, $req_url) {
    if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
        return true;
    }
    unset($this->requests[$orig]['wrongGuess']);
    return false;
}

/**
 * Tell whether an HTTP status code is treated as a redirect (300-303, 307-399).
 */
private function isRedirectStatus($status_code) {
    return in_array($status_code, array(300, 301, 302, 303, 307)) || ($status_code > 307 && $status_code < 400);
}

/**
 * Absolutize a Location value against the requested URL and validate it;
 * returns the URL to follow, or false (after logging) when it is invalid.
 */
private function resolveRedirectTarget($location, $url) {
    if (!preg_match('!^https?://!i', $location)) {
        $location = SimplePie_Misc::absolutize_url($location, $url);
    }
    if ($this->validateUrl($location)) {
        $this->debug('Redirect detected. Valid URL: '.$location);
        return $location;
    }
    $this->debug('Redirect detected. Invalid URL: '.$location);
    return false;
}

/**
 * Queue a redirect found in the first 4000 bytes of a stored response body
 * (meta refresh, or <meta name='fragment' content='!'/> on AJAX sites such as
 * Blogger's dynamic views; see Google's AJAX crawling specification).
 */
private function queueRedirectFromBody($orig) {
    if (!isset($this->requests[$orig]['body'])) {
        return;
    }
    $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
    if ($redirectURL) {
        $this->redirectQueue[$orig] = $redirectURL;
    }
}
606
/**
 * RollingCurl callback: record one finished response in $this->requests.
 *
 * The raw response is split into headers and body at $info['header_size'].
 * Request method, effective URL, status code and any Location header value
 * are stored alongside, under the request's original URL.
 *
 * @param string $response raw response (headers followed by body)
 * @param array  $info     transfer info; 'header_size', 'url' and 'http_code' are read
 * @param object $request  request object; 'url_original' and 'method' are read
 */
public function handleCurlResponse($response, $info, $request) {
    $key = $request->url_original;
    $headerLength = $info['header_size'];
    $rawHeaders = substr($response, 0, $headerLength);
    $fields = array(
        'headers' => $rawHeaders,
        'body' => substr($response, $headerLength),
        'method' => $request->method,
        'effective_url' => $info['url'],
        'status_code' => (int)$info['http_code'],
    );
    if (preg_match('/^Location:(.*?)$/mi', $rawHeaders, $found)) {
        $fields['location'] = trim($found[1]);
    }
    // assign key by key so entries already stored for this URL are kept
    foreach ($fields as $name => $value) {
        $this->requests[$key][$name] = $value;
    }
}
618
/**
 * Join headers into a single newline-separated string.
 *
 * @param array $headers     list of raw header lines, or a name => value map
 *                           (a value may itself be an array of values)
 * @param bool  $associative true when $headers is a name => value map
 * @return string
 */
protected function headersToString(array $headers, $associative=true) {
    if (!$associative) {
        return implode("\n", $headers);
    }
    $lines = array();
    foreach ($headers as $name => $value) {
        $values = is_array($value) ? $value : array($value);
        foreach ($values as $single) {
            $lines[] = "$name: $single";
        }
    }
    return rtrim(implode("\n", $lines));
}
634
/**
 * Return the response for a URL, fetching it first if it is not in memory.
 *
 * @param string $url
 * @param bool   $remove   forget the stored response after returning it
 * @param bool   $gzdecode inflate the body when the headers announce gzip
 *                         content encoding and the body really is gzip data
 * @return array|false response array ('headers', 'body', 'effective_url',
 *                     'status_code', ...), or false when the request failed
 */
public function get($url, $remove=false, $gzdecode=true) {
    $url = "$url";
    if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
        $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
        $response = $this->requests[$url];
    } else {
        $this->debug("Fetching URL ($url)");
        $this->fetchAll(array($url));
        if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
            $response = $this->requests[$url];
        } else {
            $this->debug("Request failed");
            $response = false;
        }
    }
    if ($remove && $response) unset($this->requests[$url]);
    // $response is false after a failed request, so it must not be indexed
    if ($gzdecode && $response && isset($response['headers'])
        && stripos($response['headers'], 'Content-Encoding: gzip') !== false) {
        // The transport may already have decoded the body even though the header
        // still says gzip; check the gzip magic bytes before trying to inflate.
        if (strncmp($response['body'], "\x1f\x8b", 2) === 0 && ($html = gzdecode($response['body']))) {
            $response['body'] = $html;
        }
    }
    return $response;
}
670
/**
 * Report whether a parallel transport (HttpRequestPool or curl_multi) is available.
 *
 * @return bool
 */
public function parallelSupport() {
    if (class_exists('HttpRequestPool')) {
        return true;
    }
    return function_exists('curl_multi_init');
}
674
/**
 * Check whether the response Content-Type is listed in $headerOnlyTypes.
 *
 * Both the full mime type (e.g. image/jpeg) and the bare top-level type
 * (e.g. image) are looked up, lower-cased.
 *
 * @param string $headers raw response headers
 * @return bool
 */
private function headerOnlyType($headers) {
    if (!preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $parts)) {
        return false;
    }
    $fullType = strtolower(trim($parts[1]));
    $topLevel = strtolower(trim($parts[2]));
    return in_array($fullType, $this->headerOnlyTypes) || in_array($topLevel, $this->headerOnlyTypes);
}
686
/**
 * Guess from the URL path's file extension whether the resource may be a
 * header-only type (extension listed in $headerOnlyClues).
 *
 * @param string $url
 * @return bool
 */
private function possibleUnsupportedType($url) {
    $path = @parse_url($url, PHP_URL_PATH);
    if (!$path || strpos($path, '.') === false) {
        return false;
    }
    $extension = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
    return in_array($extension, $this->headerOnlyClues);
}
695}
696
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
// Fallback for PHP builds that lack a native gzdecode().
if (!function_exists('gzdecode')) {
    /**
     * Decode a gzip (RFC 1952) encoded string.
     *
     * @param string   $data      gzip data
     * @param string   $filename  receives the original file name stored in the header, if any
     * @param string   $error     receives a message for some failure cases
     * @param int|null $maxlength maximum decoded length passed to gzinflate() (null/0 = no limit)
     * @return string|false|null decoded data; null when the input is not gzip, uses
     *                           reserved flag bits or has no body; false on any other failure
     */
    function gzdecode($data,&$filename='',&$error='',$maxlength=null)
    {
        $len = strlen($data);
        if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
            $error = "Not in GZIP format.";
            return null; // Not GZIP format (See RFC 1952)
        }
        $method = ord(substr($data,2,1)); // Compression method
        $flags  = ord(substr($data,3,1)); // Flags
        // Parentheses are required: != binds tighter than &, so without them
        // files with the FTEXT bit were rejected and reserved bits never detected.
        if (($flags & 31) != $flags) {
            $error = "Reserved bits not allowed.";
            return null;
        }
        // Fixed part of the header: ID1 ID2 CM FLG MTIME(4) XFL OS = 10 bytes
        $headerlen = 10;
        if ($flags & 4) {
            // FEXTRA: 2-byte little-endian length, then that many bytes of extra data
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            // the length field directly follows the fixed header (offset 10)
            $extralen = unpack("v",substr($data,$headerlen,2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false; // invalid
            }
            $headerlen += 2 + $extralen;
        }
        $filename = "";
        if ($flags & 8) {
            // FNAME: zero-terminated original file name
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $filenamelen = strpos(substr($data,$headerlen),chr(0));
            if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
                return false; // invalid
            }
            $filename = substr($data,$headerlen,$filenamelen);
            $headerlen += $filenamelen + 1;
        }
        if ($flags & 16) {
            // FCOMMENT: zero-terminated comment, skipped
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid
            }
            $commentlen = strpos(substr($data,$headerlen),chr(0));
            if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
                return false; // Invalid header format
            }
            $headerlen += $commentlen + 1;
        }
        if ($flags & 2) {
            // FHCRC: low 16 bits of the CRC32 of the header so far
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid
            }
            $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
            $headercrc = unpack("v", substr($data,$headerlen,2));
            $headercrc = $headercrc[1];
            if ($headercrc != $calccrc) {
                $error = "Header checksum failed.";
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }
        // GZIP FOOTER: CRC32 of the uncompressed data, then its length (ISIZE)
        $datacrc = unpack("V",substr($data,-8,4));
        $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
        $isize = unpack("V",substr($data,-4));
        $isize = $isize[1];
        // decompression:
        $bodylen = $len-$headerlen-8;
        if ($bodylen < 1) {
            // nothing left between header and footer
            return null;
        }
        if ($method != 8) {
            // deflate (8) is the only supported compression method
            $error = "Unknown compression method.";
            return false;
        }
        $decoded = gzinflate(substr($data,$headerlen,$bodylen), (int)$maxlength);
        if ($decoded === false) {
            $error = "Decompression failed.";
            return false;
        }
        // Verify CRC32 and length
        $crcOK = sprintf("%u",crc32($decoded)) == $datacrc;
        $lenOK = $isize == strlen($decoded);
        if (!$lenOK || !$crcOK) {
            $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
            return false;
        }
        return $decoded;
    }
}