From 3ec62cf95ab4436923d4c665fad7aef226cbb822 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Thu, 22 May 2014 17:16:38 +0300 Subject: update to 3.2 version of full-text-rss, issue #694 --- .../libraries/humble-http-agent/CookieJar.php | 807 +++++----- .../humble-http-agent/HumbleHttpAgent.php | 1589 ++++++++++---------- .../SimplePie_HumbleHttpAgent.php | 157 +- 3 files changed, 1291 insertions(+), 1262 deletions(-) (limited to 'inc/3rdparty/libraries/humble-http-agent') diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php index 83e94f14..e4d5f495 100644 --- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php +++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php @@ -1,404 +1,403 @@ - - * - * This class should be used to handle cookies (storing cookies from HTTP response messages, and - * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org - * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ - * - * This class is mainly based on Cookies.pm from the libwww-perl collection . - * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. - * - * @version 0.5 - * @date 2011-03-15 - * @see http://php.net/HttpRequestPool - * @author Keyvan Minoukadeh - * @copyright 2011 Keyvan Minoukadeh - * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 - */ - -class CookieJar -{ - /** - * Cookies - array containing all cookies. - * - *
-    * Cookies are stored like this:
-    *   [domain][path][name] = array
-    * where array is:
-    *   0 => value, 1 => secure, 2 => expires
-    * 
- * @var array - * @access private - */ - public $cookies = array(); - public $debug = false; - - /** - * Constructor - */ - function __construct() { - } - - protected function debug($msg, $file=null, $line=null) { - if ($this->debug) { - $mem = round(memory_get_usage()/1024, 2); - $memPeak = round(memory_get_peak_usage()/1024, 2); - echo '* ',$msg; - if (isset($file, $line)) echo " ($file line $line)"; - echo ' - mem used: ',$mem," (peak: $memPeak)\n"; - ob_flush(); - flush(); - } - } - - /** - * Get matching cookies - * - * Only use this method if you cannot use add_cookie_header(), for example, if you want to use - * this cookie jar class without using the request class. - * - * @param array $param associative array containing 'domain', 'path', 'secure' keys - * @return string - * @see add_cookie_header() - */ - public function getMatchingCookies($url) - { - if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { - $param['domain'] = $parts['host']; - $param['path'] = $parts['path']; - $param['secure'] = (strtolower($parts['scheme']) == 'https'); - unset($parts); - } else { - return false; - } - // RFC 2965 notes: - // If multiple cookies satisfy the criteria above, they are ordered in - // the Cookie header such that those with more specific Path attributes - // precede those with less specific. Ordering with respect to other - // attributes (e.g., Domain) is unspecified. 
- $domain = $param['domain']; - if (strpos($domain, '.') === false) $domain .= '.local'; - $request_path = $param['path']; - if ($request_path == '') $request_path = '/'; - $request_secure = $param['secure']; - $now = time(); - $matched_cookies = array(); - // domain - find matching domains - $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); - while (strpos($domain, '.') !== false) { - if (isset($this->cookies[$domain])) { - $this->debug(' domain match found: '.$domain); - $cookies =& $this->cookies[$domain]; - } else { - $domain = $this->_reduce_domain($domain); - continue; - } - // paths - find matching paths starting from most specific - $this->debug(' - Finding matching paths for '.$request_path); - $paths = array_keys($cookies); - usort($paths, array($this, '_cmp_length')); - foreach ($paths as $path) { - // continue to next cookie if request path does not path-match cookie path - if (!$this->_path_match($request_path, $path)) continue; - // loop through cookie names - $this->debug(' path match found: '.$path); - foreach ($cookies[$path] as $name => $values) { - // if this cookie is secure but request isn't, continue to next cookie - if ($values[1] && !$request_secure) continue; - // if cookie is not a session cookie and has expired, continue to next cookie - if (is_int($values[2]) && ($values[2] < $now)) continue; - // cookie matches request - $this->debug(' cookie match: '.$name.'='.$values[0]); - $matched_cookies[] = $name.'='.$values[0]; - } - } - $domain = $this->_reduce_domain($domain); - } - // return cookies - return implode('; ', $matched_cookies); - } - - /** - * Parse Set-Cookie values. - * - * Only use this method if you cannot use extract_cookies(), for example, if you want to use - * this cookie jar class without using the response class. 
- * - * @param array $set_cookies array holding 1 or more "Set-Cookie" header values - * @param array $param associative array containing 'host', 'path' keys - * @return void - * @see extract_cookies() - */ - public function storeCookies($url, $set_cookies) - { - if (count($set_cookies) == 0) return; - $param = @parse_url($url); - if (!is_array($param) || !isset($param['host'])) return; - $request_host = $param['host']; - if (strpos($request_host, '.') === false) $request_host .= '.local'; - $request_path = @$param['path']; - if ($request_path == '') $request_path = '/'; - // - // loop through set-cookie headers - // - foreach ($set_cookies as $set_cookie) { - $this->debug('Parsing: '.$set_cookie); - // temporary cookie store (before adding to jar) - $tmp_cookie = array(); - $param = explode(';', $set_cookie); - // loop through params - for ($x=0; $x$key, 'value'=>$val); - continue; - } - $key = strtolower($key); - if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { - $tmp_cookie[$key] = $val; - } - } - // - // set cookie - // - // check domain - if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && - ($tmp_cookie['domain'] != ".$request_host")) { - $domain = $tmp_cookie['domain']; - if ((strpos($domain, '.') === false) && ($domain != 'local')) { - $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); - continue; - } - if (preg_match('/\.[0-9]+$/', $domain)) { - $this->debug(' - domain "'.$domain.'" appears to be an ip address'); - continue; - } - if (substr($domain, 0, 1) != '.') $domain = ".$domain"; - if (!$this->_domain_match($request_host, $domain)) { - $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); - continue; - } - } else { - // if domain is not specified in the set-cookie header, domain will default to - // the request host - $domain = $request_host; - } - // check path - if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { - $path = 
urldecode($tmp_cookie['path']); - if (!$this->_path_match($request_path, $path)) { - $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); - continue; - } - } else { - $path = $request_path; - $path = substr($path, 0, strrpos($path, '/')); - if ($path == '') $path = '/'; - } - // check if secure - $secure = (isset($tmp_cookie['secure'])) ? true : false; - // check expiry - if (isset($tmp_cookie['expires'])) { - if (($expires = strtotime($tmp_cookie['expires'])) < 0) { - $expires = null; - } - } else { - $expires = null; - } - // set cookie - $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); - } - } - - // return array of set-cookie values extracted from HTTP response headers (string $h) - public function extractCookies($h) { - $x = 0; - $lines = 0; - $headers = array(); - $last_match = false; - $h = explode("\n", $h); - foreach ($h as $line) { - $line = rtrim($line); - $lines++; - - $trimmed_line = trim($line); - if (isset($line_last)) { - // check if we have \r\n\r\n (indicating the end of headers) - // some servers will not use CRLF (\r\n), so we make CR (\r) optional. - // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { - // break; - // } - // As an alternative, we can check if the current trimmed line is empty - if ($trimmed_line == '') { - break; - } - - // check for continuation line... - // RFC 2616 Section 2.2 "Basic Rules": - // HTTP/1.1 header field values can be folded onto multiple lines if the - // continuation line begins with a space or horizontal tab. All linear - // white space, including folding, has the same semantics as SP. A - // recipient MAY replace any linear white space with a single SP before - // interpreting the field value or forwarding the message downstream. 
- if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { - // append to previous header value - $headers[$x-1] .= ' '.rtrim($match[1]); - continue; - } - } - $line_last = $line; - - // split header name and value - if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { - $headers[$x++] = rtrim($match[1]); - $last_match = true; - } else { - $last_match = false; - } - } - return $headers; - } - - /** - * Set Cookie - * @param string $domain - * @param string $path - * @param string $name cookie name - * @param string $value cookie value - * @param bool $secure - * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) - * @return void - */ - function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) - { - if ($domain == '') return; - if ($path == '') return; - if ($name == '') return; - // check if cookie needs to go - if (isset($expires) && ($expires <= 0)) { - if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); - return; - } - if ($value == '') return; - $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); - return; - } - - /** - * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. 
- * @param string $domain - * @param string $path - * @param string $name - * @return void - */ - function clear($domain=null, $path=null, $name=null) - { - if (!isset($domain)) { - $this->cookies = array(); - } elseif (!isset($path)) { - if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); - } elseif (!isset($name)) { - if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); - } elseif (isset($name)) { - if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); - } - } - - /** - * Compare string length - used for sorting - * @access private - * @return int - */ - function _cmp_length($a, $b) - { - $la = strlen($a); $lb = strlen($b); - if ($la == $lb) return 0; - return ($la > $lb) ? -1 : 1; - } - - /** - * Reduce domain - * @param string $domain - * @return string - * @access private - */ - function _reduce_domain($domain) - { - if ($domain == '') return ''; - if (substr($domain, 0, 1) == '.') return substr($domain, 1); - return substr($domain, strpos($domain, '.')); - } - - /** - * Path match - check if path1 path-matches path2 - * - * From RFC 2965: - * For two strings that represent paths, P1 and P2, P1 path-matches P2 - * if P2 is a prefix of P1 (including the case where P1 and P2 string- - * compare equal). Thus, the string /tec/waldo path-matches /tec. - * @param string $path1 - * @param string $path2 - * @return bool - * @access private - */ - function _path_match($path1, $path2) - { - return (substr($path1, 0, strlen($path2)) == $path2); - } - - /** - * Domain match - check if domain1 domain-matches domain2 - * - * A few extracts from RFC 2965: - * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com - * would be rejected, because H is y.x and contains a dot. - * - * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com - * would be accepted. 
- * - * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be - * rejected, because there is no embedded dot. - * - * - A Set-Cookie2 from request-host example for Domain=.local will - * be accepted, because the effective host name for the request- - * host is example.local, and example.local domain-matches .local. - * - * I'm ignoring the first point for now (must check to see how other browsers handle - * this rule for Set-Cookie headers) - * - * @param string $domain1 - * @param string $domain2 - * @return bool - * @access private - */ - function _domain_match($domain1, $domain2) - { - $domain1 = strtolower($domain1); - $domain2 = strtolower($domain2); - while (strpos($domain1, '.') !== false) { - if ($domain1 == $domain2) return true; - $domain1 = $this->_reduce_domain($domain1); - continue; - } - return false; - } -} -?> \ No newline at end of file + + * + * This class should be used to handle cookies (storing cookies from HTTP response messages, and + * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org + * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ + * + * This class is mainly based on Cookies.pm from the libwww-perl collection . + * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. + * + * @version 0.5 + * @date 2011-03-15 + * @see http://php.net/HttpRequestPool + * @author Keyvan Minoukadeh + * @copyright 2011 Keyvan Minoukadeh + * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 + */ + +class CookieJar +{ + /** + * Cookies - array containing all cookies. + * + *
+    * Cookies are stored like this:
+    *   [domain][path][name] = array
+    * where array is:
+    *   0 => value, 1 => secure, 2 => expires
+    * 
+ * @var array + * @access private + */ + public $cookies = array(); + public $debug = false; + + /** + * Constructor + */ + function __construct() { + } + + protected function debug($msg, $file=null, $line=null) { + if ($this->debug) { + $mem = round(memory_get_usage()/1024, 2); + $memPeak = round(memory_get_peak_usage()/1024, 2); + echo '* ',$msg; + if (isset($file, $line)) echo " ($file line $line)"; + echo ' - mem used: ',$mem," (peak: $memPeak)\n"; + ob_flush(); + flush(); + } + } + + /** + * Get matching cookies + * + * Only use this method if you cannot use add_cookie_header(), for example, if you want to use + * this cookie jar class without using the request class. + * + * @param array $param associative array containing 'domain', 'path', 'secure' keys + * @return string + * @see add_cookie_header() + */ + public function getMatchingCookies($url) + { + if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { + $param['domain'] = $parts['host']; + $param['path'] = $parts['path']; + $param['secure'] = (strtolower($parts['scheme']) == 'https'); + unset($parts); + } else { + return false; + } + // RFC 2965 notes: + // If multiple cookies satisfy the criteria above, they are ordered in + // the Cookie header such that those with more specific Path attributes + // precede those with less specific. Ordering with respect to other + // attributes (e.g., Domain) is unspecified. 
+ $domain = $param['domain']; + if (strpos($domain, '.') === false) $domain .= '.local'; + $request_path = $param['path']; + if ($request_path == '') $request_path = '/'; + $request_secure = $param['secure']; + $now = time(); + $matched_cookies = array(); + // domain - find matching domains + $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); + while (strpos($domain, '.') !== false) { + if (isset($this->cookies[$domain])) { + $this->debug(' domain match found: '.$domain); + $cookies =& $this->cookies[$domain]; + } else { + $domain = $this->_reduce_domain($domain); + continue; + } + // paths - find matching paths starting from most specific + $this->debug(' - Finding matching paths for '.$request_path); + $paths = array_keys($cookies); + usort($paths, array($this, '_cmp_length')); + foreach ($paths as $path) { + // continue to next cookie if request path does not path-match cookie path + if (!$this->_path_match($request_path, $path)) continue; + // loop through cookie names + $this->debug(' path match found: '.$path); + foreach ($cookies[$path] as $name => $values) { + // if this cookie is secure but request isn't, continue to next cookie + if ($values[1] && !$request_secure) continue; + // if cookie is not a session cookie and has expired, continue to next cookie + if (is_int($values[2]) && ($values[2] < $now)) continue; + // cookie matches request + $this->debug(' cookie match: '.$name.'='.$values[0]); + $matched_cookies[] = $name.'='.$values[0]; + } + } + $domain = $this->_reduce_domain($domain); + } + // return cookies + return implode('; ', $matched_cookies); + } + + /** + * Parse Set-Cookie values. + * + * Only use this method if you cannot use extract_cookies(), for example, if you want to use + * this cookie jar class without using the response class. 
+ * + * @param array $set_cookies array holding 1 or more "Set-Cookie" header values + * @param array $param associative array containing 'host', 'path' keys + * @return void + * @see extract_cookies() + */ + public function storeCookies($url, $set_cookies) + { + if (count($set_cookies) == 0) return; + $param = @parse_url($url); + if (!is_array($param) || !isset($param['host'])) return; + $request_host = $param['host']; + if (strpos($request_host, '.') === false) $request_host .= '.local'; + $request_path = @$param['path']; + if ($request_path == '') $request_path = '/'; + // + // loop through set-cookie headers + // + foreach ($set_cookies as $set_cookie) { + $this->debug('Parsing: '.$set_cookie); + // temporary cookie store (before adding to jar) + $tmp_cookie = array(); + $param = explode(';', $set_cookie); + // loop through params + for ($x=0; $x$key, 'value'=>$val); + continue; + } + $key = strtolower($key); + if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { + $tmp_cookie[$key] = $val; + } + } + // + // set cookie + // + // check domain + if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && + ($tmp_cookie['domain'] != ".$request_host")) { + $domain = $tmp_cookie['domain']; + if ((strpos($domain, '.') === false) && ($domain != 'local')) { + $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); + continue; + } + if (preg_match('/\.[0-9]+$/', $domain)) { + $this->debug(' - domain "'.$domain.'" appears to be an ip address'); + continue; + } + if (substr($domain, 0, 1) != '.') $domain = ".$domain"; + if (!$this->_domain_match($request_host, $domain)) { + $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); + continue; + } + } else { + // if domain is not specified in the set-cookie header, domain will default to + // the request host + $domain = $request_host; + } + // check path + if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { + $path = 
urldecode($tmp_cookie['path']); + if (!$this->_path_match($request_path, $path)) { + $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); + continue; + } + } else { + $path = $request_path; + $path = substr($path, 0, strrpos($path, '/')); + if ($path == '') $path = '/'; + } + // check if secure + $secure = (isset($tmp_cookie['secure'])) ? true : false; + // check expiry + if (isset($tmp_cookie['expires'])) { + if (($expires = strtotime($tmp_cookie['expires'])) < 0) { + $expires = null; + } + } else { + $expires = null; + } + // set cookie + $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); + } + } + + // return array of set-cookie values extracted from HTTP response headers (string $h) + public function extractCookies($h) { + $x = 0; + $lines = 0; + $headers = array(); + $last_match = false; + $h = explode("\n", $h); + foreach ($h as $line) { + $line = rtrim($line); + $lines++; + + $trimmed_line = trim($line); + if (isset($line_last)) { + // check if we have \r\n\r\n (indicating the end of headers) + // some servers will not use CRLF (\r\n), so we make CR (\r) optional. + // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { + // break; + // } + // As an alternative, we can check if the current trimmed line is empty + if ($trimmed_line == '') { + break; + } + + // check for continuation line... + // RFC 2616 Section 2.2 "Basic Rules": + // HTTP/1.1 header field values can be folded onto multiple lines if the + // continuation line begins with a space or horizontal tab. All linear + // white space, including folding, has the same semantics as SP. A + // recipient MAY replace any linear white space with a single SP before + // interpreting the field value or forwarding the message downstream. 
+ if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { + // append to previous header value + $headers[$x-1] .= ' '.rtrim($match[1]); + continue; + } + } + $line_last = $line; + + // split header name and value + if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { + $headers[$x++] = rtrim($match[1]); + $last_match = true; + } else { + $last_match = false; + } + } + return $headers; + } + + /** + * Set Cookie + * @param string $domain + * @param string $path + * @param string $name cookie name + * @param string $value cookie value + * @param bool $secure + * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) + * @return void + */ + function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) + { + if ($domain == '') return; + if ($path == '') return; + if ($name == '') return; + // check if cookie needs to go + if (isset($expires) && ($expires <= 0)) { + if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); + return; + } + if ($value == '') return; + $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); + return; + } + + /** + * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. 
+ * @param string $domain + * @param string $path + * @param string $name + * @return void + */ + function clear($domain=null, $path=null, $name=null) + { + if (!isset($domain)) { + $this->cookies = array(); + } elseif (!isset($path)) { + if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); + } elseif (!isset($name)) { + if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); + } elseif (isset($name)) { + if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); + } + } + + /** + * Compare string length - used for sorting + * @access private + * @return int + */ + function _cmp_length($a, $b) + { + $la = strlen($a); $lb = strlen($b); + if ($la == $lb) return 0; + return ($la > $lb) ? -1 : 1; + } + + /** + * Reduce domain + * @param string $domain + * @return string + * @access private + */ + function _reduce_domain($domain) + { + if ($domain == '') return ''; + if (substr($domain, 0, 1) == '.') return substr($domain, 1); + return substr($domain, strpos($domain, '.')); + } + + /** + * Path match - check if path1 path-matches path2 + * + * From RFC 2965: + * For two strings that represent paths, P1 and P2, P1 path-matches P2 + * if P2 is a prefix of P1 (including the case where P1 and P2 string- + * compare equal). Thus, the string /tec/waldo path-matches /tec. + * @param string $path1 + * @param string $path2 + * @return bool + * @access private + */ + function _path_match($path1, $path2) + { + return (substr($path1, 0, strlen($path2)) == $path2); + } + + /** + * Domain match - check if domain1 domain-matches domain2 + * + * A few extracts from RFC 2965: + * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com + * would be rejected, because H is y.x and contains a dot. + * + * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com + * would be accepted. 
+ * + * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be + * rejected, because there is no embedded dot. + * + * - A Set-Cookie2 from request-host example for Domain=.local will + * be accepted, because the effective host name for the request- + * host is example.local, and example.local domain-matches .local. + * + * I'm ignoring the first point for now (must check to see how other browsers handle + * this rule for Set-Cookie headers) + * + * @param string $domain1 + * @param string $domain2 + * @return bool + * @access private + */ + function _domain_match($domain1, $domain2) + { + $domain1 = strtolower($domain1); + $domain2 = strtolower($domain2); + while (strpos($domain1, '.') !== false) { + if ($domain1 == $domain2) return true; + $domain1 = $this->_reduce_domain($domain1); + continue; + } + return false; + } +} \ No newline at end of file diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php index e4f1b3b3..963f0c05 100644 --- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php @@ -1,779 +1,810 @@ -userAgentDefault = self::UA_BROWSER; - $this->referer = self::REF_GOOGLE; - // set the request method - if (in_array($method, array(1,2,4))) { - $this->method = $method; - } else { - if (class_exists('HttpRequestPool')) { - $this->method = self::METHOD_REQUEST_POOL; - } elseif (function_exists('curl_multi_init')) { - $this->method = self::METHOD_CURL_MULTI; - } else { - $this->method = self::METHOD_FILE_GET_CONTENTS; - } - } - if ($this->method == self::METHOD_CURL_MULTI) { - require_once(dirname(__FILE__).'/RollingCurl.php'); - } - // create cookie jar - $this->cookieJar = new CookieJar(); - // set request options (redirect must be 0) - $this->requestOptions = array( - 'timeout' => 15, - 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are 
creeping up over the web - // TODO: test onprogress? - ); - if (is_array($requestOptions)) { - $this->requestOptions = array_merge($this->requestOptions, $requestOptions); - } - $this->httpContext = array( - 'http' => array( - 'ignore_errors' => true, - 'timeout' => $this->requestOptions['timeout'], - 'max_redirects' => $this->requestOptions['redirect'], - 'header' => "Accept: */*\r\n" - ) - ); - } - - protected function debug($msg) { - if ($this->debug) { - $mem = round(memory_get_usage()/1024, 2); - $memPeak = round(memory_get_peak_usage()/1024, 2); - echo '* ',$msg; - if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; - echo "\n"; - ob_flush(); - flush(); - } - } - - protected function getUserAgent($url, $asArray=false) { - $host = @parse_url($url, PHP_URL_HOST); - if (strtolower(substr($host, 0, 4)) == 'www.') { - $host = substr($host, 4); - } - if ($host) { - $try = array($host); - $split = explode('.', $host); - if (count($split) > 1) { - array_shift($split); - $try[] = '.'.implode('.', $split); - } - foreach ($try as $h) { - if (isset($this->userAgentMap[$h])) { - $ua = $this->userAgentMap[$h]; - break; - } - } - } - if (!isset($ua)) $ua = $this->userAgentDefault; - if ($asArray) { - return array('User-Agent' => $ua); - } else { - return 'User-Agent: '.$ua; - } - } - - public function rewriteHashbangFragment($url) { - // return $url if there's no '#!' - if (strpos($url, '#!') === false) return $url; - // split $url and rewrite - // TODO: is SimplePie_IRI included? - $iri = new SimplePie_IRI($url); - $fragment = substr($iri->fragment, 1); // strip '!' 
- $iri->fragment = null; - if (isset($iri->query)) { - parse_str($iri->query, $query); - } else { - $query = array(); - } - $query['_escaped_fragment_'] = (string)$fragment; - $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites - return $iri->get_iri(); - } - - public function getUglyURL($url, $html) { - if ($html == '') return false; - $found = false; - foreach ($this->ajaxTriggers as $string) { - if (stripos($html, $string)) { - $found = true; - break; - } - } - if (!$found) return false; - $iri = new SimplePie_IRI($url); - if (isset($iri->query)) { - parse_str($iri->query, $query); - } else { - $query = array(); - } - $query['_escaped_fragment_'] = ''; - $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites - return $iri->get_iri(); - } - - public function removeFragment($url) { - $pos = strpos($url, '#'); - if ($pos === false) { - return $url; - } else { - return substr($url, 0, $pos); - } - } - - public function rewriteUrls($url) { - foreach ($this->rewriteUrls as $find => $action) { - if (strpos($url, $find) !== false) { - if (is_array($action)) { - return strtr($url, $action); - } - } - } - return $url; - } - - public function enableDebug($bool=true) { - $this->debug = (bool)$bool; - } - - public function minimiseMemoryUse($bool = true) { - $this->minimiseMemoryUse = $bool; - } - - public function setMaxParallelRequests($max) { - $this->maxParallelRequests = $max; - } - - public function validateUrl($url) { - $url = filter_var($url, FILTER_SANITIZE_URL); - $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); - // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) - if ($test === false) { - $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); - } - if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { - return $url; - } else { - return false; - } - } - - public function 
fetchAll(array $urls) { - $this->fetchAllOnce($urls, $isRedirect=false); - $redirects = 0; - while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { - $this->debug("Following redirects #$redirects..."); - $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); - } - } - - // fetch all URLs without following redirects - public function fetchAllOnce(array $urls, $isRedirect=false) { - if (!$isRedirect) $urls = array_unique($urls); - if (empty($urls)) return; - - ////////////////////////////////////////////////////// - // parallel (HttpRequestPool) - if ($this->method == self::METHOD_REQUEST_POOL) { - $this->debug('Starting parallel fetch (HttpRequestPool)'); - try { - while (count($urls) > 0) { - $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); - $subset = array_splice($urls, 0, $this->maxParallelRequests); - $pool = new HttpRequestPool(); - foreach ($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - unset($this->redirectQueue[$orig]); - $this->debug("...$url"); - if (!$isRedirect && isset($this->requests[$url])) { - $this->debug("......in memory"); - /* - } elseif ($this->isCached($url)) { - $this->debug("......is cached"); - if (!$this->minimiseMemoryUse) { - $this->requests[$url] = $this->getCached($url); - } - */ - } else { - $this->debug("......adding to pool"); - $req_url = $this->rewriteUrls($url); - $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; - $req_url = $this->removeFragment($req_url); - if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { - $_meth = HttpRequest::METH_HEAD; - } else { - $_meth = HttpRequest::METH_GET; - unset($this->requests[$orig]['wrongGuess']); - } - $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); - // send cookies, if we have any - if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { - $this->debug("......sending cookies: $cookies"); - $httpRequest->addHeaders(array('Cookie' => $cookies)); - } - //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); - $httpRequest->addHeaders($this->getUserAgent($req_url, true)); - // add referer for picky sites - $httpRequest->addheaders(array('Referer' => $this->referer)); - $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); - $this->requests[$orig]['original_url'] = $orig; - $pool->attach($httpRequest); - } - } - // did we get anything into the pool? - if (count($pool) > 0) { - $this->debug('Sending request...'); - try { - $pool->send(); - } catch (HttpRequestPoolException $e) { - // do nothing - } - $this->debug('Received responses'); - foreach($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - $request = $this->requests[$orig]['httpRequest']; - //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); - // getResponseHeader() doesn't return status line, so, for consistency... 
- $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); - // check content type - // TODO: use getResponseHeader('content-type') or getResponseInfo() - if ($this->headerOnlyType($this->requests[$orig]['headers'])) { - $this->requests[$orig]['body'] = ''; - $_header_only_type = true; - $this->debug('Header only type returned'); - } else { - $this->requests[$orig]['body'] = $request->getResponseBody(); - $_header_only_type = false; - } - $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); - $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); - // is redirect? - if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { - $redirectURL = $request->getResponseHeader('location'); - if (!preg_match('!^https?://!i', $redirectURL)) { - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); - } - if ($this->validateURL($redirectURL)) { - $this->debug('Redirect detected. Valid URL: '.$redirectURL); - // store any cookies - $cookies = $request->getResponseHeader('set-cookie'); - if ($cookies && !is_array($cookies)) $cookies = array($cookies); - if ($cookies) $this->cookieJar->storeCookies($url, $cookies); - $this->redirectQueue[$orig] = $redirectURL; - } else { - $this->debug('Redirect detected. Invalid URL: '.$redirectURL); - } - } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { - // the response content-type did not match our 'header only' types, - // but we'd issues a HEAD request because we assumed it would. So - // let's queue a proper GET request for this item... 
- $this->debug('Wrong guess at content-type, queing GET request'); - $this->requests[$orig]['wrongGuess'] = true; - $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; - } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { - // check for - // for AJAX sites, e.g. Blogger with its dynamic views templates. - // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); - if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); - $this->redirectQueue[$orig] = $redirectURL; - } - } - } - //die($url.' -multi- '.$request->getResponseInfo('effective_url')); - $pool->detach($request); - unset($this->requests[$orig]['httpRequest'], $request); - /* - if ($this->minimiseMemoryUse) { - if ($this->cache($url)) { - unset($this->requests[$url]); - } - } - */ - } - } - } - } catch (HttpException $e) { - $this->debug($e); - return false; - } - } - - ////////////////////////////////////////////////////////// - // parallel (curl_multi_*) - elseif ($this->method == self::METHOD_CURL_MULTI) { - $this->debug('Starting parallel fetch (curl_multi_*)'); - while (count($urls) > 0) { - $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); - $subset = array_splice($urls, 0, $this->maxParallelRequests); - $pool = new RollingCurl(array($this, 'handleCurlResponse')); - $pool->window_size = count($subset); - - foreach ($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - unset($this->redirectQueue[$orig]); - $this->debug("...$url"); - if (!$isRedirect && isset($this->requests[$url])) { - $this->debug("......in memory"); - /* - } elseif ($this->isCached($url)) { - $this->debug("......is cached"); - if (!$this->minimiseMemoryUse) { - 
$this->requests[$url] = $this->getCached($url); - } - */ - } else { - $this->debug("......adding to pool"); - $req_url = $this->rewriteUrls($url); - $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; - $req_url = $this->removeFragment($req_url); - if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { - $_meth = 'HEAD'; - } else { - $_meth = 'GET'; - unset($this->requests[$orig]['wrongGuess']); - } - $headers = array(); - //$headers[] = 'User-Agent: '.$this->userAgent; - $headers[] = $this->getUserAgent($req_url); - // add referer for picky sites - $headers[] = 'Referer: '.$this->referer; - // send cookies, if we have any - if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { - $this->debug("......sending cookies: $cookies"); - $headers[] = 'Cookie: '.$cookies; - } - $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( - CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], - CURLOPT_TIMEOUT => $this->requestOptions['timeout'] - )); - $httpRequest->set_original_url($orig); - $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); - $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? - $pool->add($httpRequest); - } - } - // did we get anything into the pool? 
- if (count($pool) > 0) { - $this->debug('Sending request...'); - $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] - $this->debug('Received responses'); - foreach($subset as $orig => $url) { - if (!$isRedirect) $orig = $url; - // $this->requests[$orig]['headers'] - // $this->requests[$orig]['body'] - // $this->requests[$orig]['effective_url'] - // check content type - if ($this->headerOnlyType($this->requests[$orig]['headers'])) { - $this->requests[$orig]['body'] = ''; - $_header_only_type = true; - $this->debug('Header only type returned'); - } else { - $_header_only_type = false; - } - $status_code = $this->requests[$orig]['status_code']; - if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { - $redirectURL = $this->requests[$orig]['location']; - if (!preg_match('!^https?://!i', $redirectURL)) { - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); - } - if ($this->validateURL($redirectURL)) { - $this->debug('Redirect detected. Valid URL: '.$redirectURL); - // store any cookies - $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); - if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); - $this->redirectQueue[$orig] = $redirectURL; - } else { - $this->debug('Redirect detected. Invalid URL: '.$redirectURL); - } - } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { - // the response content-type did not match our 'header only' types, - // but we'd issues a HEAD request because we assumed it would. So - // let's queue a proper GET request for this item... 
- $this->debug('Wrong guess at content-type, queing GET request'); - $this->requests[$orig]['wrongGuess'] = true; - $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; - } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { - // check for - // for AJAX sites, e.g. Blogger with its dynamic views templates. - // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); - if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); - $this->redirectQueue[$orig] = $redirectURL; - } - } - } - // die($url.' -multi- '.$request->getResponseInfo('effective_url')); - unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); - } - } - } - } - - ////////////////////////////////////////////////////// - // sequential (file_get_contents) - else { - $this->debug('Starting sequential fetch (file_get_contents)'); - $this->debug('Processing set of '.count($urls)); - foreach ($urls as $orig => $url) { - if (!$isRedirect) $orig = $url; - unset($this->redirectQueue[$orig]); - $this->debug("...$url"); - if (!$isRedirect && isset($this->requests[$url])) { - $this->debug("......in memory"); - /* - } elseif ($this->isCached($url)) { - $this->debug("......is cached"); - if (!$this->minimiseMemoryUse) { - $this->requests[$url] = $this->getCached($url); - } - */ - } else { - $this->debug("Sending request for $url"); - $this->requests[$orig]['original_url'] = $orig; - $req_url = $this->rewriteUrls($url); - $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; - $req_url = $this->removeFragment($req_url); - // send cookies, if we have any - $httpContext = $this->httpContext; - $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; - // add referer for picky sites - $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; - if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { - $this->debug("......sending cookies: $cookies"); - $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; - } - if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { - $this->debug('Received response'); - // get status code - if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { - $this->debug('Error: no status code found'); - // TODO: handle error - no status code - } else { - $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); - // check content type - if ($this->headerOnlyType($this->requests[$orig]['headers'])) { - $this->requests[$orig]['body'] = ''; - } else { - $this->requests[$orig]['body'] = $html; - } - $this->requests[$orig]['effective_url'] = $req_url; - $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; - unset($match); - // handle redirect - if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { - $this->requests[$orig]['location'] = trim($match[1]); - } - if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { - $redirectURL = $this->requests[$orig]['location']; - if (!preg_match('!^https?://!i', $redirectURL)) { - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); - } - if ($this->validateURL($redirectURL)) { - $this->debug('Redirect detected. 
Valid URL: '.$redirectURL); - // store any cookies - $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); - if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); - $this->redirectQueue[$orig] = $redirectURL; - } else { - $this->debug('Redirect detected. Invalid URL: '.$redirectURL); - } - } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { - // check for - // for AJAX sites, e.g. Blogger with its dynamic views templates. - // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification - if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); - if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); - $this->redirectQueue[$orig] = $redirectURL; - } - } - } - } - } else { - $this->debug('Error retrieving URL'); - //print_r($req_url); - //print_r($http_response_header); - //print_r($html); - - // TODO: handle error - failed to retrieve URL - } - } - } - } - } - - public function handleCurlResponse($response, $info, $request) { - $orig = $request->url_original; - $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); - $this->requests[$orig]['body'] = substr($response, $info['header_size']); - $this->requests[$orig]['method'] = $request->method; - $this->requests[$orig]['effective_url'] = $info['url']; - $this->requests[$orig]['status_code'] = (int)$info['http_code']; - if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { - $this->requests[$orig]['location'] = trim($match[1]); - } - } - - protected function headersToString(array $headers, $associative=true) { - if (!$associative) { - return implode("\n", $headers); - } else { - $str = ''; - foreach ($headers as $key => $val) { - if (is_array($val)) { - foreach ($val as 
$v) $str .= "$key: $v\n"; - } else { - $str .= "$key: $val\n"; - } - } - return rtrim($str); - } - } - - public function get($url, $remove=false, $gzdecode=true) { - $url = "$url"; - if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { - $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); - $response = $this->requests[$url]; - /* - } elseif ($this->isCached($url)) { - $this->debug("URL already fetched - in disk cache ($url)"); - $response = $this->getCached($url); - $this->requests[$url] = $response; - */ - } else { - $this->debug("Fetching URL ($url)"); - $this->fetchAll(array($url)); - if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { - $response = $this->requests[$url]; - } else { - $this->debug("Request failed"); - $response = false; - } - } - /* - if ($this->minimiseMemoryUse && $response) { - $this->cache($url); - unset($this->requests[$url]); - } - */ - if ($remove && $response) unset($this->requests[$url]); - if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { - if ($html = gzdecode($response['body'])) { - $response['body'] = $html; - } - } - return $response; - } - - public function parallelSupport() { - return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); - } - - private function headerOnlyType($headers) { - if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { - // look for full mime type (e.g. image/jpeg) or just type (e.g. 
image) - $match[1] = strtolower(trim($match[1])); - $match[2] = strtolower(trim($match[2])); - foreach (array($match[1], $match[2]) as $mime) { - if (in_array($mime, $this->headerOnlyTypes)) return true; - } - } - return false; - } - - private function possibleUnsupportedType($url) { - $path = @parse_url($url, PHP_URL_PATH); - if ($path && strpos($path, '.') !== false) { - $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); - return in_array($ext, $this->headerOnlyClues); - } - return false; - } -} - -// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 -if (!function_exists('gzdecode')) { - function gzdecode($data,&$filename='',&$error='',$maxlength=null) - { - $len = strlen($data); - if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { - $error = "Not in GZIP format."; - return null; // Not GZIP format (See RFC 1952) - } - $method = ord(substr($data,2,1)); // Compression method - $flags = ord(substr($data,3,1)); // Flags - if ($flags & 31 != $flags) { - $error = "Reserved bits not allowed."; - return null; - } - // NOTE: $mtime may be negative (PHP integer limitations) - $mtime = unpack("V", substr($data,4,4)); - $mtime = $mtime[1]; - $xfl = substr($data,8,1); - $os = substr($data,8,1); - $headerlen = 10; - $extralen = 0; - $extra = ""; - if ($flags & 4) { - // 2-byte length prefixed EXTRA data in header - if ($len - $headerlen - 2 < 8) { - return false; // invalid - } - $extralen = unpack("v",substr($data,8,2)); - $extralen = $extralen[1]; - if ($len - $headerlen - 2 - $extralen < 8) { - return false; // invalid - } - $extra = substr($data,10,$extralen); - $headerlen += 2 + $extralen; - } - $filenamelen = 0; - $filename = ""; - if ($flags & 8) { - // C-style string - if ($len - $headerlen - 1 < 8) { - return false; // invalid - } - $filenamelen = strpos(substr($data,$headerlen),chr(0)); - if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { - return false; // invalid - } - $filename = 
substr($data,$headerlen,$filenamelen); - $headerlen += $filenamelen + 1; - } - $commentlen = 0; - $comment = ""; - if ($flags & 16) { - // C-style string COMMENT data in header - if ($len - $headerlen - 1 < 8) { - return false; // invalid - } - $commentlen = strpos(substr($data,$headerlen),chr(0)); - if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { - return false; // Invalid header format - } - $comment = substr($data,$headerlen,$commentlen); - $headerlen += $commentlen + 1; - } - $headercrc = ""; - if ($flags & 2) { - // 2-bytes (lowest order) of CRC32 on header present - if ($len - $headerlen - 2 < 8) { - return false; // invalid - } - $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; - $headercrc = unpack("v", substr($data,$headerlen,2)); - $headercrc = $headercrc[1]; - if ($headercrc != $calccrc) { - $error = "Header checksum failed."; - return false; // Bad header CRC - } - $headerlen += 2; - } - // GZIP FOOTER - $datacrc = unpack("V",substr($data,-8,4)); - $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); - $isize = unpack("V",substr($data,-4)); - $isize = $isize[1]; - // decompression: - $bodylen = $len-$headerlen-8; - if ($bodylen < 1) { - // IMPLEMENTATION BUG! - return null; - } - $body = substr($data,$headerlen,$bodylen); - $data = ""; - if ($bodylen > 0) { - switch ($method) { - case 8: - // Currently the only supported compression method: - $data = gzinflate($body,$maxlength); - break; - default: - $error = "Unknown compression method."; - return false; - } - } // zero-byte body content is allowed - // Verifiy CRC32 - $crc = sprintf("%u",crc32($data)); - $crcOK = $crc == $datacrc; - $lenOK = $isize == strlen($data); - if (!$lenOK || !$crcOK) { - $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? 
'' : 'Checksum FAILED.'); - return false; - } - return $data; - } -} -?> \ No newline at end of file +userAgentDefault = self::UA_BROWSER; + $this->referer = self::REF_GOOGLE; + // set the request method + if (in_array($method, array(1,2,4))) { + $this->method = $method; + } else { + if (class_exists('HttpRequestPool')) { + $this->method = self::METHOD_REQUEST_POOL; + } elseif (function_exists('curl_multi_init')) { + $this->method = self::METHOD_CURL_MULTI; + } else { + $this->method = self::METHOD_FILE_GET_CONTENTS; + } + } + if ($this->method == self::METHOD_CURL_MULTI) { + require_once(dirname(__FILE__).'/RollingCurl.php'); + } + // create cookie jar + $this->cookieJar = new CookieJar(); + // set request options (redirect must be 0) + $this->requestOptions = array( + 'timeout' => 15, + 'connecttimeout' => 15, + 'dns_cache_timeout' => 300, + 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web + // TODO: test onprogress? 
+ ); + if (is_array($requestOptions)) { + $this->requestOptions = array_merge($this->requestOptions, $requestOptions); + } + $this->httpContext = array( + 'http' => array( + 'ignore_errors' => true, + 'timeout' => $this->requestOptions['timeout'], + 'max_redirects' => $this->requestOptions['redirect'], + 'header' => "Accept: */*\r\n" + ) + ); + } + + protected function debug($msg) { + if ($this->debug) { + $mem = round(memory_get_usage()/1024, 2); + $memPeak = round(memory_get_peak_usage()/1024, 2); + echo '* ',$msg; + if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; + echo "\n"; + ob_flush(); + flush(); + } + } + + protected function getUserAgent($url, $asArray=false) { + $host = @parse_url($url, PHP_URL_HOST); + if (strtolower(substr($host, 0, 4)) == 'www.') { + $host = substr($host, 4); + } + if ($host) { + $try = array($host); + $split = explode('.', $host); + if (count($split) > 1) { + array_shift($split); + $try[] = '.'.implode('.', $split); + } + foreach ($try as $h) { + if (isset($this->userAgentMap[$h])) { + $ua = $this->userAgentMap[$h]; + break; + } + } + } + if (!isset($ua)) $ua = $this->userAgentDefault; + if ($asArray) { + return array('User-Agent' => $ua); + } else { + return 'User-Agent: '.$ua; + } + } + + public function rewriteHashbangFragment($url) { + // return $url if there's no '#!' + if (strpos($url, '#!') === false) return $url; + // split $url and rewrite + // TODO: is SimplePie_IRI included? + $iri = new SimplePie_IRI($url); + $fragment = substr($iri->fragment, 1); // strip '!' 
+ $iri->fragment = null; + if (isset($iri->query)) { + parse_str($iri->query, $query); + } else { + $query = array(); + } + $query['_escaped_fragment_'] = (string)$fragment; + $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites + return $iri->get_iri(); + } + + public function getRedirectURLfromHTML($url, $html) { + $redirect_url = $this->getMetaRefreshURL($url, $html); + if (!$redirect_url) { + $redirect_url = $this->getUglyURL($url, $html); + } + return $redirect_url; + } + + public function getMetaRefreshURL($url, $html) { + if ($html == '') return false; + // + if (!preg_match('!]+)["\']*>!i', $html, $match)) { + return false; + } + $redirect_url = $match[1]; + if (preg_match('!^https?://!i', $redirect_url)) { + // already absolute + $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); + return $redirect_url; + } + // absolutize redirect URL + $base = new SimplePie_IRI($url); + // remove '//' in URL path (causes URLs not to resolve properly) + if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); + if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { + $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); + return $absolute; + } + return false; + } + + public function getUglyURL($url, $html) { + if ($html == '') return false; + $found = false; + foreach ($this->ajaxTriggers as $string) { + if (stripos($html, $string)) { + $found = true; + break; + } + } + if (!$found) return false; + $iri = new SimplePie_IRI($url); + if (isset($iri->query)) { + parse_str($iri->query, $query); + } else { + $query = array(); + } + $query['_escaped_fragment_'] = ''; + $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites + $ugly_url = $iri->get_iri(); + $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url); + return $ugly_url; + } + + public function 
removeFragment($url) { + $pos = strpos($url, '#'); + if ($pos === false) { + return $url; + } else { + return substr($url, 0, $pos); + } + } + + public function rewriteUrls($url) { + foreach ($this->rewriteUrls as $find => $action) { + if (strpos($url, $find) !== false) { + if (is_array($action)) { + return strtr($url, $action); + } + } + } + return $url; + } + + public function enableDebug($bool=true) { + $this->debug = (bool)$bool; + } + + public function minimiseMemoryUse($bool = true) { + $this->minimiseMemoryUse = $bool; + } + + public function setMaxParallelRequests($max) { + $this->maxParallelRequests = $max; + } + + public function validateUrl($url) { + $url = filter_var($url, FILTER_SANITIZE_URL); + $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); + // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) + if ($test === false) { + $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); + } + if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { + return $url; + } else { + return false; + } + } + + public function fetchAll(array $urls) { + $this->fetchAllOnce($urls, $isRedirect=false); + $redirects = 0; + while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { + $this->debug("Following redirects #$redirects..."); + $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); + } + } + + // fetch all URLs without following redirects + public function fetchAllOnce(array $urls, $isRedirect=false) { + if (!$isRedirect) $urls = array_unique($urls); + if (empty($urls)) return; + + ////////////////////////////////////////////////////// + // parallel (HttpRequestPool) + if ($this->method == self::METHOD_REQUEST_POOL) { + $this->debug('Starting parallel fetch (HttpRequestPool)'); + try { + while (count($urls) > 0) { + $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); + $subset = array_splice($urls, 0, 
$this->maxParallelRequests); + $pool = new HttpRequestPool(); + foreach ($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("......adding to pool"); + $req_url = $this->rewriteUrls($url); + $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; + $req_url = $this->removeFragment($req_url); + if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { + $_meth = HttpRequest::METH_HEAD; + } else { + $_meth = HttpRequest::METH_GET; + unset($this->requests[$orig]['wrongGuess']); + } + $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); + // send cookies, if we have any + if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { + $this->debug("......sending cookies: $cookies"); + $httpRequest->addHeaders(array('Cookie' => $cookies)); + } + //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); + $httpRequest->addHeaders($this->getUserAgent($req_url, true)); + // add referer for picky sites + $httpRequest->addheaders(array('Referer' => $this->referer)); + $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); + $this->requests[$orig]['original_url'] = $orig; + $pool->attach($httpRequest); + } + } + // did we get anything into the pool? 
+ if (count($pool) > 0) { + $this->debug('Sending request...'); + try { + $pool->send(); + } catch (HttpRequestPoolException $e) { + // do nothing + } + $this->debug('Received responses'); + foreach($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + $request = $this->requests[$orig]['httpRequest']; + //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); + // getResponseHeader() doesn't return status line, so, for consistency... + $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); + // check content type + // TODO: use getResponseHeader('content-type') or getResponseInfo() + if ($this->headerOnlyType($this->requests[$orig]['headers'])) { + $this->requests[$orig]['body'] = ''; + $_header_only_type = true; + $this->debug('Header only type returned'); + } else { + $this->requests[$orig]['body'] = $request->getResponseBody(); + $_header_only_type = false; + } + $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); + $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); + // is redirect? + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { + $redirectURL = $request->getResponseHeader('location'); + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. Valid URL: '.$redirectURL); + // store any cookies + $cookies = $request->getResponseHeader('set-cookie'); + if ($cookies && !is_array($cookies)) $cookies = array($cookies); + if ($cookies) $this->cookieJar->storeCookies($url, $cookies); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. 
Invalid URL: '.$redirectURL); + } + } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { + // the response content-type did not match our 'header only' types, + // but we'd issues a HEAD request because we assumed it would. So + // let's queue a proper GET request for this item... + $this->debug('Wrong guess at content-type, queing GET request'); + $this->requests[$orig]['wrongGuess'] = true; + $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; + } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { + // check for + // for AJAX sites, e.g. Blogger with its dynamic views templates. + // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification + if (isset($this->requests[$orig]['body'])) { + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + if ($redirectURL) { + $this->redirectQueue[$orig] = $redirectURL; + } + } + } + //die($url.' 
-multi- '.$request->getResponseInfo('effective_url')); + $pool->detach($request); + unset($this->requests[$orig]['httpRequest'], $request); + /* + if ($this->minimiseMemoryUse) { + if ($this->cache($url)) { + unset($this->requests[$url]); + } + } + */ + } + } + } + } catch (HttpException $e) { + $this->debug($e); + return false; + } + } + + ////////////////////////////////////////////////////////// + // parallel (curl_multi_*) + elseif ($this->method == self::METHOD_CURL_MULTI) { + $this->debug('Starting parallel fetch (curl_multi_*)'); + while (count($urls) > 0) { + $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); + $subset = array_splice($urls, 0, $this->maxParallelRequests); + $pool = new RollingCurl(array($this, 'handleCurlResponse')); + $pool->window_size = count($subset); + + foreach ($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("......adding to pool"); + $req_url = $this->rewriteUrls($url); + $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; + $req_url = $this->removeFragment($req_url); + if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { + $_meth = 'HEAD'; + } else { + $_meth = 'GET'; + unset($this->requests[$orig]['wrongGuess']); + } + $headers = array(); + //$headers[] = 'User-Agent: '.$this->userAgent; + $headers[] = $this->getUserAgent($req_url); + // add referer for picky sites + $headers[] = 'Referer: '.$this->referer; + // send cookies, if we have any + if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { + $this->debug("......sending cookies: $cookies"); + $headers[] = 'Cookie: '.$cookies; + } + $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( + CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], + CURLOPT_TIMEOUT => $this->requestOptions['timeout'] + )); + $httpRequest->set_original_url($orig); + $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); + $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? + $pool->add($httpRequest); + } + } + // did we get anything into the pool? 
+ if (count($pool) > 0) { + $this->debug('Sending request...'); + $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] + $this->debug('Received responses'); + foreach($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + // $this->requests[$orig]['headers'] + // $this->requests[$orig]['body'] + // $this->requests[$orig]['effective_url'] + // check content type + if ($this->headerOnlyType($this->requests[$orig]['headers'])) { + $this->requests[$orig]['body'] = ''; + $_header_only_type = true; + $this->debug('Header only type returned'); + } else { + $_header_only_type = false; + } + $status_code = $this->requests[$orig]['status_code']; + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { + $redirectURL = $this->requests[$orig]['location']; + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. Valid URL: '.$redirectURL); + // store any cookies + $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); + if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. Invalid URL: '.$redirectURL); + } + } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { + // the response content-type did not match our 'header only' types, + // but we'd issues a HEAD request because we assumed it would. So + // let's queue a proper GET request for this item... 
+ $this->debug('Wrong guess at content-type, queing GET request'); + $this->requests[$orig]['wrongGuess'] = true; + $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; + } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { + // check for + // for AJAX sites, e.g. Blogger with its dynamic views templates. + // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification + if (isset($this->requests[$orig]['body'])) { + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + if ($redirectURL) { + $this->redirectQueue[$orig] = $redirectURL; + } + } + } + // die($url.' -multi- '.$request->getResponseInfo('effective_url')); + unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); + } + } + } + } + + ////////////////////////////////////////////////////// + // sequential (file_get_contents) + else { + $this->debug('Starting sequential fetch (file_get_contents)'); + $this->debug('Processing set of '.count($urls)); + foreach ($urls as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("Sending request for $url"); + $this->requests[$orig]['original_url'] = $orig; + $req_url = $this->rewriteUrls($url); + $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; + $req_url = $this->removeFragment($req_url); + // send cookies, if we have any + $httpContext = $this->httpContext; + $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; + // add referer for picky sites + $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; + if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { + $this->debug("......sending cookies: $cookies"); + $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; + } + if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { + $this->debug('Received response'); + // get status code + if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { + $this->debug('Error: no status code found'); + // TODO: handle error - no status code + } else { + $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); + // check content type + if ($this->headerOnlyType($this->requests[$orig]['headers'])) { + $this->requests[$orig]['body'] = ''; + } else { + $this->requests[$orig]['body'] = $html; + } + $this->requests[$orig]['effective_url'] = $req_url; + $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; + unset($match); + // handle redirect + if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { + $this->requests[$orig]['location'] = trim($match[1]); + } + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { + $redirectURL = $this->requests[$orig]['location']; + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. 
/**
 * Response callback for the parallel (cURL) fetch path.
 *
 * Splits the raw response into headers and body using the header size
 * reported in $info and stores the pieces, together with the request method,
 * effective URL, status code and any Location header, in $this->requests
 * under the request's original URL.
 *
 * @param string $response raw response: header block followed by the body
 * @param array  $info     transfer details; the keys 'header_size', 'url' and
 *                         'http_code' are read (presumably curl_getinfo() output)
 * @param object $request  request object exposing ->url_original and ->method
 * @return void
 */
public function handleCurlResponse($response, $info, $request) {
	$orig = $request->url_original;
	$this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
	$this->requests[$orig]['body'] = substr($response, $info['header_size']);
	$this->requests[$orig]['method'] = $request->method;
	$this->requests[$orig]['effective_url'] = $info['url'];
	$this->requests[$orig]['status_code'] = (int)$info['http_code'];
	// Remember the redirect target, if the response carries one.
	if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
		$this->requests[$orig]['location'] = trim($match[1]);
	}
}

/**
 * Flatten a list or map of headers into a newline-separated string.
 *
 * @param array $headers     header lines (when $associative is false) or a
 *                           name => value map whose values may be arrays
 * @param bool  $associative whether $headers is a name => value map
 * @return string
 */
protected function headersToString(array $headers, $associative=true) {
	if (!$associative) {
		return implode("\n", $headers);
	}
	$str = '';
	foreach ($headers as $key => $val) {
		if (is_array($val)) {
			// repeated header: one line per value
			foreach ($val as $v) $str .= "$key: $v\n";
		} else {
			$str .= "$key: $val\n";
		}
	}
	return rtrim($str);
}

/**
 * Return the response for a URL, fetching it first if it is not in memory.
 *
 * @param string $url      URL to retrieve (cast to string before use)
 * @param bool   $remove   drop the in-memory copy once it has been read
 * @param bool   $gzdecode decode the body when the headers announce gzip
 * @return array|false response entry from $this->requests, or false when the
 *                     request failed
 */
public function get($url, $remove=false, $gzdecode=true) {
	$url = "$url";
	if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
		$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
		$response = $this->requests[$url];
	} else {
		// (disk cache lookup is currently disabled)
		$this->debug("Fetching URL ($url)");
		$this->fetchAll(array($url));
		if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
			$response = $this->requests[$url];
		} else {
			$this->debug("Request failed");
			$response = false;
		}
	}
	if ($remove && $response) unset($this->requests[$url]);
	// Only inspect headers when we actually have a response: indexing the
	// boolean false of a failed request raises a warning on PHP 7.4+.
	if ($gzdecode && is_array($response) && isset($response['headers'], $response['body'])
		&& stripos($response['headers'], 'Content-Encoding: gzip') !== false) {
		$decoded = gzdecode($response['body']);
		// gzdecode() yields false/null on failure; an empty string is a
		// valid decoded body and must replace the compressed bytes too.
		if (is_string($decoded)) {
			$response['body'] = $decoded;
		}
	}
	return $response;
}

/**
 * Report whether a parallel transport is available.
 *
 * @return bool true when the HttpRequestPool class or curl_multi_init() exists
 */
public function parallelSupport() {
	return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
}
/**
 * Decide whether the response's Content-Type is one of the configured
 * "header only" types.
 *
 * Both the full MIME type (e.g. image/jpeg) and the bare top-level type
 * (e.g. image) are compared against $this->headerOnlyTypes.
 *
 * @param string $headers raw response header block
 * @return bool
 */
private function headerOnlyType($headers) {
	if (!preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $found)) {
		return false;
	}
	$fullType = strtolower(trim($found[1]));
	$mainType = strtolower(trim($found[2]));
	return in_array($fullType, $this->headerOnlyTypes)
		|| in_array($mainType, $this->headerOnlyTypes);
}

/**
 * Guess from the URL's file extension whether the resource is likely to be
 * of a type listed in $this->headerOnlyClues.
 *
 * @param string $url
 * @return bool false when the URL path is empty or contains no dot
 */
private function possibleUnsupportedType($url) {
	$path = @parse_url($url, PHP_URL_PATH);
	if (!$path || strpos($path, '.') === false) {
		return false;
	}
	$extension = pathinfo($path, PATHINFO_EXTENSION);
	return in_array(strtolower(trim($extension)), $this->headerOnlyClues);
}
// gzdecode() polyfill, adapted from http://www.php.net/manual/en/function.gzdecode.php#82930
// Only defined when the runtime does not already provide gzdecode().
if (!function_exists('gzdecode')) {
	/**
	 * Decode a single gzip member (RFC 1952) and verify its CRC32/size trailer.
	 *
	 * @param string   $data      gzip-encoded bytes
	 * @param string   $filename  receives the file name stored in the header ('' if none)
	 * @param string   $error     receives a message for the failure modes that set one
	 * @param int|null $maxlength maximum number of bytes to inflate; null for no limit
	 * @return string|false|null decoded data; null when the input is not gzip,
	 *                           uses reserved flag bits or has no compressed body;
	 *                           false for every other failure
	 */
	function gzdecode($data,&$filename='',&$error='',$maxlength=null)
	{
		$len = strlen($data);
		if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
			$error = "Not in GZIP format.";
			return null; // Not GZIP format (See RFC 1952)
		}
		$method = ord(substr($data,2,1)); // Compression method
		$flags = ord(substr($data,3,1)); // Flags
		// Parentheses are required: != binds tighter than &, so without them
		// the test degenerates to "$flags & 1" (the FTEXT bit).
		if (($flags & 31) != $flags) {
			$error = "Reserved bits not allowed.";
			return null;
		}
		// Bytes 4-9 hold MTIME, XFL and OS; none of them is needed to decode.
		$headerlen = 10;
		if ($flags & 4) {
			// FEXTRA: 2-byte little-endian length (XLEN) followed by the extra data
			if ($len - $headerlen - 2 < 8) {
				return false; // invalid
			}
			$extralen = unpack("v", substr($data, $headerlen, 2));
			$extralen = $extralen[1];
			if ($len - $headerlen - 2 - $extralen < 8) {
				return false; // invalid
			}
			$headerlen += 2 + $extralen;
		}
		$filename = "";
		if ($flags & 8) {
			// FNAME: zero-terminated original file name
			if ($len - $headerlen - 1 < 8) {
				return false; // invalid
			}
			$filenamelen = strpos(substr($data,$headerlen),chr(0));
			if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
				return false; // invalid
			}
			$filename = substr($data,$headerlen,$filenamelen);
			$headerlen += $filenamelen + 1;
		}
		if ($flags & 16) {
			// FCOMMENT: zero-terminated comment, skipped
			if ($len - $headerlen - 1 < 8) {
				return false; // invalid
			}
			$commentlen = strpos(substr($data,$headerlen),chr(0));
			if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
				return false; // Invalid header format
			}
			$headerlen += $commentlen + 1;
		}
		if ($flags & 2) {
			// FHCRC: low 16 bits of the CRC32 of the header read so far
			if ($len - $headerlen - 2 < 8) {
				return false; // invalid
			}
			$calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
			$headercrc = unpack("v", substr($data,$headerlen,2));
			$headercrc = $headercrc[1];
			if ($headercrc != $calccrc) {
				$error = "Header checksum failed.";
				return false; // Bad header CRC
			}
			$headerlen += 2;
		}
		// Trailer: CRC32 and size (ISIZE) of the uncompressed data, 4 bytes each
		$datacrc = unpack("V",substr($data,-8,4));
		$datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
		$isize = unpack("V",substr($data,-4));
		$isize = $isize[1];
		// Everything between header and trailer is the compressed body
		$bodylen = $len-$headerlen-8;
		if ($bodylen < 1) {
			return null; // no compressed data at all
		}
		if ($method != 8) {
			// deflate (8) is the only supported compression method
			$error = "Unknown compression method.";
			return false;
		}
		$body = substr($data,$headerlen,$bodylen);
		$data = ($maxlength === null) ? gzinflate($body) : gzinflate($body,$maxlength);
		if ($data === false) {
			$error = "Inflate failed.";
			return false;
		}
		// Verify CRC32 and length against the trailer
		$crc = sprintf("%u",crc32($data));
		$crcOK = $crc == $datacrc;
		$lenOK = $isize == strlen($data);
		if (!$lenOK || !$crcOK) {
			$error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
			return false;
		}
		return $data;
	}
}
- $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); - if ($parser->parse()) { - $this->headers = $parser->headers; - //$this->body = $parser->body; - $this->body = $response['body']; - $this->status_code = $parser->status_code; - } - } - } - else - { - $this->error = 'invalid URL'; - $this->success = false; - } - } -} -?> \ No newline at end of file +encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); + } + $this->url = $url; + $this->useragent = $useragent; + if (preg_match('/^http(s)?:\/\//i', $url)) + { + if (!is_array($headers)) + { + $headers = array(); + } + $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; + $headers2 = array(); + foreach ($headers as $key => $value) { + $headers2[] = "$key: $value"; + } + //TODO: allow for HTTP headers + // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); + + $response = self::$agent->get($url); + + if ($response === false || !isset($response['status_code'])) { + $this->error = 'failed to fetch URL'; + $this->success = false; + } else { + // The extra lines at the end are there to satisfy SimplePie's HTTP parser. + // The class expects a full HTTP message, whereas we're giving it only + // headers - the new lines indicate the start of the body. + $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); + if ($parser->parse()) { + $this->headers = $parser->headers; + //$this->body = $parser->body; + $this->body = $response['body']; + $this->status_code = $parser->status_code; + } + } + } + else + { + $this->error = 'invalid URL'; + $this->success = false; + } + } +} \ No newline at end of file -- cgit v1.2.3