diff options
author | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-22 17:16:38 +0300 |
---|---|---|
committer | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-22 17:16:38 +0300 |
commit | 3ec62cf95ab4436923d4c665fad7aef226cbb822 (patch) | |
tree | f657024faaaf4c0b33ae27f7aea999f2b18cc8ab /inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php | |
parent | ab157bbb75ba226917145c9bf906cbf764a85cd0 (diff) | |
download | wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.tar.gz wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.tar.zst wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.zip |
update to 3.2 version of full-text-rss, issue #694
Diffstat (limited to 'inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php')
-rw-r--r-- | inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php | 1589 |
1 files changed, 810 insertions, 779 deletions
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php index e4f1b3b3..963f0c05 100644 --- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php | |||
@@ -1,779 +1,810 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Humble HTTP Agent | 3 | * Humble HTTP Agent |
4 | * | 4 | * |
5 | * This class is designed to take advantage of parallel HTTP requests | 5 | * This class is designed to take advantage of parallel HTTP requests |
6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. | 6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. |
7 | * For environments which do not have these options, it reverts to standard sequential | 7 | * For environments which do not have these options, it reverts to standard sequential |
8 | * requests (using file_get_contents()) | 8 | * requests (using file_get_contents()) |
9 | * | 9 | * |
10 | * @version 1.1 | 10 | * @version 1.4 |
11 | * @date 2012-08-20 | 11 | * @date 2013-05-10 |
12 | * @see http://php.net/HttpRequestPool | 12 | * @see http://php.net/HttpRequestPool |
13 | * @author Keyvan Minoukadeh | 13 | * @author Keyvan Minoukadeh |
14 | * @copyright 2011-2012 Keyvan Minoukadeh | 14 | * @copyright 2011-2013 Keyvan Minoukadeh |
15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
16 | */ | 16 | */ |
17 | 17 | ||
18 | class HumbleHttpAgent | 18 | class HumbleHttpAgent |
19 | { | 19 | { |
20 | const METHOD_REQUEST_POOL = 1; | 20 | const METHOD_REQUEST_POOL = 1; |
21 | const METHOD_CURL_MULTI = 2; | 21 | const METHOD_CURL_MULTI = 2; |
22 | const METHOD_FILE_GET_CONTENTS = 4; | 22 | const METHOD_FILE_GET_CONTENTS = 4; |
23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; | 23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; |
24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; | 24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; |
25 | const UA_PHP = 'PHP/5.2'; | 25 | const UA_PHP = 'PHP/5.4'; |
26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; | 26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; |
27 | 27 | ||
28 | protected $requests = array(); | 28 | protected $requests = array(); |
29 | protected $redirectQueue = array(); | 29 | protected $redirectQueue = array(); |
30 | protected $requestOptions; | 30 | protected $requestOptions; |
31 | protected $maxParallelRequests = 5; | 31 | protected $maxParallelRequests = 5; |
32 | protected $cache = null; //TODO | 32 | protected $cache = null; //TODO |
33 | protected $httpContext; | 33 | protected $httpContext; |
34 | protected $minimiseMemoryUse = false; //TODO | 34 | protected $minimiseMemoryUse = false; //TODO |
35 | protected $method; | 35 | protected $method; |
36 | protected $cookieJar; | 36 | protected $cookieJar; |
37 | public $debug = false; | 37 | public $debug = false; |
38 | public $debugVerbose = false; | 38 | public $debugVerbose = false; |
39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html | 39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html |
40 | public $maxRedirects = 5; | 40 | public $maxRedirects = 5; |
41 | public $userAgentMap = array(); | 41 | public $userAgentMap = array(); |
42 | public $rewriteUrls = array(); | 42 | public $rewriteUrls = array(); |
43 | public $userAgentDefault; | 43 | public $userAgentDefault; |
44 | public $referer; | 44 | public $referer; |
45 | //public $userAgent = 'Mozilla/5.0'; | 45 | //public $userAgent = 'Mozilla/5.0'; |
46 | 46 | ||
47 | // Prevent certain file/mime types | 47 | // Prevent certain file/mime types |
48 | // HTTP responses which match these content types will | 48 | // HTTP responses which match these content types will |
49 | // be returned without body. | 49 | // be returned without body. |
50 | public $headerOnlyTypes = array(); | 50 | public $headerOnlyTypes = array(); |
51 | // URLs ending with one of these extensions will | 51 | // URLs ending with one of these extensions will |
52 | // prompt Humble HTTP Agent to send a HEAD request first | 52 | // prompt Humble HTTP Agent to send a HEAD request first |
53 | // to see if returned content type matches $headerOnlyTypes. | 53 | // to see if returned content type matches $headerOnlyTypes. |
54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); | 54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); |
55 | // AJAX triggers to search for. | 55 | // AJAX triggers to search for. |
56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); | 57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); |
58 | 58 | ||
59 | //TODO: set max file size | 59 | //TODO: set max file size |
60 | //TODO: normalise headers | 60 | //TODO: normalise headers |
61 | 61 | ||
62 | function __construct($requestOptions=null, $method=null) { | 62 | function __construct($requestOptions=null, $method=null) { |
63 | $this->userAgentDefault = self::UA_BROWSER; | 63 | $this->userAgentDefault = self::UA_BROWSER; |
64 | $this->referer = self::REF_GOOGLE; | 64 | $this->referer = self::REF_GOOGLE; |
65 | // set the request method | 65 | // set the request method |
66 | if (in_array($method, array(1,2,4))) { | 66 | if (in_array($method, array(1,2,4))) { |
67 | $this->method = $method; | 67 | $this->method = $method; |
68 | } else { | 68 | } else { |
69 | if (class_exists('HttpRequestPool')) { | 69 | if (class_exists('HttpRequestPool')) { |
70 | $this->method = self::METHOD_REQUEST_POOL; | 70 | $this->method = self::METHOD_REQUEST_POOL; |
71 | } elseif (function_exists('curl_multi_init')) { | 71 | } elseif (function_exists('curl_multi_init')) { |
72 | $this->method = self::METHOD_CURL_MULTI; | 72 | $this->method = self::METHOD_CURL_MULTI; |
73 | } else { | 73 | } else { |
74 | $this->method = self::METHOD_FILE_GET_CONTENTS; | 74 | $this->method = self::METHOD_FILE_GET_CONTENTS; |
75 | } | 75 | } |
76 | } | 76 | } |
77 | if ($this->method == self::METHOD_CURL_MULTI) { | 77 | if ($this->method == self::METHOD_CURL_MULTI) { |
78 | require_once(dirname(__FILE__).'/RollingCurl.php'); | 78 | require_once(dirname(__FILE__).'/RollingCurl.php'); |
79 | } | 79 | } |
80 | // create cookie jar | 80 | // create cookie jar |
81 | $this->cookieJar = new CookieJar(); | 81 | $this->cookieJar = new CookieJar(); |
82 | // set request options (redirect must be 0) | 82 | // set request options (redirect must be 0) |
83 | $this->requestOptions = array( | 83 | $this->requestOptions = array( |
84 | 'timeout' => 15, | 84 | 'timeout' => 15, |
85 | 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web | 85 | 'connecttimeout' => 15, |
86 | // TODO: test onprogress? | 86 | 'dns_cache_timeout' => 300, |
87 | ); | 87 | 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web |
88 | if (is_array($requestOptions)) { | 88 | // TODO: test onprogress? |
89 | $this->requestOptions = array_merge($this->requestOptions, $requestOptions); | 89 | ); |
90 | } | 90 | if (is_array($requestOptions)) { |
91 | $this->httpContext = array( | 91 | $this->requestOptions = array_merge($this->requestOptions, $requestOptions); |
92 | 'http' => array( | 92 | } |
93 | 'ignore_errors' => true, | 93 | $this->httpContext = array( |
94 | 'timeout' => $this->requestOptions['timeout'], | 94 | 'http' => array( |
95 | 'max_redirects' => $this->requestOptions['redirect'], | 95 | 'ignore_errors' => true, |
96 | 'header' => "Accept: */*\r\n" | 96 | 'timeout' => $this->requestOptions['timeout'], |
97 | ) | 97 | 'max_redirects' => $this->requestOptions['redirect'], |
98 | ); | 98 | 'header' => "Accept: */*\r\n" |
99 | } | 99 | ) |
100 | 100 | ); | |
101 | protected function debug($msg) { | 101 | } |
102 | if ($this->debug) { | 102 | |
103 | $mem = round(memory_get_usage()/1024, 2); | 103 | protected function debug($msg) { |
104 | $memPeak = round(memory_get_peak_usage()/1024, 2); | 104 | if ($this->debug) { |
105 | echo '* ',$msg; | 105 | $mem = round(memory_get_usage()/1024, 2); |
106 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; | 106 | $memPeak = round(memory_get_peak_usage()/1024, 2); |
107 | echo "\n"; | 107 | echo '* ',$msg; |
108 | ob_flush(); | 108 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; |
109 | flush(); | 109 | echo "\n"; |
110 | } | 110 | ob_flush(); |
111 | } | 111 | flush(); |
112 | 112 | } | |
113 | protected function getUserAgent($url, $asArray=false) { | 113 | } |
114 | $host = @parse_url($url, PHP_URL_HOST); | 114 | |
115 | if (strtolower(substr($host, 0, 4)) == 'www.') { | 115 | protected function getUserAgent($url, $asArray=false) { |
116 | $host = substr($host, 4); | 116 | $host = @parse_url($url, PHP_URL_HOST); |
117 | } | 117 | if (strtolower(substr($host, 0, 4)) == 'www.') { |
118 | if ($host) { | 118 | $host = substr($host, 4); |
119 | $try = array($host); | 119 | } |
120 | $split = explode('.', $host); | 120 | if ($host) { |
121 | if (count($split) > 1) { | 121 | $try = array($host); |
122 | array_shift($split); | 122 | $split = explode('.', $host); |
123 | $try[] = '.'.implode('.', $split); | 123 | if (count($split) > 1) { |
124 | } | 124 | array_shift($split); |
125 | foreach ($try as $h) { | 125 | $try[] = '.'.implode('.', $split); |
126 | if (isset($this->userAgentMap[$h])) { | 126 | } |
127 | $ua = $this->userAgentMap[$h]; | 127 | foreach ($try as $h) { |
128 | break; | 128 | if (isset($this->userAgentMap[$h])) { |
129 | } | 129 | $ua = $this->userAgentMap[$h]; |
130 | } | 130 | break; |
131 | } | 131 | } |
132 | if (!isset($ua)) $ua = $this->userAgentDefault; | 132 | } |
133 | if ($asArray) { | 133 | } |
134 | return array('User-Agent' => $ua); | 134 | if (!isset($ua)) $ua = $this->userAgentDefault; |
135 | } else { | 135 | if ($asArray) { |
136 | return 'User-Agent: '.$ua; | 136 | return array('User-Agent' => $ua); |
137 | } | 137 | } else { |
138 | } | 138 | return 'User-Agent: '.$ua; |
139 | 139 | } | |
140 | public function rewriteHashbangFragment($url) { | 140 | } |
141 | // return $url if there's no '#!' | 141 | |
142 | if (strpos($url, '#!') === false) return $url; | 142 | public function rewriteHashbangFragment($url) { |
143 | // split $url and rewrite | 143 | // return $url if there's no '#!' |
144 | // TODO: is SimplePie_IRI included? | 144 | if (strpos($url, '#!') === false) return $url; |
145 | $iri = new SimplePie_IRI($url); | 145 | // split $url and rewrite |
146 | $fragment = substr($iri->fragment, 1); // strip '!' | 146 | // TODO: is SimplePie_IRI included? |
147 | $iri->fragment = null; | 147 | $iri = new SimplePie_IRI($url); |
148 | if (isset($iri->query)) { | 148 | $fragment = substr($iri->fragment, 1); // strip '!' |
149 | parse_str($iri->query, $query); | 149 | $iri->fragment = null; |
150 | } else { | 150 | if (isset($iri->query)) { |
151 | $query = array(); | 151 | parse_str($iri->query, $query); |
152 | } | 152 | } else { |
153 | $query['_escaped_fragment_'] = (string)$fragment; | 153 | $query = array(); |
154 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites | 154 | } |
155 | return $iri->get_iri(); | 155 | $query['_escaped_fragment_'] = (string)$fragment; |
156 | } | 156 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites |
157 | 157 | return $iri->get_iri(); | |
158 | public function getUglyURL($url, $html) { | 158 | } |
159 | if ($html == '') return false; | 159 | |
160 | $found = false; | 160 | public function getRedirectURLfromHTML($url, $html) { |
161 | foreach ($this->ajaxTriggers as $string) { | 161 | $redirect_url = $this->getMetaRefreshURL($url, $html); |
162 | if (stripos($html, $string)) { | 162 | if (!$redirect_url) { |
163 | $found = true; | 163 | $redirect_url = $this->getUglyURL($url, $html); |
164 | break; | 164 | } |
165 | } | 165 | return $redirect_url; |
166 | } | 166 | } |
167 | if (!$found) return false; | 167 | |
168 | $iri = new SimplePie_IRI($url); | 168 | public function getMetaRefreshURL($url, $html) { |
169 | if (isset($iri->query)) { | 169 | if ($html == '') return false; |
170 | parse_str($iri->query, $query); | 170 | // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513"> |
171 | } else { | 171 | if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) { |
172 | $query = array(); | 172 | return false; |
173 | } | 173 | } |
174 | $query['_escaped_fragment_'] = ''; | 174 | $redirect_url = $match[1]; |
175 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites | 175 | if (preg_match('!^https?://!i', $redirect_url)) { |
176 | return $iri->get_iri(); | 176 | // already absolute |
177 | } | 177 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); |
178 | 178 | return $redirect_url; | |
179 | public function removeFragment($url) { | 179 | } |
180 | $pos = strpos($url, '#'); | 180 | // absolutize redirect URL |
181 | if ($pos === false) { | 181 | $base = new SimplePie_IRI($url); |
182 | return $url; | 182 | // remove '//' in URL path (causes URLs not to resolve properly) |
183 | } else { | 183 | if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); |
184 | return substr($url, 0, $pos); | 184 | if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { |
185 | } | 185 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); |
186 | } | 186 | return $absolute; |
187 | 187 | } | |
188 | public function rewriteUrls($url) { | 188 | return false; |
189 | foreach ($this->rewriteUrls as $find => $action) { | 189 | } |
190 | if (strpos($url, $find) !== false) { | 190 | |
191 | if (is_array($action)) { | 191 | public function getUglyURL($url, $html) { |
192 | return strtr($url, $action); | 192 | if ($html == '') return false; |
193 | } | 193 | $found = false; |
194 | } | 194 | foreach ($this->ajaxTriggers as $string) { |
195 | } | 195 | if (stripos($html, $string)) { |
196 | return $url; | 196 | $found = true; |
197 | } | 197 | break; |
198 | 198 | } | |
199 | public function enableDebug($bool=true) { | 199 | } |
200 | $this->debug = (bool)$bool; | 200 | if (!$found) return false; |
201 | } | 201 | $iri = new SimplePie_IRI($url); |
202 | 202 | if (isset($iri->query)) { | |
203 | public function minimiseMemoryUse($bool = true) { | 203 | parse_str($iri->query, $query); |
204 | $this->minimiseMemoryUse = $bool; | 204 | } else { |
205 | } | 205 | $query = array(); |
206 | 206 | } | |
207 | public function setMaxParallelRequests($max) { | 207 | $query['_escaped_fragment_'] = ''; |
208 | $this->maxParallelRequests = $max; | 208 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites |
209 | } | 209 | $ugly_url = $iri->get_iri(); |
210 | 210 | $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url); | |
211 | public function validateUrl($url) { | 211 | return $ugly_url; |
212 | $url = filter_var($url, FILTER_SANITIZE_URL); | 212 | } |
213 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 213 | |
214 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) | 214 | public function removeFragment($url) { |
215 | if ($test === false) { | 215 | $pos = strpos($url, '#'); |
216 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 216 | if ($pos === false) { |
217 | } | 217 | return $url; |
218 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { | 218 | } else { |
219 | return $url; | 219 | return substr($url, 0, $pos); |
220 | } else { | 220 | } |
221 | return false; | 221 | } |
222 | } | 222 | |
223 | } | 223 | public function rewriteUrls($url) { |
224 | 224 | foreach ($this->rewriteUrls as $find => $action) { | |
225 | public function fetchAll(array $urls) { | 225 | if (strpos($url, $find) !== false) { |
226 | $this->fetchAllOnce($urls, $isRedirect=false); | 226 | if (is_array($action)) { |
227 | $redirects = 0; | 227 | return strtr($url, $action); |
228 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { | 228 | } |
229 | $this->debug("Following redirects #$redirects..."); | 229 | } |
230 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); | 230 | } |
231 | } | 231 | return $url; |
232 | } | 232 | } |
233 | 233 | ||
234 | // fetch all URLs without following redirects | 234 | public function enableDebug($bool=true) { |
235 | public function fetchAllOnce(array $urls, $isRedirect=false) { | 235 | $this->debug = (bool)$bool; |
236 | if (!$isRedirect) $urls = array_unique($urls); | 236 | } |
237 | if (empty($urls)) return; | 237 | |
238 | 238 | public function minimiseMemoryUse($bool = true) { | |
239 | ////////////////////////////////////////////////////// | 239 | $this->minimiseMemoryUse = $bool; |
240 | // parallel (HttpRequestPool) | 240 | } |
241 | if ($this->method == self::METHOD_REQUEST_POOL) { | 241 | |
242 | $this->debug('Starting parallel fetch (HttpRequestPool)'); | 242 | public function setMaxParallelRequests($max) { |
243 | try { | 243 | $this->maxParallelRequests = $max; |
244 | while (count($urls) > 0) { | 244 | } |
245 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 245 | |
246 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 246 | public function validateUrl($url) { |
247 | $pool = new HttpRequestPool(); | 247 | $url = filter_var($url, FILTER_SANITIZE_URL); |
248 | foreach ($subset as $orig => $url) { | 248 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
249 | if (!$isRedirect) $orig = $url; | 249 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) |
250 | unset($this->redirectQueue[$orig]); | 250 | if ($test === false) { |
251 | $this->debug("...$url"); | 251 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
252 | if (!$isRedirect && isset($this->requests[$url])) { | 252 | } |
253 | $this->debug("......in memory"); | 253 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { |
254 | /* | 254 | return $url; |
255 | } elseif ($this->isCached($url)) { | 255 | } else { |
256 | $this->debug("......is cached"); | 256 | return false; |
257 | if (!$this->minimiseMemoryUse) { | 257 | } |
258 | $this->requests[$url] = $this->getCached($url); | 258 | } |
259 | } | 259 | |
260 | */ | 260 | public function fetchAll(array $urls) { |
261 | } else { | 261 | $this->fetchAllOnce($urls, $isRedirect=false); |
262 | $this->debug("......adding to pool"); | 262 | $redirects = 0; |
263 | $req_url = $this->rewriteUrls($url); | 263 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { |
264 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 264 | $this->debug("Following redirects #$redirects..."); |
265 | $req_url = $this->removeFragment($req_url); | 265 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); |
266 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 266 | } |
267 | $_meth = HttpRequest::METH_HEAD; | 267 | } |
268 | } else { | 268 | |
269 | $_meth = HttpRequest::METH_GET; | 269 | // fetch all URLs without following redirects |
270 | unset($this->requests[$orig]['wrongGuess']); | 270 | public function fetchAllOnce(array $urls, $isRedirect=false) { |
271 | } | 271 | if (!$isRedirect) $urls = array_unique($urls); |
272 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); | 272 | if (empty($urls)) return; |
273 | // send cookies, if we have any | 273 | |
274 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 274 | ////////////////////////////////////////////////////// |
275 | $this->debug("......sending cookies: $cookies"); | 275 | // parallel (HttpRequestPool) |
276 | $httpRequest->addHeaders(array('Cookie' => $cookies)); | 276 | if ($this->method == self::METHOD_REQUEST_POOL) { |
277 | } | 277 | $this->debug('Starting parallel fetch (HttpRequestPool)'); |
278 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); | 278 | try { |
279 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); | 279 | while (count($urls) > 0) { |
280 | // add referer for picky sites | 280 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
281 | $httpRequest->addheaders(array('Referer' => $this->referer)); | 281 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
282 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 282 | $pool = new HttpRequestPool(); |
283 | $this->requests[$orig]['original_url'] = $orig; | 283 | foreach ($subset as $orig => $url) { |
284 | $pool->attach($httpRequest); | 284 | if (!$isRedirect) $orig = $url; |
285 | } | 285 | unset($this->redirectQueue[$orig]); |
286 | } | 286 | $this->debug("...$url"); |
287 | // did we get anything into the pool? | 287 | if (!$isRedirect && isset($this->requests[$url])) { |
288 | if (count($pool) > 0) { | 288 | $this->debug("......in memory"); |
289 | $this->debug('Sending request...'); | 289 | /* |
290 | try { | 290 | } elseif ($this->isCached($url)) { |
291 | $pool->send(); | 291 | $this->debug("......is cached"); |
292 | } catch (HttpRequestPoolException $e) { | 292 | if (!$this->minimiseMemoryUse) { |
293 | // do nothing | 293 | $this->requests[$url] = $this->getCached($url); |
294 | } | 294 | } |
295 | $this->debug('Received responses'); | 295 | */ |
296 | foreach($subset as $orig => $url) { | 296 | } else { |
297 | if (!$isRedirect) $orig = $url; | 297 | $this->debug("......adding to pool"); |
298 | $request = $this->requests[$orig]['httpRequest']; | 298 | $req_url = $this->rewriteUrls($url); |
299 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); | 299 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
300 | // getResponseHeader() doesn't return status line, so, for consistency... | 300 | $req_url = $this->removeFragment($req_url); |
301 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); | 301 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
302 | // check content type | 302 | $_meth = HttpRequest::METH_HEAD; |
303 | // TODO: use getResponseHeader('content-type') or getResponseInfo() | 303 | } else { |
304 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 304 | $_meth = HttpRequest::METH_GET; |
305 | $this->requests[$orig]['body'] = ''; | 305 | unset($this->requests[$orig]['wrongGuess']); |
306 | $_header_only_type = true; | 306 | } |
307 | $this->debug('Header only type returned'); | 307 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); |
308 | } else { | 308 | // send cookies, if we have any |
309 | $this->requests[$orig]['body'] = $request->getResponseBody(); | 309 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
310 | $_header_only_type = false; | 310 | $this->debug("......sending cookies: $cookies"); |
311 | } | 311 | $httpRequest->addHeaders(array('Cookie' => $cookies)); |
312 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); | 312 | } |
313 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); | 313 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); |
314 | // is redirect? | 314 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); |
315 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { | 315 | // add referer for picky sites |
316 | $redirectURL = $request->getResponseHeader('location'); | 316 | $httpRequest->addheaders(array('Referer' => $this->referer)); |
317 | if (!preg_match('!^https?://!i', $redirectURL)) { | 317 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
318 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 318 | $this->requests[$orig]['original_url'] = $orig; |
319 | } | 319 | $pool->attach($httpRequest); |
320 | if ($this->validateURL($redirectURL)) { | 320 | } |
321 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 321 | } |
322 | // store any cookies | 322 | // did we get anything into the pool? |
323 | $cookies = $request->getResponseHeader('set-cookie'); | 323 | if (count($pool) > 0) { |
324 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); | 324 | $this->debug('Sending request...'); |
325 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); | 325 | try { |
326 | $this->redirectQueue[$orig] = $redirectURL; | 326 | $pool->send(); |
327 | } else { | 327 | } catch (HttpRequestPoolException $e) { |
328 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 328 | // do nothing |
329 | } | 329 | } |
330 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { | 330 | $this->debug('Received responses'); |
331 | // the response content-type did not match our 'header only' types, | 331 | foreach($subset as $orig => $url) { |
332 | // but we'd issues a HEAD request because we assumed it would. So | 332 | if (!$isRedirect) $orig = $url; |
333 | // let's queue a proper GET request for this item... | 333 | $request = $this->requests[$orig]['httpRequest']; |
334 | $this->debug('Wrong guess at content-type, queing GET request'); | 334 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); |
335 | $this->requests[$orig]['wrongGuess'] = true; | 335 | // getResponseHeader() doesn't return status line, so, for consistency... |
336 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 336 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); |
337 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 337 | // check content type |
338 | // check for <meta name='fragment' content='!'/> | 338 | // TODO: use getResponseHeader('content-type') or getResponseInfo() |
339 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 339 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
340 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 340 | $this->requests[$orig]['body'] = ''; |
341 | if (isset($this->requests[$orig]['body'])) { | 341 | $_header_only_type = true; |
342 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 342 | $this->debug('Header only type returned'); |
343 | if ($redirectURL) { | 343 | } else { |
344 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 344 | $this->requests[$orig]['body'] = $request->getResponseBody(); |
345 | $this->redirectQueue[$orig] = $redirectURL; | 345 | $_header_only_type = false; |
346 | } | 346 | } |
347 | } | 347 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); |
348 | } | 348 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); |
349 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 349 | // is redirect? |
350 | $pool->detach($request); | 350 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { |
351 | unset($this->requests[$orig]['httpRequest'], $request); | 351 | $redirectURL = $request->getResponseHeader('location'); |
352 | /* | 352 | if (!preg_match('!^https?://!i', $redirectURL)) { |
353 | if ($this->minimiseMemoryUse) { | 353 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
354 | if ($this->cache($url)) { | 354 | } |
355 | unset($this->requests[$url]); | 355 | if ($this->validateURL($redirectURL)) { |
356 | } | 356 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
357 | } | 357 | // store any cookies |
358 | */ | 358 | $cookies = $request->getResponseHeader('set-cookie'); |
359 | } | 359 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); |
360 | } | 360 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); |
361 | } | 361 | $this->redirectQueue[$orig] = $redirectURL; |
362 | } catch (HttpException $e) { | 362 | } else { |
363 | $this->debug($e); | 363 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
364 | return false; | 364 | } |
365 | } | 365 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { |
366 | } | 366 | // the response content-type did not match our 'header only' types, |
367 | 367 | // but we'd issues a HEAD request because we assumed it would. So | |
368 | ////////////////////////////////////////////////////////// | 368 | // let's queue a proper GET request for this item... |
369 | // parallel (curl_multi_*) | 369 | $this->debug('Wrong guess at content-type, queing GET request'); |
370 | elseif ($this->method == self::METHOD_CURL_MULTI) { | 370 | $this->requests[$orig]['wrongGuess'] = true; |
371 | $this->debug('Starting parallel fetch (curl_multi_*)'); | 371 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
372 | while (count($urls) > 0) { | 372 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
373 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 373 | // check for <meta name='fragment' content='!'/> |
374 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 374 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
375 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); | 375 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
376 | $pool->window_size = count($subset); | 376 | if (isset($this->requests[$orig]['body'])) { |
377 | 377 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | |
378 | foreach ($subset as $orig => $url) { | 378 | if ($redirectURL) { |
379 | if (!$isRedirect) $orig = $url; | 379 | $this->redirectQueue[$orig] = $redirectURL; |
380 | unset($this->redirectQueue[$orig]); | 380 | } |
381 | $this->debug("...$url"); | 381 | } |
382 | if (!$isRedirect && isset($this->requests[$url])) { | 382 | } |
383 | $this->debug("......in memory"); | 383 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
384 | /* | 384 | $pool->detach($request); |
385 | } elseif ($this->isCached($url)) { | 385 | unset($this->requests[$orig]['httpRequest'], $request); |
386 | $this->debug("......is cached"); | 386 | /* |
387 | if (!$this->minimiseMemoryUse) { | 387 | if ($this->minimiseMemoryUse) { |
388 | $this->requests[$url] = $this->getCached($url); | 388 | if ($this->cache($url)) { |
389 | } | 389 | unset($this->requests[$url]); |
390 | */ | 390 | } |
391 | } else { | 391 | } |
392 | $this->debug("......adding to pool"); | 392 | */ |
393 | $req_url = $this->rewriteUrls($url); | 393 | } |
394 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 394 | } |
395 | $req_url = $this->removeFragment($req_url); | 395 | } |
396 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 396 | } catch (HttpException $e) { |
397 | $_meth = 'HEAD'; | 397 | $this->debug($e); |
398 | } else { | 398 | return false; |
399 | $_meth = 'GET'; | 399 | } |
400 | unset($this->requests[$orig]['wrongGuess']); | 400 | } |
401 | } | 401 | |
402 | $headers = array(); | 402 | ////////////////////////////////////////////////////////// |
403 | //$headers[] = 'User-Agent: '.$this->userAgent; | 403 | // parallel (curl_multi_*) |
404 | $headers[] = $this->getUserAgent($req_url); | 404 | elseif ($this->method == self::METHOD_CURL_MULTI) { |
405 | // add referer for picky sites | 405 | $this->debug('Starting parallel fetch (curl_multi_*)'); |
406 | $headers[] = 'Referer: '.$this->referer; | 406 | while (count($urls) > 0) { |
407 | // send cookies, if we have any | 407 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
408 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 408 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
409 | $this->debug("......sending cookies: $cookies"); | 409 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); |
410 | $headers[] = 'Cookie: '.$cookies; | 410 | $pool->window_size = count($subset); |
411 | } | 411 | |
412 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( | 412 | foreach ($subset as $orig => $url) { |
413 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], | 413 | if (!$isRedirect) $orig = $url; |
414 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] | 414 | unset($this->redirectQueue[$orig]); |
415 | )); | 415 | $this->debug("...$url"); |
416 | $httpRequest->set_original_url($orig); | 416 | if (!$isRedirect && isset($this->requests[$url])) { |
417 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 417 | $this->debug("......in memory"); |
418 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? | 418 | /* |
419 | $pool->add($httpRequest); | 419 | } elseif ($this->isCached($url)) { |
420 | } | 420 | $this->debug("......is cached"); |
421 | } | 421 | if (!$this->minimiseMemoryUse) { |
422 | // did we get anything into the pool? | 422 | $this->requests[$url] = $this->getCached($url); |
423 | if (count($pool) > 0) { | 423 | } |
424 | $this->debug('Sending request...'); | 424 | */ |
425 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] | 425 | } else { |
426 | $this->debug('Received responses'); | 426 | $this->debug("......adding to pool"); |
427 | foreach($subset as $orig => $url) { | 427 | $req_url = $this->rewriteUrls($url); |
428 | if (!$isRedirect) $orig = $url; | 428 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
429 | // $this->requests[$orig]['headers'] | 429 | $req_url = $this->removeFragment($req_url); |
430 | // $this->requests[$orig]['body'] | 430 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
431 | // $this->requests[$orig]['effective_url'] | 431 | $_meth = 'HEAD'; |
432 | // check content type | 432 | } else { |
433 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 433 | $_meth = 'GET'; |
434 | $this->requests[$orig]['body'] = ''; | 434 | unset($this->requests[$orig]['wrongGuess']); |
435 | $_header_only_type = true; | 435 | } |
436 | $this->debug('Header only type returned'); | 436 | $headers = array(); |
437 | } else { | 437 | //$headers[] = 'User-Agent: '.$this->userAgent; |
438 | $_header_only_type = false; | 438 | $headers[] = $this->getUserAgent($req_url); |
439 | } | 439 | // add referer for picky sites |
440 | $status_code = $this->requests[$orig]['status_code']; | 440 | $headers[] = 'Referer: '.$this->referer; |
441 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 441 | // send cookies, if we have any |
442 | $redirectURL = $this->requests[$orig]['location']; | 442 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
443 | if (!preg_match('!^https?://!i', $redirectURL)) { | 443 | $this->debug("......sending cookies: $cookies"); |
444 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 444 | $headers[] = 'Cookie: '.$cookies; |
445 | } | 445 | } |
446 | if ($this->validateURL($redirectURL)) { | 446 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( |
447 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 447 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], |
448 | // store any cookies | 448 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] |
449 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 449 | )); |
450 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 450 | $httpRequest->set_original_url($orig); |
451 | $this->redirectQueue[$orig] = $redirectURL; | 451 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
452 | } else { | 452 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? |
453 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 453 | $pool->add($httpRequest); |
454 | } | 454 | } |
455 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { | 455 | } |
456 | // the response content-type did not match our 'header only' types, | 456 | // did we get anything into the pool? |
457 | // but we'd issues a HEAD request because we assumed it would. So | 457 | if (count($pool) > 0) { |
458 | // let's queue a proper GET request for this item... | 458 | $this->debug('Sending request...'); |
459 | $this->debug('Wrong guess at content-type, queing GET request'); | 459 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] |
460 | $this->requests[$orig]['wrongGuess'] = true; | 460 | $this->debug('Received responses'); |
461 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 461 | foreach($subset as $orig => $url) { |
462 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 462 | if (!$isRedirect) $orig = $url; |
463 | // check for <meta name='fragment' content='!'/> | 463 | // $this->requests[$orig]['headers'] |
464 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 464 | // $this->requests[$orig]['body'] |
465 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 465 | // $this->requests[$orig]['effective_url'] |
466 | if (isset($this->requests[$orig]['body'])) { | 466 | // check content type |
467 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 467 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
468 | if ($redirectURL) { | 468 | $this->requests[$orig]['body'] = ''; |
469 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 469 | $_header_only_type = true; |
470 | $this->redirectQueue[$orig] = $redirectURL; | 470 | $this->debug('Header only type returned'); |
471 | } | 471 | } else { |
472 | } | 472 | $_header_only_type = false; |
473 | } | 473 | } |
474 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 474 | $status_code = $this->requests[$orig]['status_code']; |
475 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); | 475 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
476 | } | 476 | $redirectURL = $this->requests[$orig]['location']; |
477 | } | 477 | if (!preg_match('!^https?://!i', $redirectURL)) { |
478 | } | 478 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
479 | } | 479 | } |
480 | 480 | if ($this->validateURL($redirectURL)) { | |
481 | ////////////////////////////////////////////////////// | 481 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
482 | // sequential (file_get_contents) | 482 | // store any cookies |
483 | else { | 483 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
484 | $this->debug('Starting sequential fetch (file_get_contents)'); | 484 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
485 | $this->debug('Processing set of '.count($urls)); | 485 | $this->redirectQueue[$orig] = $redirectURL; |
486 | foreach ($urls as $orig => $url) { | 486 | } else { |
487 | if (!$isRedirect) $orig = $url; | 487 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
488 | unset($this->redirectQueue[$orig]); | 488 | } |
489 | $this->debug("...$url"); | 489 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { |
490 | if (!$isRedirect && isset($this->requests[$url])) { | 490 | // the response content-type did not match our 'header only' types, |
491 | $this->debug("......in memory"); | 491 | // but we'd issues a HEAD request because we assumed it would. So |
492 | /* | 492 | // let's queue a proper GET request for this item... |
493 | } elseif ($this->isCached($url)) { | 493 | $this->debug('Wrong guess at content-type, queing GET request'); |
494 | $this->debug("......is cached"); | 494 | $this->requests[$orig]['wrongGuess'] = true; |
495 | if (!$this->minimiseMemoryUse) { | 495 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
496 | $this->requests[$url] = $this->getCached($url); | 496 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
497 | } | 497 | // check for <meta name='fragment' content='!'/> |
498 | */ | 498 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
499 | } else { | 499 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
500 | $this->debug("Sending request for $url"); | 500 | if (isset($this->requests[$orig]['body'])) { |
501 | $this->requests[$orig]['original_url'] = $orig; | 501 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
502 | $req_url = $this->rewriteUrls($url); | 502 | if ($redirectURL) { |
503 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 503 | $this->redirectQueue[$orig] = $redirectURL; |
504 | $req_url = $this->removeFragment($req_url); | 504 | } |
505 | // send cookies, if we have any | 505 | } |
506 | $httpContext = $this->httpContext; | 506 | } |
507 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; | 507 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
508 | // add referer for picky sites | 508 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); |
509 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; | 509 | } |
510 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 510 | } |
511 | $this->debug("......sending cookies: $cookies"); | 511 | } |
512 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; | 512 | } |
513 | } | 513 | |
514 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { | 514 | ////////////////////////////////////////////////////// |
515 | $this->debug('Received response'); | 515 | // sequential (file_get_contents) |
516 | // get status code | 516 | else { |
517 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { | 517 | $this->debug('Starting sequential fetch (file_get_contents)'); |
518 | $this->debug('Error: no status code found'); | 518 | $this->debug('Processing set of '.count($urls)); |
519 | // TODO: handle error - no status code | 519 | foreach ($urls as $orig => $url) { |
520 | } else { | 520 | if (!$isRedirect) $orig = $url; |
521 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); | 521 | unset($this->redirectQueue[$orig]); |
522 | // check content type | 522 | $this->debug("...$url"); |
523 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 523 | if (!$isRedirect && isset($this->requests[$url])) { |
524 | $this->requests[$orig]['body'] = ''; | 524 | $this->debug("......in memory"); |
525 | } else { | 525 | /* |
526 | $this->requests[$orig]['body'] = $html; | 526 | } elseif ($this->isCached($url)) { |
527 | } | 527 | $this->debug("......is cached"); |
528 | $this->requests[$orig]['effective_url'] = $req_url; | 528 | if (!$this->minimiseMemoryUse) { |
529 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; | 529 | $this->requests[$url] = $this->getCached($url); |
530 | unset($match); | 530 | } |
531 | // handle redirect | 531 | */ |
532 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 532 | } else { |
533 | $this->requests[$orig]['location'] = trim($match[1]); | 533 | $this->debug("Sending request for $url"); |
534 | } | 534 | $this->requests[$orig]['original_url'] = $orig; |
535 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 535 | $req_url = $this->rewriteUrls($url); |
536 | $redirectURL = $this->requests[$orig]['location']; | 536 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
537 | if (!preg_match('!^https?://!i', $redirectURL)) { | 537 | $req_url = $this->removeFragment($req_url); |
538 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 538 | // send cookies, if we have any |
539 | } | 539 | $httpContext = $this->httpContext; |
540 | if ($this->validateURL($redirectURL)) { | 540 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; |
541 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 541 | // add referer for picky sites |
542 | // store any cookies | 542 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; |
543 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 543 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
544 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 544 | $this->debug("......sending cookies: $cookies"); |
545 | $this->redirectQueue[$orig] = $redirectURL; | 545 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; |
546 | } else { | 546 | } |
547 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 547 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { |
548 | } | 548 | $this->debug('Received response'); |
549 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 549 | // get status code |
550 | // check for <meta name='fragment' content='!'/> | 550 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { |
551 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 551 | $this->debug('Error: no status code found'); |
552 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 552 | // TODO: handle error - no status code |
553 | if (isset($this->requests[$orig]['body'])) { | 553 | } else { |
554 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 554 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); |
555 | if ($redirectURL) { | 555 | // check content type |
556 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 556 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
557 | $this->redirectQueue[$orig] = $redirectURL; | 557 | $this->requests[$orig]['body'] = ''; |
558 | } | 558 | } else { |
559 | } | 559 | $this->requests[$orig]['body'] = $html; |
560 | } | 560 | } |
561 | } | 561 | $this->requests[$orig]['effective_url'] = $req_url; |
562 | } else { | 562 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; |
563 | $this->debug('Error retrieving URL'); | 563 | unset($match); |
564 | //print_r($req_url); | 564 | // handle redirect |
565 | //print_r($http_response_header); | 565 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
566 | //print_r($html); | 566 | $this->requests[$orig]['location'] = trim($match[1]); |
567 | 567 | } | |
568 | // TODO: handle error - failed to retrieve URL | 568 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
569 | } | 569 | $redirectURL = $this->requests[$orig]['location']; |
570 | } | 570 | if (!preg_match('!^https?://!i', $redirectURL)) { |
571 | } | 571 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
572 | } | 572 | } |
573 | } | 573 | if ($this->validateURL($redirectURL)) { |
574 | 574 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | |
575 | public function handleCurlResponse($response, $info, $request) { | 575 | // store any cookies |
576 | $orig = $request->url_original; | 576 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
577 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); | 577 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
578 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); | 578 | $this->redirectQueue[$orig] = $redirectURL; |
579 | $this->requests[$orig]['method'] = $request->method; | 579 | } else { |
580 | $this->requests[$orig]['effective_url'] = $info['url']; | 580 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
581 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; | 581 | } |
582 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 582 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
583 | $this->requests[$orig]['location'] = trim($match[1]); | 583 | // check for <meta name='fragment' content='!'/> |
584 | } | 584 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
585 | } | 585 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
586 | 586 | if (isset($this->requests[$orig]['body'])) { | |
587 | protected function headersToString(array $headers, $associative=true) { | 587 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
588 | if (!$associative) { | 588 | if ($redirectURL) { |
589 | return implode("\n", $headers); | 589 | $this->redirectQueue[$orig] = $redirectURL; |
590 | } else { | 590 | } |
591 | $str = ''; | 591 | } |
592 | foreach ($headers as $key => $val) { | 592 | } |
593 | if (is_array($val)) { | 593 | } |
594 | foreach ($val as $v) $str .= "$key: $v\n"; | 594 | } else { |
595 | } else { | 595 | $this->debug('Error retrieving URL'); |
596 | $str .= "$key: $val\n"; | 596 | //print_r($req_url); |
597 | } | 597 | //print_r($http_response_header); |
598 | } | 598 | //print_r($html); |
599 | return rtrim($str); | 599 | |
600 | } | 600 | // TODO: handle error - failed to retrieve URL |
601 | } | 601 | } |
602 | 602 | } | |
603 | public function get($url, $remove=false, $gzdecode=true) { | 603 | } |
604 | $url = "$url"; | 604 | } |
605 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 605 | } |
606 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | 606 | |
607 | $response = $this->requests[$url]; | 607 | public function handleCurlResponse($response, $info, $request) { |
608 | /* | 608 | $orig = $request->url_original; |
609 | } elseif ($this->isCached($url)) { | 609 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); |
610 | $this->debug("URL already fetched - in disk cache ($url)"); | 610 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); |
611 | $response = $this->getCached($url); | 611 | $this->requests[$orig]['method'] = $request->method; |
612 | $this->requests[$url] = $response; | 612 | $this->requests[$orig]['effective_url'] = $info['url']; |
613 | */ | 613 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; |
614 | } else { | 614 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
615 | $this->debug("Fetching URL ($url)"); | 615 | $this->requests[$orig]['location'] = trim($match[1]); |
616 | $this->fetchAll(array($url)); | 616 | } |
617 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 617 | } |
618 | $response = $this->requests[$url]; | 618 | |
619 | } else { | 619 | protected function headersToString(array $headers, $associative=true) { |
620 | $this->debug("Request failed"); | 620 | if (!$associative) { |
621 | $response = false; | 621 | return implode("\n", $headers); |
622 | } | 622 | } else { |
623 | } | 623 | $str = ''; |
624 | /* | 624 | foreach ($headers as $key => $val) { |
625 | if ($this->minimiseMemoryUse && $response) { | 625 | if (is_array($val)) { |
626 | $this->cache($url); | 626 | foreach ($val as $v) $str .= "$key: $v\n"; |
627 | unset($this->requests[$url]); | 627 | } else { |
628 | } | 628 | $str .= "$key: $val\n"; |
629 | */ | 629 | } |
630 | if ($remove && $response) unset($this->requests[$url]); | 630 | } |
631 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { | 631 | return rtrim($str); |
632 | if ($html = gzdecode($response['body'])) { | 632 | } |
633 | $response['body'] = $html; | 633 | } |
634 | } | 634 | |
635 | } | 635 | public function get($url, $remove=false, $gzdecode=true) { |
636 | return $response; | 636 | $url = "$url"; |
637 | } | 637 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
638 | 638 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | |
639 | public function parallelSupport() { | 639 | $response = $this->requests[$url]; |
640 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); | 640 | /* |
641 | } | 641 | } elseif ($this->isCached($url)) { |
642 | 642 | $this->debug("URL already fetched - in disk cache ($url)"); | |
643 | private function headerOnlyType($headers) { | 643 | $response = $this->getCached($url); |
644 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { | 644 | $this->requests[$url] = $response; |
645 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) | 645 | */ |
646 | $match[1] = strtolower(trim($match[1])); | 646 | } else { |
647 | $match[2] = strtolower(trim($match[2])); | 647 | $this->debug("Fetching URL ($url)"); |
648 | foreach (array($match[1], $match[2]) as $mime) { | 648 | $this->fetchAll(array($url)); |
649 | if (in_array($mime, $this->headerOnlyTypes)) return true; | 649 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
650 | } | 650 | $response = $this->requests[$url]; |
651 | } | 651 | } else { |
652 | return false; | 652 | $this->debug("Request failed"); |
653 | } | 653 | $response = false; |
654 | 654 | } | |
655 | private function possibleUnsupportedType($url) { | 655 | } |
656 | $path = @parse_url($url, PHP_URL_PATH); | 656 | /* |
657 | if ($path && strpos($path, '.') !== false) { | 657 | if ($this->minimiseMemoryUse && $response) { |
658 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); | 658 | $this->cache($url); |
659 | return in_array($ext, $this->headerOnlyClues); | 659 | unset($this->requests[$url]); |
660 | } | 660 | } |
661 | return false; | 661 | */ |
662 | } | 662 | if ($remove && $response) unset($this->requests[$url]); |
663 | } | 663 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { |
664 | 664 | if ($html = gzdecode($response['body'])) { | |
665 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 | 665 | $response['body'] = $html; |
666 | if (!function_exists('gzdecode')) { | 666 | } |
667 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) | 667 | } |
668 | { | 668 | return $response; |
669 | $len = strlen($data); | 669 | } |
670 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { | 670 | |
671 | $error = "Not in GZIP format."; | 671 | public function parallelSupport() { |
672 | return null; // Not GZIP format (See RFC 1952) | 672 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); |
673 | } | 673 | } |
674 | $method = ord(substr($data,2,1)); // Compression method | 674 | |
675 | $flags = ord(substr($data,3,1)); // Flags | 675 | private function headerOnlyType($headers) { |
676 | if ($flags & 31 != $flags) { | 676 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { |
677 | $error = "Reserved bits not allowed."; | 677 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) |
678 | return null; | 678 | $match[1] = strtolower(trim($match[1])); |
679 | } | 679 | $match[2] = strtolower(trim($match[2])); |
680 | // NOTE: $mtime may be negative (PHP integer limitations) | 680 | foreach (array($match[1], $match[2]) as $mime) { |
681 | $mtime = unpack("V", substr($data,4,4)); | 681 | if (in_array($mime, $this->headerOnlyTypes)) return true; |
682 | $mtime = $mtime[1]; | 682 | } |
683 | $xfl = substr($data,8,1); | 683 | } |
684 | $os = substr($data,8,1); | 684 | return false; |
685 | $headerlen = 10; | 685 | } |
686 | $extralen = 0; | 686 | |
687 | $extra = ""; | 687 | private function possibleUnsupportedType($url) { |
688 | if ($flags & 4) { | 688 | $path = @parse_url($url, PHP_URL_PATH); |
689 | // 2-byte length prefixed EXTRA data in header | 689 | if ($path && strpos($path, '.') !== false) { |
690 | if ($len - $headerlen - 2 < 8) { | 690 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); |
691 | return false; // invalid | 691 | return in_array($ext, $this->headerOnlyClues); |
692 | } | 692 | } |
693 | $extralen = unpack("v",substr($data,8,2)); | 693 | return false; |
694 | $extralen = $extralen[1]; | 694 | } |
695 | if ($len - $headerlen - 2 - $extralen < 8) { | 695 | } |
696 | return false; // invalid | 696 | |
697 | } | 697 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 |
698 | $extra = substr($data,10,$extralen); | 698 | if (!function_exists('gzdecode')) { |
699 | $headerlen += 2 + $extralen; | 699 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) |
700 | } | 700 | { |
701 | $filenamelen = 0; | 701 | $len = strlen($data); |
702 | $filename = ""; | 702 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { |
703 | if ($flags & 8) { | 703 | $error = "Not in GZIP format."; |
704 | // C-style string | 704 | return null; // Not GZIP format (See RFC 1952) |
705 | if ($len - $headerlen - 1 < 8) { | 705 | } |
706 | return false; // invalid | 706 | $method = ord(substr($data,2,1)); // Compression method |
707 | } | 707 | $flags = ord(substr($data,3,1)); // Flags |
708 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); | 708 | if ($flags & 31 != $flags) { |
709 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { | 709 | $error = "Reserved bits not allowed."; |
710 | return false; // invalid | 710 | return null; |
711 | } | 711 | } |
712 | $filename = substr($data,$headerlen,$filenamelen); | 712 | // NOTE: $mtime may be negative (PHP integer limitations) |
713 | $headerlen += $filenamelen + 1; | 713 | $mtime = unpack("V", substr($data,4,4)); |
714 | } | 714 | $mtime = $mtime[1]; |
715 | $commentlen = 0; | 715 | $xfl = substr($data,8,1); |
716 | $comment = ""; | 716 | $os = substr($data,8,1); |
717 | if ($flags & 16) { | 717 | $headerlen = 10; |
718 | // C-style string COMMENT data in header | 718 | $extralen = 0; |
719 | if ($len - $headerlen - 1 < 8) { | 719 | $extra = ""; |
720 | return false; // invalid | 720 | if ($flags & 4) { |
721 | } | 721 | // 2-byte length prefixed EXTRA data in header |
722 | $commentlen = strpos(substr($data,$headerlen),chr(0)); | 722 | if ($len - $headerlen - 2 < 8) { |
723 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { | 723 | return false; // invalid |
724 | return false; // Invalid header format | 724 | } |
725 | } | 725 | $extralen = unpack("v",substr($data,8,2)); |
726 | $comment = substr($data,$headerlen,$commentlen); | 726 | $extralen = $extralen[1]; |
727 | $headerlen += $commentlen + 1; | 727 | if ($len - $headerlen - 2 - $extralen < 8) { |
728 | } | 728 | return false; // invalid |
729 | $headercrc = ""; | 729 | } |
730 | if ($flags & 2) { | 730 | $extra = substr($data,10,$extralen); |
731 | // 2-bytes (lowest order) of CRC32 on header present | 731 | $headerlen += 2 + $extralen; |
732 | if ($len - $headerlen - 2 < 8) { | 732 | } |
733 | return false; // invalid | 733 | $filenamelen = 0; |
734 | } | 734 | $filename = ""; |
735 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; | 735 | if ($flags & 8) { |
736 | $headercrc = unpack("v", substr($data,$headerlen,2)); | 736 | // C-style string |
737 | $headercrc = $headercrc[1]; | 737 | if ($len - $headerlen - 1 < 8) { |
738 | if ($headercrc != $calccrc) { | 738 | return false; // invalid |
739 | $error = "Header checksum failed."; | 739 | } |
740 | return false; // Bad header CRC | 740 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); |
741 | } | 741 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { |
742 | $headerlen += 2; | 742 | return false; // invalid |
743 | } | 743 | } |
744 | // GZIP FOOTER | 744 | $filename = substr($data,$headerlen,$filenamelen); |
745 | $datacrc = unpack("V",substr($data,-8,4)); | 745 | $headerlen += $filenamelen + 1; |
746 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); | 746 | } |
747 | $isize = unpack("V",substr($data,-4)); | 747 | $commentlen = 0; |
748 | $isize = $isize[1]; | 748 | $comment = ""; |
749 | // decompression: | 749 | if ($flags & 16) { |
750 | $bodylen = $len-$headerlen-8; | 750 | // C-style string COMMENT data in header |
751 | if ($bodylen < 1) { | 751 | if ($len - $headerlen - 1 < 8) { |
752 | // IMPLEMENTATION BUG! | 752 | return false; // invalid |
753 | return null; | 753 | } |
754 | } | 754 | $commentlen = strpos(substr($data,$headerlen),chr(0)); |
755 | $body = substr($data,$headerlen,$bodylen); | 755 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { |
756 | $data = ""; | 756 | return false; // Invalid header format |
757 | if ($bodylen > 0) { | 757 | } |
758 | switch ($method) { | 758 | $comment = substr($data,$headerlen,$commentlen); |
759 | case 8: | 759 | $headerlen += $commentlen + 1; |
760 | // Currently the only supported compression method: | 760 | } |
761 | $data = gzinflate($body,$maxlength); | 761 | $headercrc = ""; |
762 | break; | 762 | if ($flags & 2) { |
763 | default: | 763 | // 2-bytes (lowest order) of CRC32 on header present |
764 | $error = "Unknown compression method."; | 764 | if ($len - $headerlen - 2 < 8) { |
765 | return false; | 765 | return false; // invalid |
766 | } | 766 | } |
767 | } // zero-byte body content is allowed | 767 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; |
768 | // Verifiy CRC32 | 768 | $headercrc = unpack("v", substr($data,$headerlen,2)); |
769 | $crc = sprintf("%u",crc32($data)); | 769 | $headercrc = $headercrc[1]; |
770 | $crcOK = $crc == $datacrc; | 770 | if ($headercrc != $calccrc) { |
771 | $lenOK = $isize == strlen($data); | 771 | $error = "Header checksum failed."; |
772 | if (!$lenOK || !$crcOK) { | 772 | return false; // Bad header CRC |
773 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | 773 | } |
774 | return false; | 774 | $headerlen += 2; |
775 | } | 775 | } |
776 | return $data; | 776 | // GZIP FOOTER |
777 | } | 777 | $datacrc = unpack("V",substr($data,-8,4)); |
778 | } | 778 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); |
779 | ?> \ No newline at end of file | 779 | $isize = unpack("V",substr($data,-4)); |
780 | $isize = $isize[1]; | ||
781 | // decompression: | ||
782 | $bodylen = $len-$headerlen-8; | ||
783 | if ($bodylen < 1) { | ||
784 | // IMPLEMENTATION BUG! | ||
785 | return null; | ||
786 | } | ||
787 | $body = substr($data,$headerlen,$bodylen); | ||
788 | $data = ""; | ||
789 | if ($bodylen > 0) { | ||
790 | switch ($method) { | ||
791 | case 8: | ||
792 | // Currently the only supported compression method: | ||
793 | $data = gzinflate($body,$maxlength); | ||
794 | break; | ||
795 | default: | ||
796 | $error = "Unknown compression method."; | ||
797 | return false; | ||
798 | } | ||
799 | } // zero-byte body content is allowed | ||
800 | // Verifiy CRC32 | ||
801 | $crc = sprintf("%u",crc32($data)); | ||
802 | $crcOK = $crc == $datacrc; | ||
803 | $lenOK = $isize == strlen($data); | ||
804 | if (!$lenOK || !$crcOK) { | ||
805 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | ||
806 | return false; | ||
807 | } | ||
808 | return $data; | ||
809 | } | ||
810 | } \ No newline at end of file | ||