diff options
author | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-22 17:16:38 +0300 |
---|---|---|
committer | Maryana Rozhankivska <mariroz@mr.lviv.ua> | 2014-05-22 17:16:38 +0300 |
commit | 3ec62cf95ab4436923d4c665fad7aef226cbb822 (patch) | |
tree | f657024faaaf4c0b33ae27f7aea999f2b18cc8ab /inc/3rdparty/libraries | |
parent | ab157bbb75ba226917145c9bf906cbf764a85cd0 (diff) | |
download | wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.tar.gz wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.tar.zst wallabag-3ec62cf95ab4436923d4c665fad7aef226cbb822.zip |
update to 3.2 version of full-text-rss, issue #694
Diffstat (limited to 'inc/3rdparty/libraries')
-rw-r--r-- | inc/3rdparty/libraries/content-extractor/ContentExtractor.php | 1455 | ||||
-rw-r--r-- | inc/3rdparty/libraries/content-extractor/SiteConfig.php | 681 | ||||
-rwxr-xr-x[-rw-r--r--] | inc/3rdparty/libraries/feedwriter/FeedItem.php | 100 | ||||
-rwxr-xr-x | inc/3rdparty/libraries/feedwriter/FeedWriter.php | 17 | ||||
-rw-r--r-- | inc/3rdparty/libraries/html5/TreeBuilder.php | 13 | ||||
-rw-r--r-- | inc/3rdparty/libraries/humble-http-agent/CookieJar.php | 807 | ||||
-rw-r--r-- | inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php | 1589 | ||||
-rw-r--r-- | inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php | 157 | ||||
-rw-r--r-- | inc/3rdparty/libraries/language-detect/LanguageDetect.php | 992 | ||||
-rw-r--r-- | inc/3rdparty/libraries/readability/Readability.php | 2274 |
10 files changed, 4097 insertions, 3988 deletions
diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php index ddd33bb5..21e693e7 100644 --- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php +++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php | |||
@@ -1,728 +1,727 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Content Extractor | 3 | * Content Extractor |
4 | * | 4 | * |
5 | * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) | 5 | * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) |
6 | * to extract content from HTML files. | 6 | * to extract content from HTML files. |
7 | * | 7 | * |
8 | * @version 1.0 | 8 | * @version 1.0 |
9 | * @date 2013-02-05 | 9 | * @date 2013-02-05 |
10 | * @author Keyvan Minoukadeh | 10 | * @author Keyvan Minoukadeh |
11 | * @copyright 2013 Keyvan Minoukadeh | 11 | * @copyright 2013 Keyvan Minoukadeh |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
13 | */ | 13 | */ |
14 | 14 | ||
15 | class ContentExtractor | 15 | class ContentExtractor |
16 | { | 16 | { |
17 | protected static $tidy_config = array( | 17 | protected static $tidy_config = array( |
18 | 'clean' => true, | 18 | 'clean' => true, |
19 | 'output-xhtml' => true, | 19 | 'output-xhtml' => true, |
20 | 'logical-emphasis' => true, | 20 | 'logical-emphasis' => true, |
21 | 'show-body-only' => false, | 21 | 'show-body-only' => false, |
22 | 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', | 22 | 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', |
23 | 'new-inline-tags' => 'mark, time, meter, progress, data', | 23 | 'new-inline-tags' => 'mark, time, meter, progress, data', |
24 | 'wrap' => 0, | 24 | 'wrap' => 0, |
25 | 'drop-empty-paras' => true, | 25 | 'drop-empty-paras' => true, |
26 | 'drop-proprietary-attributes' => false, | 26 | 'drop-proprietary-attributes' => false, |
27 | 'enclose-text' => true, | 27 | 'enclose-text' => true, |
28 | 'enclose-block-text' => true, | 28 | 'enclose-block-text' => true, |
29 | 'merge-divs' => true, | 29 | 'merge-divs' => true, |
30 | 'merge-spans' => true, | 30 | 'merge-spans' => true, |
31 | 'char-encoding' => 'utf8', | 31 | 'char-encoding' => 'utf8', |
32 | 'hide-comments' => true | 32 | 'hide-comments' => true |
33 | ); | 33 | ); |
34 | protected $html; | 34 | protected $html; |
35 | protected $config; | 35 | protected $config; |
36 | protected $title; | 36 | protected $title; |
37 | protected $author = array(); | 37 | protected $author = array(); |
38 | protected $language; | 38 | protected $language; |
39 | protected $date; | 39 | protected $date; |
40 | protected $body; | 40 | protected $body; |
41 | protected $success = false; | 41 | protected $success = false; |
42 | protected $nextPageUrl; | 42 | protected $nextPageUrl; |
43 | public $allowedParsers = array('libxml', 'html5lib'); | 43 | public $allowedParsers = array('libxml', 'html5lib'); |
44 | public $fingerprints = array(); | 44 | public $fingerprints = array(); |
45 | public $readability; | 45 | public $readability; |
46 | public $debug = false; | 46 | public $debug = false; |
47 | public $debugVerbose = false; | 47 | public $debugVerbose = false; |
48 | 48 | ||
49 | function __construct($path, $fallback=null) { | 49 | function __construct($path, $fallback=null) { |
50 | SiteConfig::set_config_path($path, $fallback); | 50 | SiteConfig::set_config_path($path, $fallback); |
51 | } | 51 | } |
52 | 52 | ||
53 | protected function debug($msg) { | 53 | protected function debug($msg) { |
54 | if ($this->debug) { | 54 | if ($this->debug) { |
55 | $mem = round(memory_get_usage()/1024, 2); | 55 | $mem = round(memory_get_usage()/1024, 2); |
56 | $memPeak = round(memory_get_peak_usage()/1024, 2); | 56 | $memPeak = round(memory_get_peak_usage()/1024, 2); |
57 | echo '* ',$msg; | 57 | echo '* ',$msg; |
58 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; | 58 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; |
59 | echo "\n"; | 59 | echo "\n"; |
60 | ob_flush(); | 60 | ob_flush(); |
61 | flush(); | 61 | flush(); |
62 | } | 62 | } |
63 | } | 63 | } |
64 | 64 | ||
65 | public function reset() { | 65 | public function reset() { |
66 | $this->html = null; | 66 | $this->html = null; |
67 | $this->readability = null; | 67 | $this->readability = null; |
68 | $this->config = null; | 68 | $this->config = null; |
69 | $this->title = null; | 69 | $this->title = null; |
70 | $this->body = null; | 70 | $this->body = null; |
71 | $this->author = array(); | 71 | $this->author = array(); |
72 | $this->language = null; | 72 | $this->language = null; |
73 | $this->date = null; | 73 | $this->date = null; |
74 | $this->nextPageUrl = null; | 74 | $this->nextPageUrl = null; |
75 | $this->success = false; | 75 | $this->success = false; |
76 | } | 76 | } |
77 | 77 | ||
78 | public function findHostUsingFingerprints($html) { | 78 | public function findHostUsingFingerprints($html) { |
79 | $this->debug('Checking fingerprints...'); | 79 | $this->debug('Checking fingerprints...'); |
80 | $head = substr($html, 0, 8000); | 80 | $head = substr($html, 0, 8000); |
81 | foreach ($this->fingerprints as $_fp => $_fphost) { | 81 | foreach ($this->fingerprints as $_fp => $_fphost) { |
82 | $lookin = 'html'; | 82 | $lookin = 'html'; |
83 | if (is_array($_fphost)) { | 83 | if (is_array($_fphost)) { |
84 | if (isset($_fphost['head']) && $_fphost['head']) { | 84 | if (isset($_fphost['head']) && $_fphost['head']) { |
85 | $lookin = 'head'; | 85 | $lookin = 'head'; |
86 | } | 86 | } |
87 | $_fphost = $_fphost['hostname']; | 87 | $_fphost = $_fphost['hostname']; |
88 | } | 88 | } |
89 | if (strpos($$lookin, $_fp) !== false) { | 89 | if (strpos($$lookin, $_fp) !== false) { |
90 | $this->debug("Found match: $_fphost"); | 90 | $this->debug("Found match: $_fphost"); |
91 | return $_fphost; | 91 | return $_fphost; |
92 | } | 92 | } |
93 | } | 93 | } |
94 | $this->debug('No fingerprint matches'); | 94 | $this->debug('No fingerprint matches'); |
95 | return false; | 95 | return false; |
96 | } | 96 | } |
97 | 97 | ||
98 | // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) | 98 | // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) |
99 | public function buildSiteConfig($url, $html='', $add_to_cache=true) { | 99 | public function buildSiteConfig($url, $html='', $add_to_cache=true) { |
100 | // extract host name | 100 | // extract host name |
101 | $host = @parse_url($url, PHP_URL_HOST); | 101 | $host = @parse_url($url, PHP_URL_HOST); |
102 | $host = strtolower($host); | 102 | $host = strtolower($host); |
103 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); | 103 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); |
104 | // is merged version already cached? | 104 | // is merged version already cached? |
105 | if (SiteConfig::is_cached("$host.merged")) { | 105 | if (SiteConfig::is_cached("$host.merged")) { |
106 | $this->debug("Returning cached and merged site config for $host"); | 106 | $this->debug("Returning cached and merged site config for $host"); |
107 | return SiteConfig::build("$host.merged"); | 107 | return SiteConfig::build("$host.merged"); |
108 | } | 108 | } |
109 | // let's build from site_config/custom/ and standard/ | 109 | // let's build from site_config/custom/ and standard/ |
110 | $config = SiteConfig::build($host); | 110 | $config = SiteConfig::build($host); |
111 | if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) { | 111 | if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) { |
112 | SiteConfig::add_to_cache($host, $config); | 112 | SiteConfig::add_to_cache($host, $config); |
113 | } | 113 | } |
114 | // if no match, use defaults | 114 | // if no match, use defaults |
115 | if (!$config) $config = new SiteConfig(); | 115 | if (!$config) $config = new SiteConfig(); |
116 | // load fingerprint config? | 116 | // load fingerprint config? |
117 | if ($config->autodetect_on_failure()) { | 117 | if ($config->autodetect_on_failure()) { |
118 | // check HTML for fingerprints | 118 | // check HTML for fingerprints |
119 | if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) { | 119 | if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) { |
120 | if ($config_fingerprint = SiteConfig::build($_fphost)) { | 120 | if ($config_fingerprint = SiteConfig::build($_fphost)) { |
121 | $this->debug("Appending site config settings from $_fphost (fingerprint match)"); | 121 | $this->debug("Appending site config settings from $_fphost (fingerprint match)"); |
122 | $config->append($config_fingerprint); | 122 | $config->append($config_fingerprint); |
123 | if ($add_to_cache && !SiteConfig::is_cached($_fphost)) { | 123 | if ($add_to_cache && !SiteConfig::is_cached($_fphost)) { |
124 | //$config_fingerprint->cache_in_apc = true; | 124 | //$config_fingerprint->cache_in_apc = true; |
125 | SiteConfig::add_to_cache($_fphost, $config_fingerprint); | 125 | SiteConfig::add_to_cache($_fphost, $config_fingerprint); |
126 | } | 126 | } |
127 | } | 127 | } |
128 | } | 128 | } |
129 | } | 129 | } |
130 | // load global config? | 130 | // load global config? |
131 | if ($config->autodetect_on_failure()) { | 131 | if ($config->autodetect_on_failure()) { |
132 | if ($config_global = SiteConfig::build('global', true)) { | 132 | if ($config_global = SiteConfig::build('global', true)) { |
133 | $this->debug('Appending site config settings from global.txt'); | 133 | $this->debug('Appending site config settings from global.txt'); |
134 | $config->append($config_global); | 134 | $config->append($config_global); |
135 | if ($add_to_cache && !SiteConfig::is_cached('global')) { | 135 | if ($add_to_cache && !SiteConfig::is_cached('global')) { |
136 | //$config_global->cache_in_apc = true; | 136 | //$config_global->cache_in_apc = true; |
137 | SiteConfig::add_to_cache('global', $config_global); | 137 | SiteConfig::add_to_cache('global', $config_global); |
138 | } | 138 | } |
139 | } | 139 | } |
140 | } | 140 | } |
141 | // store copy of merged config | 141 | // store copy of merged config |
142 | if ($add_to_cache) { | 142 | if ($add_to_cache) { |
143 | // do not store in APC if wildcard match | 143 | // do not store in APC if wildcard match |
144 | $use_apc = ($host == $config->cache_key); | 144 | $use_apc = ($host == $config->cache_key); |
145 | $config->cache_key = null; | 145 | $config->cache_key = null; |
146 | SiteConfig::add_to_cache("$host.merged", $config, $use_apc); | 146 | SiteConfig::add_to_cache("$host.merged", $config, $use_apc); |
147 | } | 147 | } |
148 | return $config; | 148 | return $config; |
149 | } | 149 | } |
150 | 150 | ||
151 | // returns true on success, false on failure | 151 | // returns true on success, false on failure |
152 | // $smart_tidy indicates that if tidy is used and no results are produced, we will | 152 | // $smart_tidy indicates that if tidy is used and no results are produced, we will |
153 | // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time | 153 | // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time |
154 | // but it has problems of its own which we try to avoid with this option. | 154 | // but it has problems of its own which we try to avoid with this option. |
155 | public function process($html, $url, $smart_tidy=true) { | 155 | public function process($html, $url, $smart_tidy=true) { |
156 | $this->reset(); | 156 | $this->reset(); |
157 | $this->config = $this->buildSiteConfig($url, $html); | 157 | $this->config = $this->buildSiteConfig($url, $html); |
158 | 158 | ||
159 | // do string replacements | 159 | // do string replacements |
160 | if (!empty($this->config->find_string)) { | 160 | if (!empty($this->config->find_string)) { |
161 | if (count($this->config->find_string) == count($this->config->replace_string)) { | 161 | if (count($this->config->find_string) == count($this->config->replace_string)) { |
162 | $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); | 162 | $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); |
163 | $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); | 163 | $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); |
164 | } else { | 164 | } else { |
165 | $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); | 165 | $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); |
166 | } | 166 | } |
167 | unset($_count); | 167 | unset($_count); |
168 | } | 168 | } |
169 | 169 | ||
170 | // use tidy (if it exists)? | 170 | // use tidy (if it exists)? |
171 | // This fixes problems with some sites which would otherwise | 171 | // This fixes problems with some sites which would otherwise |
172 | // trouble DOMDocument's HTML parsing. (Although sometimes it | 172 | // trouble DOMDocument's HTML parsing. (Although sometimes it |
173 | // makes matters worse, which is why you can override it in site config files.) | 173 | // makes matters worse, which is why you can override it in site config files.) |
174 | $tidied = false; | 174 | $tidied = false; |
175 | if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { | 175 | if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { |
176 | $this->debug('Using Tidy'); | 176 | $this->debug('Using Tidy'); |
177 | $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); | 177 | $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); |
178 | if (tidy_clean_repair($tidy)) { | 178 | if (tidy_clean_repair($tidy)) { |
179 | $original_html = $html; | 179 | $original_html = $html; |
180 | $tidied = true; | 180 | $tidied = true; |
181 | $html = $tidy->value; | 181 | $html = $tidy->value; |
182 | } | 182 | } |
183 | unset($tidy); | 183 | unset($tidy); |
184 | } | 184 | } |
185 | 185 | ||
186 | // load and parse html | 186 | // load and parse html |
187 | $_parser = $this->config->parser(); | 187 | $_parser = $this->config->parser(); |
188 | if (!in_array($_parser, $this->allowedParsers)) { | 188 | if (!in_array($_parser, $this->allowedParsers)) { |
189 | $this->debug("HTML parser $_parser not listed, using libxml instead"); | 189 | $this->debug("HTML parser $_parser not listed, using libxml instead"); |
190 | $_parser = 'libxml'; | 190 | $_parser = 'libxml'; |
191 | } | 191 | } |
192 | $this->debug("Attempting to parse HTML with $_parser"); | 192 | $this->debug("Attempting to parse HTML with $_parser"); |
193 | $this->readability = new Readability($html, $url, $_parser); | 193 | $this->readability = new Readability($html, $url, $_parser); |
194 | 194 | ||
195 | // we use xpath to find elements in the given HTML document | 195 | // we use xpath to find elements in the given HTML document |
196 | // see http://en.wikipedia.org/wiki/XPath_1.0 | 196 | // see http://en.wikipedia.org/wiki/XPath_1.0 |
197 | $xpath = new DOMXPath($this->readability->dom); | 197 | $xpath = new DOMXPath($this->readability->dom); |
198 | 198 | ||
199 | // try to get next page link | 199 | // try to get next page link |
200 | foreach ($this->config->next_page_link as $pattern) { | 200 | foreach ($this->config->next_page_link as $pattern) { |
201 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 201 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
202 | if (is_string($elems)) { | 202 | if (is_string($elems)) { |
203 | $this->nextPageUrl = trim($elems); | 203 | $this->nextPageUrl = trim($elems); |
204 | break; | 204 | break; |
205 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 205 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
206 | foreach ($elems as $item) { | 206 | foreach ($elems as $item) { |
207 | if ($item instanceof DOMElement && $item->hasAttribute('href')) { | 207 | if ($item instanceof DOMElement && $item->hasAttribute('href')) { |
208 | $this->nextPageUrl = $item->getAttribute('href'); | 208 | $this->nextPageUrl = $item->getAttribute('href'); |
209 | break 2; | 209 | break 2; |
210 | } elseif ($item instanceof DOMAttr && $item->value) { | 210 | } elseif ($item instanceof DOMAttr && $item->value) { |
211 | $this->nextPageUrl = $item->value; | 211 | $this->nextPageUrl = $item->value; |
212 | break 2; | 212 | break 2; |
213 | } | 213 | } |
214 | } | 214 | } |
215 | } | 215 | } |
216 | } | 216 | } |
217 | 217 | ||
218 | // try to get title | 218 | // try to get title |
219 | foreach ($this->config->title as $pattern) { | 219 | foreach ($this->config->title as $pattern) { |
220 | // $this->debug("Trying $pattern"); | 220 | // $this->debug("Trying $pattern"); |
221 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 221 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
222 | if (is_string($elems)) { | 222 | if (is_string($elems)) { |
223 | $this->title = trim($elems); | 223 | $this->title = trim($elems); |
224 | $this->debug('Title expression evaluated as string: '.$this->title); | 224 | $this->debug('Title expression evaluated as string: '.$this->title); |
225 | $this->debug("...XPath match: $pattern"); | 225 | $this->debug("...XPath match: $pattern"); |
226 | break; | 226 | break; |
227 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 227 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
228 | $this->title = $elems->item(0)->textContent; | 228 | $this->title = $elems->item(0)->textContent; |
229 | $this->debug('Title matched: '.$this->title); | 229 | $this->debug('Title matched: '.$this->title); |
230 | $this->debug("...XPath match: $pattern"); | 230 | $this->debug("...XPath match: $pattern"); |
231 | // remove title from document | 231 | // remove title from document |
232 | try { | 232 | try { |
233 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 233 | @$elems->item(0)->parentNode->removeChild($elems->item(0)); |
234 | } catch (DOMException $e) { | 234 | } catch (DOMException $e) { |
235 | // do nothing | 235 | // do nothing |
236 | } | 236 | } |
237 | break; | 237 | break; |
238 | } | 238 | } |
239 | } | 239 | } |
240 | 240 | ||
241 | // try to get author (if it hasn't already been set) | 241 | // try to get author (if it hasn't already been set) |
242 | if (empty($this->author)) { | 242 | if (empty($this->author)) { |
243 | foreach ($this->config->author as $pattern) { | 243 | foreach ($this->config->author as $pattern) { |
244 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 244 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
245 | if (is_string($elems)) { | 245 | if (is_string($elems)) { |
246 | if (trim($elems) != '') { | 246 | if (trim($elems) != '') { |
247 | $this->author[] = trim($elems); | 247 | $this->author[] = trim($elems); |
248 | $this->debug('Author expression evaluated as string: '.trim($elems)); | 248 | $this->debug('Author expression evaluated as string: '.trim($elems)); |
249 | $this->debug("...XPath match: $pattern"); | 249 | $this->debug("...XPath match: $pattern"); |
250 | break; | 250 | break; |
251 | } | 251 | } |
252 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 252 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
253 | foreach ($elems as $elem) { | 253 | foreach ($elems as $elem) { |
254 | if (!isset($elem->parentNode)) continue; | 254 | if (!isset($elem->parentNode)) continue; |
255 | $this->author[] = trim($elem->textContent); | 255 | $this->author[] = trim($elem->textContent); |
256 | $this->debug('Author matched: '.trim($elem->textContent)); | 256 | $this->debug('Author matched: '.trim($elem->textContent)); |
257 | } | 257 | } |
258 | if (!empty($this->author)) { | 258 | if (!empty($this->author)) { |
259 | $this->debug("...XPath match: $pattern"); | 259 | $this->debug("...XPath match: $pattern"); |
260 | break; | 260 | break; |
261 | } | 261 | } |
262 | } | 262 | } |
263 | } | 263 | } |
264 | } | 264 | } |
265 | 265 | ||
266 | // try to get language | 266 | // try to get language |
267 | $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); | 267 | $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); |
268 | foreach ($_lang_xpath as $pattern) { | 268 | foreach ($_lang_xpath as $pattern) { |
269 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 269 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
270 | if (is_string($elems)) { | 270 | if (is_string($elems)) { |
271 | if (trim($elems) != '') { | 271 | if (trim($elems) != '') { |
272 | $this->language = trim($elems); | 272 | $this->language = trim($elems); |
273 | $this->debug('Language matched: '.$this->language); | 273 | $this->debug('Language matched: '.$this->language); |
274 | break; | 274 | break; |
275 | } | 275 | } |
276 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 276 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
277 | foreach ($elems as $elem) { | 277 | foreach ($elems as $elem) { |
278 | if (!isset($elem->parentNode)) continue; | 278 | if (!isset($elem->parentNode)) continue; |
279 | $this->language = trim($elem->textContent); | 279 | $this->language = trim($elem->textContent); |
280 | $this->debug('Language matched: '.$this->language); | 280 | $this->debug('Language matched: '.$this->language); |
281 | } | 281 | } |
282 | if ($this->language) break; | 282 | if ($this->language) break; |
283 | } | 283 | } |
284 | } | 284 | } |
285 | 285 | ||
286 | // try to get date | 286 | // try to get date |
287 | foreach ($this->config->date as $pattern) { | 287 | foreach ($this->config->date as $pattern) { |
288 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 288 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
289 | if (is_string($elems)) { | 289 | if (is_string($elems)) { |
290 | $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); | 290 | $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); |
291 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 291 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
292 | $this->date = $elems->item(0)->textContent; | 292 | $this->date = $elems->item(0)->textContent; |
293 | $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); | 293 | $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); |
294 | // remove date from document | 294 | // remove date from document |
295 | // $elems->item(0)->parentNode->removeChild($elems->item(0)); | 295 | // $elems->item(0)->parentNode->removeChild($elems->item(0)); |
296 | } | 296 | } |
297 | if (!$this->date) { | 297 | if (!$this->date) { |
298 | $this->date = null; | 298 | $this->date = null; |
299 | } else { | 299 | } else { |
300 | $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); | 300 | $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); |
301 | $this->debug("...XPath match: $pattern"); | 301 | $this->debug("...XPath match: $pattern"); |
302 | break; | 302 | break; |
303 | } | 303 | } |
304 | } | 304 | } |
305 | 305 | ||
306 | // strip elements (using xpath expressions) | 306 | // strip elements (using xpath expressions) |
307 | foreach ($this->config->strip as $pattern) { | 307 | foreach ($this->config->strip as $pattern) { |
308 | $elems = @$xpath->query($pattern, $this->readability->dom); | 308 | $elems = @$xpath->query($pattern, $this->readability->dom); |
309 | // check for matches | 309 | // check for matches |
310 | if ($elems && $elems->length > 0) { | 310 | if ($elems && $elems->length > 0) { |
311 | $this->debug('Stripping '.$elems->length.' elements (strip)'); | 311 | $this->debug('Stripping '.$elems->length.' elements (strip)'); |
312 | for ($i=$elems->length-1; $i >= 0; $i--) { | 312 | for ($i=$elems->length-1; $i >= 0; $i--) { |
313 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 313 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
314 | } | 314 | } |
315 | } | 315 | } |
316 | } | 316 | } |
317 | 317 | ||
318 | // strip elements (using id and class attribute values) | 318 | // strip elements (using id and class attribute values) |
319 | foreach ($this->config->strip_id_or_class as $string) { | 319 | foreach ($this->config->strip_id_or_class as $string) { |
320 | $string = strtr($string, array("'"=>'', '"'=>'')); | 320 | $string = strtr($string, array("'"=>'', '"'=>'')); |
321 | $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); | 321 | $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); |
322 | // check for matches | 322 | // check for matches |
323 | if ($elems && $elems->length > 0) { | 323 | if ($elems && $elems->length > 0) { |
324 | $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); | 324 | $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); |
325 | for ($i=$elems->length-1; $i >= 0; $i--) { | 325 | for ($i=$elems->length-1; $i >= 0; $i--) { |
326 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 326 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
327 | } | 327 | } |
328 | } | 328 | } |
329 | } | 329 | } |
330 | 330 | ||
331 | // strip images (using src attribute values) | 331 | // strip images (using src attribute values) |
332 | foreach ($this->config->strip_image_src as $string) { | 332 | foreach ($this->config->strip_image_src as $string) { |
333 | $string = strtr($string, array("'"=>'', '"'=>'')); | 333 | $string = strtr($string, array("'"=>'', '"'=>'')); |
334 | $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); | 334 | $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); |
335 | // check for matches | 335 | // check for matches |
336 | if ($elems && $elems->length > 0) { | 336 | if ($elems && $elems->length > 0) { |
337 | $this->debug('Stripping '.$elems->length.' image elements'); | 337 | $this->debug('Stripping '.$elems->length.' image elements'); |
338 | for ($i=$elems->length-1; $i >= 0; $i--) { | 338 | for ($i=$elems->length-1; $i >= 0; $i--) { |
339 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 339 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
340 | } | 340 | } |
341 | } | 341 | } |
342 | } | 342 | } |
343 | // strip elements using Readability.com and Instapaper.com ignore class names | 343 | // strip elements using Readability.com and Instapaper.com ignore class names |
344 | // .entry-unrelated and .instapaper_ignore | 344 | // .entry-unrelated and .instapaper_ignore |
345 | // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines | 345 | // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines |
346 | // and http://blog.instapaper.com/post/730281947 | 346 | // and http://blog.instapaper.com/post/730281947 |
347 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); | 347 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); |
348 | // check for matches | 348 | // check for matches |
349 | if ($elems && $elems->length > 0) { | 349 | if ($elems && $elems->length > 0) { |
350 | $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); | 350 | $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); |
351 | for ($i=$elems->length-1; $i >= 0; $i--) { | 351 | for ($i=$elems->length-1; $i >= 0; $i--) { |
352 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 352 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
353 | } | 353 | } |
354 | } | 354 | } |
355 | 355 | ||
356 | // strip elements that contain style="display: none;" | 356 | // strip elements that contain style="display: none;" |
357 | $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); | 357 | $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); |
358 | // check for matches | 358 | // check for matches |
359 | if ($elems && $elems->length > 0) { | 359 | if ($elems && $elems->length > 0) { |
360 | $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); | 360 | $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); |
361 | for ($i=$elems->length-1; $i >= 0; $i--) { | 361 | for ($i=$elems->length-1; $i >= 0; $i--) { |
362 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 362 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
363 | } | 363 | } |
364 | } | 364 | } |
365 | 365 | ||
366 | // try to get body | 366 | // try to get body |
367 | foreach ($this->config->body as $pattern) { | 367 | foreach ($this->config->body as $pattern) { |
368 | $elems = @$xpath->query($pattern, $this->readability->dom); | 368 | $elems = @$xpath->query($pattern, $this->readability->dom); |
369 | // check for matches | 369 | // check for matches |
370 | if ($elems && $elems->length > 0) { | 370 | if ($elems && $elems->length > 0) { |
371 | $this->debug('Body matched'); | 371 | $this->debug('Body matched'); |
372 | $this->debug("...XPath match: $pattern"); | 372 | $this->debug("...XPath match: $pattern"); |
373 | if ($elems->length == 1) { | 373 | if ($elems->length == 1) { |
374 | $this->body = $elems->item(0); | 374 | $this->body = $elems->item(0); |
375 | // prune (clean up elements that may not be content) | 375 | // prune (clean up elements that may not be content) |
376 | if ($this->config->prune()) { | 376 | if ($this->config->prune()) { |
377 | $this->debug('...pruning content'); | 377 | $this->debug('...pruning content'); |
378 | $this->readability->prepArticle($this->body); | 378 | $this->readability->prepArticle($this->body); |
379 | } | 379 | } |
380 | break; | 380 | break; |
381 | } else { | 381 | } else { |
382 | $this->body = $this->readability->dom->createElement('div'); | 382 | $this->body = $this->readability->dom->createElement('div'); |
383 | $this->debug($elems->length.' body elems found'); | 383 | $this->debug($elems->length.' body elems found'); |
384 | foreach ($elems as $elem) { | 384 | foreach ($elems as $elem) { |
385 | if (!isset($elem->parentNode)) continue; | 385 | if (!isset($elem->parentNode)) continue; |
386 | $isDescendant = false; | 386 | $isDescendant = false; |
387 | foreach ($this->body->childNodes as $parent) { | 387 | foreach ($this->body->childNodes as $parent) { |
388 | if ($this->isDescendant($parent, $elem)) { | 388 | if ($this->isDescendant($parent, $elem)) { |
389 | $isDescendant = true; | 389 | $isDescendant = true; |
390 | break; | 390 | break; |
391 | } | 391 | } |
392 | } | 392 | } |
393 | if ($isDescendant) { | 393 | if ($isDescendant) { |
394 | $this->debug('...element is child of another body element, skipping.'); | 394 | $this->debug('...element is child of another body element, skipping.'); |
395 | } else { | 395 | } else { |
396 | // prune (clean up elements that may not be content) | 396 | // prune (clean up elements that may not be content) |
397 | if ($this->config->prune()) { | 397 | if ($this->config->prune()) { |
398 | $this->debug('Pruning content'); | 398 | $this->debug('Pruning content'); |
399 | $this->readability->prepArticle($elem); | 399 | $this->readability->prepArticle($elem); |
400 | } | 400 | } |
401 | $this->debug('...element added to body'); | 401 | $this->debug('...element added to body'); |
402 | $this->body->appendChild($elem); | 402 | $this->body->appendChild($elem); |
403 | } | 403 | } |
404 | } | 404 | } |
405 | if ($this->body->hasChildNodes()) break; | 405 | if ($this->body->hasChildNodes()) break; |
406 | } | 406 | } |
407 | } | 407 | } |
408 | } | 408 | } |
409 | 409 | ||
410 | // auto detect? | 410 | // auto detect? |
411 | $detect_title = $detect_body = $detect_author = $detect_date = false; | 411 | $detect_title = $detect_body = $detect_author = $detect_date = false; |
412 | // detect title? | 412 | // detect title? |
413 | if (!isset($this->title)) { | 413 | if (!isset($this->title)) { |
414 | if (empty($this->config->title) || $this->config->autodetect_on_failure()) { | 414 | if (empty($this->config->title) || $this->config->autodetect_on_failure()) { |
415 | $detect_title = true; | 415 | $detect_title = true; |
416 | } | 416 | } |
417 | } | 417 | } |
418 | // detect body? | 418 | // detect body? |
419 | if (!isset($this->body)) { | 419 | if (!isset($this->body)) { |
420 | if (empty($this->config->body) || $this->config->autodetect_on_failure()) { | 420 | if (empty($this->config->body) || $this->config->autodetect_on_failure()) { |
421 | $detect_body = true; | 421 | $detect_body = true; |
422 | } | 422 | } |
423 | } | 423 | } |
424 | // detect author? | 424 | // detect author? |
425 | if (empty($this->author)) { | 425 | if (empty($this->author)) { |
426 | if (empty($this->config->author) || $this->config->autodetect_on_failure()) { | 426 | if (empty($this->config->author) || $this->config->autodetect_on_failure()) { |
427 | $detect_author = true; | 427 | $detect_author = true; |
428 | } | 428 | } |
429 | } | 429 | } |
430 | // detect date? | 430 | // detect date? |
431 | if (!isset($this->date)) { | 431 | if (!isset($this->date)) { |
432 | if (empty($this->config->date) || $this->config->autodetect_on_failure()) { | 432 | if (empty($this->config->date) || $this->config->autodetect_on_failure()) { |
433 | $detect_date = true; | 433 | $detect_date = true; |
434 | } | 434 | } |
435 | } | 435 | } |
436 | 436 | ||
437 | // check for hNews | 437 | // check for hNews |
438 | if ($detect_title || $detect_body) { | 438 | if ($detect_title || $detect_body) { |
439 | // check for hentry | 439 | // check for hentry |
440 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); | 440 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); |
441 | if ($elems && $elems->length > 0) { | 441 | if ($elems && $elems->length > 0) { |
442 | $this->debug('hNews: found hentry'); | 442 | $this->debug('hNews: found hentry'); |
443 | $hentry = $elems->item(0); | 443 | $hentry = $elems->item(0); |
444 | 444 | ||
445 | if ($detect_title) { | 445 | if ($detect_title) { |
446 | // check for entry-title | 446 | // check for entry-title |
447 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); | 447 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); |
448 | if ($elems && $elems->length > 0) { | 448 | if ($elems && $elems->length > 0) { |
449 | $this->title = $elems->item(0)->textContent; | 449 | $this->title = $elems->item(0)->textContent; |
450 | $this->debug('hNews: found entry-title: '.$this->title); | 450 | $this->debug('hNews: found entry-title: '.$this->title); |
451 | // remove title from document | 451 | // remove title from document |
452 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 452 | $elems->item(0)->parentNode->removeChild($elems->item(0)); |
453 | $detect_title = false; | 453 | $detect_title = false; |
454 | } | 454 | } |
455 | } | 455 | } |
456 | 456 | ||
457 | if ($detect_date) { | 457 | if ($detect_date) { |
458 | // check for time element with pubdate attribute | 458 | // check for time element with pubdate attribute |
459 | $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); | 459 | $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); |
460 | if ($elems && $elems->length > 0) { | 460 | if ($elems && $elems->length > 0) { |
461 | $this->date = strtotime(trim($elems->item(0)->textContent)); | 461 | $this->date = strtotime(trim($elems->item(0)->textContent)); |
462 | // remove date from document | 462 | // remove date from document |
463 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); | 463 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); |
464 | if ($this->date) { | 464 | if ($this->date) { |
465 | $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); | 465 | $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); |
466 | $detect_date = false; | 466 | $detect_date = false; |
467 | } else { | 467 | } else { |
468 | $this->date = null; | 468 | $this->date = null; |
469 | } | 469 | } |
470 | } | 470 | } |
471 | } | 471 | } |
472 | 472 | ||
473 | if ($detect_author) { | 473 | if ($detect_author) { |
474 | // check for time element with pubdate attribute | 474 | // check for time element with pubdate attribute |
475 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); | 475 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); |
476 | if ($elems && $elems->length > 0) { | 476 | if ($elems && $elems->length > 0) { |
477 | $author = $elems->item(0); | 477 | $author = $elems->item(0); |
478 | $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); | 478 | $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); |
479 | if ($fn && $fn->length > 0) { | 479 | if ($fn && $fn->length > 0) { |
480 | foreach ($fn as $_fn) { | 480 | foreach ($fn as $_fn) { |
481 | if (trim($_fn->textContent) != '') { | 481 | if (trim($_fn->textContent) != '') { |
482 | $this->author[] = trim($_fn->textContent); | 482 | $this->author[] = trim($_fn->textContent); |
483 | $this->debug('hNews: found author: '.trim($_fn->textContent)); | 483 | $this->debug('hNews: found author: '.trim($_fn->textContent)); |
484 | } | 484 | } |
485 | } | 485 | } |
486 | } else { | 486 | } else { |
487 | if (trim($author->textContent) != '') { | 487 | if (trim($author->textContent) != '') { |
488 | $this->author[] = trim($author->textContent); | 488 | $this->author[] = trim($author->textContent); |
489 | $this->debug('hNews: found author: '.trim($author->textContent)); | 489 | $this->debug('hNews: found author: '.trim($author->textContent)); |
490 | } | 490 | } |
491 | } | 491 | } |
492 | $detect_author = empty($this->author); | 492 | $detect_author = empty($this->author); |
493 | } | 493 | } |
494 | } | 494 | } |
495 | 495 | ||
496 | // check for entry-content. | 496 | // check for entry-content. |
497 | // according to hAtom spec, if there are multiple elements marked entry-content, | 497 | // according to hAtom spec, if there are multiple elements marked entry-content, |
498 | // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content | 498 | // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content |
499 | if ($detect_body) { | 499 | if ($detect_body) { |
500 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); | 500 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); |
501 | if ($elems && $elems->length > 0) { | 501 | if ($elems && $elems->length > 0) { |
502 | $this->debug('hNews: found entry-content'); | 502 | $this->debug('hNews: found entry-content'); |
503 | if ($elems->length == 1) { | 503 | if ($elems->length == 1) { |
504 | // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) | 504 | // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) |
505 | $e = $elems->item(0); | 505 | $e = $elems->item(0); |
506 | if (($e->tagName == 'img') || (trim($e->textContent) != '')) { | 506 | if (($e->tagName == 'img') || (trim($e->textContent) != '')) { |
507 | $this->body = $elems->item(0); | 507 | $this->body = $elems->item(0); |
508 | // prune (clean up elements that may not be content) | 508 | // prune (clean up elements that may not be content) |
509 | if ($this->config->prune()) { | 509 | if ($this->config->prune()) { |
510 | $this->debug('Pruning content'); | 510 | $this->debug('Pruning content'); |
511 | $this->readability->prepArticle($this->body); | 511 | $this->readability->prepArticle($this->body); |
512 | } | 512 | } |
513 | $detect_body = false; | 513 | $detect_body = false; |
514 | } else { | 514 | } else { |
515 | $this->debug('hNews: skipping entry-content - appears not to contain content'); | 515 | $this->debug('hNews: skipping entry-content - appears not to contain content'); |
516 | } | 516 | } |
517 | unset($e); | 517 | unset($e); |
518 | } else { | 518 | } else { |
519 | $this->body = $this->readability->dom->createElement('div'); | 519 | $this->body = $this->readability->dom->createElement('div'); |
520 | $this->debug($elems->length.' entry-content elems found'); | 520 | $this->debug($elems->length.' entry-content elems found'); |
521 | foreach ($elems as $elem) { | 521 | foreach ($elems as $elem) { |
522 | if (!isset($elem->parentNode)) continue; | 522 | if (!isset($elem->parentNode)) continue; |
523 | $isDescendant = false; | 523 | $isDescendant = false; |
524 | foreach ($this->body->childNodes as $parent) { | 524 | foreach ($this->body->childNodes as $parent) { |
525 | if ($this->isDescendant($parent, $elem)) { | 525 | if ($this->isDescendant($parent, $elem)) { |
526 | $isDescendant = true; | 526 | $isDescendant = true; |
527 | break; | 527 | break; |
528 | } | 528 | } |
529 | } | 529 | } |
530 | if ($isDescendant) { | 530 | if ($isDescendant) { |
531 | $this->debug('Element is child of another body element, skipping.'); | 531 | $this->debug('Element is child of another body element, skipping.'); |
532 | } else { | 532 | } else { |
533 | // prune (clean up elements that may not be content) | 533 | // prune (clean up elements that may not be content) |
534 | if ($this->config->prune()) { | 534 | if ($this->config->prune()) { |
535 | $this->debug('Pruning content'); | 535 | $this->debug('Pruning content'); |
536 | $this->readability->prepArticle($elem); | 536 | $this->readability->prepArticle($elem); |
537 | } | 537 | } |
538 | $this->debug('Element added to body'); | 538 | $this->debug('Element added to body'); |
539 | $this->body->appendChild($elem); | 539 | $this->body->appendChild($elem); |
540 | } | 540 | } |
541 | } | 541 | } |
542 | $detect_body = false; | 542 | $detect_body = false; |
543 | } | 543 | } |
544 | } | 544 | } |
545 | } | 545 | } |
546 | } | 546 | } |
547 | } | 547 | } |
548 | 548 | ||
549 | // check for elements marked with instapaper_title | 549 | // check for elements marked with instapaper_title |
550 | if ($detect_title) { | 550 | if ($detect_title) { |
551 | // check for instapaper_title | 551 | // check for instapaper_title |
552 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); | 552 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); |
553 | if ($elems && $elems->length > 0) { | 553 | if ($elems && $elems->length > 0) { |
554 | $this->title = $elems->item(0)->textContent; | 554 | $this->title = $elems->item(0)->textContent; |
555 | $this->debug('Title found (.instapaper_title): '.$this->title); | 555 | $this->debug('Title found (.instapaper_title): '.$this->title); |
556 | // remove title from document | 556 | // remove title from document |
557 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 557 | $elems->item(0)->parentNode->removeChild($elems->item(0)); |
558 | $detect_title = false; | 558 | $detect_title = false; |
559 | } | 559 | } |
560 | } | 560 | } |
561 | // check for elements marked with instapaper_body | 561 | // check for elements marked with instapaper_body |
562 | if ($detect_body) { | 562 | if ($detect_body) { |
563 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); | 563 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); |
564 | if ($elems && $elems->length > 0) { | 564 | if ($elems && $elems->length > 0) { |
565 | $this->debug('body found (.instapaper_body)'); | 565 | $this->debug('body found (.instapaper_body)'); |
566 | $this->body = $elems->item(0); | 566 | $this->body = $elems->item(0); |
567 | // prune (clean up elements that may not be content) | 567 | // prune (clean up elements that may not be content) |
568 | if ($this->config->prune()) { | 568 | if ($this->config->prune()) { |
569 | $this->debug('Pruning content'); | 569 | $this->debug('Pruning content'); |
570 | $this->readability->prepArticle($this->body); | 570 | $this->readability->prepArticle($this->body); |
571 | } | 571 | } |
572 | $detect_body = false; | 572 | $detect_body = false; |
573 | } | 573 | } |
574 | } | 574 | } |
575 | 575 | ||
576 | // Find author in rel="author" marked element | 576 | // Find author in rel="author" marked element |
577 | // We only use this if there's exactly one. | 577 | // We only use this if there's exactly one. |
578 | // If there's more than one, it could indicate more than | 578 | // If there's more than one, it could indicate more than |
579 | // one author, but it could also indicate that we're processing | 579 | // one author, but it could also indicate that we're processing |
580 | // a page listing different articles with different authors. | 580 | // a page listing different articles with different authors. |
581 | if ($detect_author) { | 581 | if ($detect_author) { |
582 | $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); | 582 | $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); |
583 | if ($elems && $elems->length == 1) { | 583 | if ($elems && $elems->length == 1) { |
584 | $author = trim($elems->item(0)->textContent); | 584 | $author = trim($elems->item(0)->textContent); |
585 | if ($author != '') { | 585 | if ($author != '') { |
586 | $this->debug("Author found (rel=\"author\"): $author"); | 586 | $this->debug("Author found (rel=\"author\"): $author"); |
587 | $this->author[] = $author; | 587 | $this->author[] = $author; |
588 | $detect_author = false; | 588 | $detect_author = false; |
589 | } | 589 | } |
590 | } | 590 | } |
591 | } | 591 | } |
592 | 592 | ||
593 | // Find date in pubdate marked time element | 593 | // Find date in pubdate marked time element |
594 | // For the same reason given above, we only use this | 594 | // For the same reason given above, we only use this |
595 | // if there's exactly one element. | 595 | // if there's exactly one element. |
596 | if ($detect_date) { | 596 | if ($detect_date) { |
597 | $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); | 597 | $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); |
598 | if ($elems && $elems->length == 1) { | 598 | if ($elems && $elems->length == 1) { |
599 | $this->date = strtotime(trim($elems->item(0)->textContent)); | 599 | $this->date = strtotime(trim($elems->item(0)->textContent)); |
600 | // remove date from document | 600 | // remove date from document |
601 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); | 601 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); |
602 | if ($this->date) { | 602 | if ($this->date) { |
603 | $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); | 603 | $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); |
604 | $detect_date = false; | 604 | $detect_date = false; |
605 | } else { | 605 | } else { |
606 | $this->date = null; | 606 | $this->date = null; |
607 | } | 607 | } |
608 | } | 608 | } |
609 | } | 609 | } |
610 | 610 | ||
611 | // still missing title or body, so we detect using Readability | 611 | // still missing title or body, so we detect using Readability |
612 | if ($detect_title || $detect_body) { | 612 | if ($detect_title || $detect_body) { |
613 | $this->debug('Using Readability'); | 613 | $this->debug('Using Readability'); |
614 | // clone body if we're only using Readability for title (otherwise it may interfere with body element) | 614 | // clone body if we're only using Readability for title (otherwise it may interfere with body element) |
615 | if (isset($this->body)) $this->body = $this->body->cloneNode(true); | 615 | if (isset($this->body)) $this->body = $this->body->cloneNode(true); |
616 | $success = $this->readability->init(); | 616 | $success = $this->readability->init(); |
617 | } | 617 | } |
618 | if ($detect_title) { | 618 | if ($detect_title) { |
619 | $this->debug('Detecting title'); | 619 | $this->debug('Detecting title'); |
620 | $this->title = $this->readability->getTitle()->textContent; | 620 | $this->title = $this->readability->getTitle()->textContent; |
621 | } | 621 | } |
622 | if ($detect_body && $success) { | 622 | if ($detect_body && $success) { |
623 | $this->debug('Detecting body'); | 623 | $this->debug('Detecting body'); |
624 | $this->body = $this->readability->getContent(); | 624 | $this->body = $this->readability->getContent(); |
625 | if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { | 625 | if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { |
626 | $this->body = $this->body->firstChild; | 626 | $this->body = $this->body->firstChild; |
627 | } | 627 | } |
628 | // prune (clean up elements that may not be content) | 628 | // prune (clean up elements that may not be content) |
629 | if ($this->config->prune()) { | 629 | if ($this->config->prune()) { |
630 | $this->debug('Pruning content'); | 630 | $this->debug('Pruning content'); |
631 | $this->readability->prepArticle($this->body); | 631 | $this->readability->prepArticle($this->body); |
632 | } | 632 | } |
633 | } | 633 | } |
634 | if (isset($this->body)) { | 634 | if (isset($this->body)) { |
635 | // remove scripts | 635 | // remove scripts |
636 | $this->readability->removeScripts($this->body); | 636 | $this->readability->removeScripts($this->body); |
637 | // remove any h1-h6 elements that appear as first thing in the body | 637 | // remove any h1-h6 elements that appear as first thing in the body |
638 | // and which match our title | 638 | // and which match our title |
639 | if (isset($this->title) && ($this->title != '')) { | 639 | if (isset($this->title) && ($this->title != '')) { |
640 | $firstChild = $this->body->firstChild; | 640 | $firstChild = $this->body->firstChild; |
641 | while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { | 641 | while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { |
642 | $firstChild = $firstChild->nextSibling; | 642 | $firstChild = $firstChild->nextSibling; |
643 | } | 643 | } |
644 | if (($firstChild->nodeType === XML_ELEMENT_NODE) | 644 | if (($firstChild->nodeType === XML_ELEMENT_NODE) |
645 | && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) | 645 | && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) |
646 | && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { | 646 | && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { |
647 | $this->body->removeChild($firstChild); | 647 | $this->body->removeChild($firstChild); |
648 | } | 648 | } |
649 | } | 649 | } |
650 | // prevent self-closing iframes | 650 | // prevent self-closing iframes |
651 | $elems = $this->body->getElementsByTagName('iframe'); | 651 | $elems = $this->body->getElementsByTagName('iframe'); |
652 | for ($i = $elems->length-1; $i >= 0; $i--) { | 652 | for ($i = $elems->length-1; $i >= 0; $i--) { |
653 | $e = $elems->item($i); | 653 | $e = $elems->item($i); |
654 | if (!$e->hasChildNodes()) { | 654 | if (!$e->hasChildNodes()) { |
655 | $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); | 655 | $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); |
656 | } | 656 | } |
657 | } | 657 | } |
658 | // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ | 658 | // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ |
659 | // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src | 659 | // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src |
660 | // inside the data-lazy-src attribute. It also places the original image inside a noscript element | 660 | // inside the data-lazy-src attribute. It also places the original image inside a noscript element |
661 | // next to the amended one. | 661 | // next to the amended one. |
662 | $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); | 662 | $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); |
663 | for ($i = $elems->length-1; $i >= 0; $i--) { | 663 | for ($i = $elems->length-1; $i >= 0; $i--) { |
664 | $e = $elems->item($i); | 664 | $e = $elems->item($i); |
665 | // let's see if we can grab image from noscript | 665 | // let's see if we can grab image from noscript |
666 | if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { | 666 | if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { |
667 | $_new_elem = $e->ownerDocument->createDocumentFragment(); | 667 | $_new_elem = $e->ownerDocument->createDocumentFragment(); |
668 | @$_new_elem->appendXML($e->nextSibling->innerHTML); | 668 | @$_new_elem->appendXML($e->nextSibling->innerHTML); |
669 | $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); | 669 | $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); |
670 | $e->parentNode->removeChild($e); | 670 | $e->parentNode->removeChild($e); |
671 | } else { | 671 | } else { |
672 | // Use data-lazy-src as src value | 672 | // Use data-lazy-src as src value |
673 | $e->setAttribute('src', $e->getAttribute('data-lazy-src')); | 673 | $e->setAttribute('src', $e->getAttribute('data-lazy-src')); |
674 | $e->removeAttribute('data-lazy-src'); | 674 | $e->removeAttribute('data-lazy-src'); |
675 | } | 675 | } |
676 | } | 676 | } |
677 | 677 | ||
678 | $this->success = true; | 678 | $this->success = true; |
679 | } | 679 | } |
680 | 680 | ||
681 | // if we've had no success and we've used tidy, there's a chance | 681 | // if we've had no success and we've used tidy, there's a chance |
682 | // that tidy has messed up. So let's try again without tidy... | 682 | // that tidy has messed up. So let's try again without tidy... |
683 | if (!$this->success && $tidied && $smart_tidy) { | 683 | if (!$this->success && $tidied && $smart_tidy) { |
684 | $this->debug('Trying again without tidy'); | 684 | $this->debug('Trying again without tidy'); |
685 | $this->process($original_html, $url, false); | 685 | $this->process($original_html, $url, false); |
686 | } | 686 | } |
687 | 687 | ||
688 | return $this->success; | 688 | return $this->success; |
689 | } | 689 | } |
690 | 690 | ||
691 | private function isDescendant(DOMElement $parent, DOMElement $child) { | 691 | private function isDescendant(DOMElement $parent, DOMElement $child) { |
692 | $node = $child->parentNode; | 692 | $node = $child->parentNode; |
693 | while ($node != null) { | 693 | while ($node != null) { |
694 | if ($node->isSameNode($parent)) return true; | 694 | if ($node->isSameNode($parent)) return true; |
695 | $node = $node->parentNode; | 695 | $node = $node->parentNode; |
696 | } | 696 | } |
697 | return false; | 697 | return false; |
698 | } | 698 | } |
699 | 699 | ||
700 | public function getContent() { | 700 | public function getContent() { |
701 | return $this->body; | 701 | return $this->body; |
702 | } | 702 | } |
703 | 703 | ||
704 | public function getTitle() { | 704 | public function getTitle() { |
705 | return $this->title; | 705 | return $this->title; |
706 | } | 706 | } |
707 | 707 | ||
708 | public function getAuthors() { | 708 | public function getAuthors() { |
709 | return $this->author; | 709 | return $this->author; |
710 | } | 710 | } |
711 | 711 | ||
712 | public function getLanguage() { | 712 | public function getLanguage() { |
713 | return $this->language; | 713 | return $this->language; |
714 | } | 714 | } |
715 | 715 | ||
716 | public function getDate() { | 716 | public function getDate() { |
717 | return $this->date; | 717 | return $this->date; |
718 | } | 718 | } |
719 | 719 | ||
720 | public function getSiteConfig() { | 720 | public function getSiteConfig() { |
721 | return $this->config; | 721 | return $this->config; |
722 | } | 722 | } |
723 | 723 | ||
724 | public function getNextPageUrl() { | 724 | public function getNextPageUrl() { |
725 | return $this->nextPageUrl; | 725 | return $this->nextPageUrl; |
726 | } | 726 | } |
727 | } | 727 | } \ No newline at end of file |
728 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php index c5e300d7..1f6a7603 100644 --- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php +++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php | |||
@@ -1,338 +1,343 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Site Config | 3 | * Site Config |
4 | * | 4 | * |
5 | * Each instance of this class should hold extraction patterns and other directives | 5 | * Each instance of this class should hold extraction patterns and other directives |
6 | * for a website. See ContentExtractor class to see how it's used. | 6 | * for a website. See ContentExtractor class to see how it's used. |
7 | * | 7 | * |
8 | * @version 0.7 | 8 | * @version 0.8 |
9 | * @date 2012-08-27 | 9 | * @date 2013-04-16 |
10 | * @author Keyvan Minoukadeh | 10 | * @author Keyvan Minoukadeh |
11 | * @copyright 2012 Keyvan Minoukadeh | 11 | * @copyright 2013 Keyvan Minoukadeh |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
13 | */ | 13 | */ |
14 | 14 | ||
15 | class SiteConfig | 15 | class SiteConfig |
16 | { | 16 | { |
17 | // Use first matching element as title (0 or more xpath expressions) | 17 | // Use first matching element as title (0 or more xpath expressions) |
18 | public $title = array(); | 18 | public $title = array(); |
19 | 19 | ||
20 | // Use first matching element as body (0 or more xpath expressions) | 20 | // Use first matching element as body (0 or more xpath expressions) |
21 | public $body = array(); | 21 | public $body = array(); |
22 | 22 | ||
23 | // Use first matching element as author (0 or more xpath expressions) | 23 | // Use first matching element as author (0 or more xpath expressions) |
24 | public $author = array(); | 24 | public $author = array(); |
25 | 25 | ||
26 | // Use first matching element as date (0 or more xpath expressions) | 26 | // Use first matching element as date (0 or more xpath expressions) |
27 | public $date = array(); | 27 | public $date = array(); |
28 | 28 | ||
29 | // Strip elements matching these xpath expressions (0 or more) | 29 | // Strip elements matching these xpath expressions (0 or more) |
30 | public $strip = array(); | 30 | public $strip = array(); |
31 | 31 | ||
32 | // Strip elements which contain these strings (0 or more) in the id or class attribute | 32 | // Strip elements which contain these strings (0 or more) in the id or class attribute |
33 | public $strip_id_or_class = array(); | 33 | public $strip_id_or_class = array(); |
34 | 34 | ||
35 | // Strip images which contain these strings (0 or more) in the src attribute | 35 | // Strip images which contain these strings (0 or more) in the src attribute |
36 | public $strip_image_src = array(); | 36 | public $strip_image_src = array(); |
37 | 37 | ||
38 | // Additional HTTP headers to send | 38 | // Additional HTTP headers to send |
39 | // NOT YET USED | 39 | // NOT YET USED |
40 | public $http_header = array(); | 40 | public $http_header = array(); |
41 | 41 | ||
42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) | 42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) |
43 | public $tidy = null; | 43 | public $tidy = null; |
44 | 44 | ||
45 | protected $default_tidy = true; // used if undeclared | 45 | protected $default_tidy = true; // used if undeclared |
46 | 46 | ||
47 | // Autodetect title/body if xpath expressions fail to produce results. | 47 | // Autodetect title/body if xpath expressions fail to produce results. |
48 | // Note that this applies to title and body separately, ie. | 48 | // Note that this applies to title and body separately, ie. |
49 | // * if we get a body match but no title match, this option will determine whether we autodetect title | 49 | // * if we get a body match but no title match, this option will determine whether we autodetect title |
50 | // * if neither match, this determines whether we autodetect title and body. | 50 | // * if neither match, this determines whether we autodetect title and body. |
51 | // Also note that this only applies when there is at least one xpath expression in title or body, ie. | 51 | // Also note that this only applies when there is at least one xpath expression in title or body, ie. |
52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) | 52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) |
53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. | 53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. |
54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). | 54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). |
55 | // bool or null if undeclared | 55 | // bool or null if undeclared |
56 | public $autodetect_on_failure = null; | 56 | public $autodetect_on_failure = null; |
57 | protected $default_autodetect_on_failure = true; // used if undeclared | 57 | protected $default_autodetect_on_failure = true; // used if undeclared |
58 | 58 | ||
59 | // Clean up content block - attempt to remove elements that appear to be superfluous | 59 | // Clean up content block - attempt to remove elements that appear to be superfluous |
60 | // bool or null if undeclared | 60 | // bool or null if undeclared |
61 | public $prune = null; | 61 | public $prune = null; |
62 | protected $default_prune = true; // used if undeclared | 62 | protected $default_prune = true; // used if undeclared |
63 | 63 | ||
64 | // Test URL - if present, can be used to test the config above | 64 | // Test URL - if present, can be used to test the config above |
65 | public $test_url = array(); | 65 | public $test_url = array(); |
66 | 66 | ||
67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article | 67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article |
68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to | 68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to |
69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page | 69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page |
70 | // which displays the entire article on one page (e.g. 'print view'). | 70 | // which displays the entire article on one page (e.g. 'print view'). |
71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, | 71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, |
72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. | 72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. |
73 | public $single_page_link = array(); | 73 | public $single_page_link = array(); |
74 | 74 | ||
75 | public $next_page_link = array(); | 75 | public $next_page_link = array(); |
76 | 76 | ||
77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed | 77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed |
78 | public $single_page_link_in_feed = array(); | 78 | public $single_page_link_in_feed = array(); |
79 | 79 | ||
80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') | 80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') |
81 | // string or null if undeclared | 81 | // string or null if undeclared |
82 | public $parser = null; | 82 | public $parser = null; |
83 | protected $default_parser = 'libxml'; // used if undeclared | 83 | protected $default_parser = 'libxml'; // used if undeclared |
84 | 84 | ||
85 | // Strings to search for in HTML before processing begins (used with $replace_string) | 85 | // Strings to search for in HTML before processing begins (used with $replace_string) |
86 | public $find_string = array(); | 86 | public $find_string = array(); |
87 | // Strings to replace those found in $find_string before HTML processing begins | 87 | // Strings to replace those found in $find_string before HTML processing begins |
88 | public $replace_string = array(); | 88 | public $replace_string = array(); |
89 | 89 | ||
90 | // the options below cannot be set in the config files which this class represents | 90 | // the options below cannot be set in the config files which this class represents |
91 | 91 | ||
92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not | 92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not |
93 | public $cache_key = null; | 93 | public $cache_key = null; |
94 | public static $debug = false; | 94 | public static $debug = false; |
95 | protected static $apc = false; | 95 | protected static $apc = false; |
96 | protected static $config_path; | 96 | protected static $config_path; |
97 | protected static $config_path_fallback; | 97 | protected static $config_path_fallback; |
98 | protected static $config_cache = array(); | 98 | protected static $config_cache = array(); |
99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; | 99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; |
100 | 100 | ||
101 | protected static function debug($msg) { | 101 | protected static function debug($msg) { |
102 | if (self::$debug) { | 102 | if (self::$debug) { |
103 | //$mem = round(memory_get_usage()/1024, 2); | 103 | //$mem = round(memory_get_usage()/1024, 2); |
104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); | 104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); |
105 | echo '* ',$msg; | 105 | echo '* ',$msg; |
106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; | 106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; |
107 | echo "\n"; | 107 | echo "\n"; |
108 | ob_flush(); | 108 | ob_flush(); |
109 | flush(); | 109 | flush(); |
110 | } | 110 | } |
111 | } | 111 | } |
112 | 112 | ||
113 | // enable APC caching of certain site config files? | 113 | // enable APC caching of certain site config files? |
114 | // If enabled the following site config files will be | 114 | // If enabled the following site config files will be |
115 | // cached in APC cache (when requested for first time): | 115 | // cached in APC cache (when requested for first time): |
116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ | 116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ |
117 | // * the site config files associated with HTML fingerprints | 117 | // * the site config files associated with HTML fingerprints |
118 | // * the global site config file | 118 | // * the global site config file |
119 | // returns true if enabled, false otherwise | 119 | // returns true if enabled, false otherwise |
120 | public static function use_apc($apc=true) { | 120 | public static function use_apc($apc=true) { |
121 | if (!function_exists('apc_add')) { | 121 | if (!function_exists('apc_add')) { |
122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); | 122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); |
123 | return false; | 123 | return false; |
124 | } | 124 | } |
125 | self::$apc = $apc; | 125 | self::$apc = $apc; |
126 | return $apc; | 126 | return $apc; |
127 | } | 127 | } |
128 | 128 | ||
129 | // return bool or null | 129 | // return bool or null |
130 | public function tidy($use_default=true) { | 130 | public function tidy($use_default=true) { |
131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; | 131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; |
132 | return $this->tidy; | 132 | return $this->tidy; |
133 | } | 133 | } |
134 | 134 | ||
135 | // return bool or null | 135 | // return bool or null |
136 | public function prune($use_default=true) { | 136 | public function prune($use_default=true) { |
137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; | 137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; |
138 | return $this->prune; | 138 | return $this->prune; |
139 | } | 139 | } |
140 | 140 | ||
141 | // return string or null | 141 | // return string or null |
142 | public function parser($use_default=true) { | 142 | public function parser($use_default=true) { |
143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; | 143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; |
144 | return $this->parser; | 144 | return $this->parser; |
145 | } | 145 | } |
146 | 146 | ||
147 | // return bool or null | 147 | // return bool or null |
148 | public function autodetect_on_failure($use_default=true) { | 148 | public function autodetect_on_failure($use_default=true) { |
149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; | 149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; |
150 | return $this->autodetect_on_failure; | 150 | return $this->autodetect_on_failure; |
151 | } | 151 | } |
152 | 152 | ||
153 | public static function set_config_path($path, $fallback=null) { | 153 | public static function set_config_path($path, $fallback=null) { |
154 | self::$config_path = $path; | 154 | self::$config_path = $path; |
155 | self::$config_path_fallback = $fallback; | 155 | self::$config_path_fallback = $fallback; |
156 | } | 156 | } |
157 | 157 | ||
158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { | 158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { |
159 | $key = strtolower($key); | 159 | $key = strtolower($key); |
160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
161 | if ($config->cache_key) $key = $config->cache_key; | 161 | if ($config->cache_key) $key = $config->cache_key; |
162 | self::$config_cache[$key] = $config; | 162 | self::$config_cache[$key] = $config; |
163 | if (self::$apc && $use_apc) { | 163 | if (self::$apc && $use_apc) { |
164 | self::debug("Adding site config to APC cache with key sc.$key"); | 164 | self::debug("Adding site config to APC cache with key sc.$key"); |
165 | apc_add("sc.$key", $config); | 165 | apc_add("sc.$key", $config); |
166 | } | 166 | } |
167 | self::debug("Cached site config with key $key"); | 167 | self::debug("Cached site config with key $key"); |
168 | } | 168 | } |
169 | 169 | ||
170 | public static function is_cached($key) { | 170 | public static function is_cached($key) { |
171 | $key = strtolower($key); | 171 | $key = strtolower($key); |
172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
173 | if (array_key_exists($key, self::$config_cache)) { | 173 | if (array_key_exists($key, self::$config_cache)) { |
174 | return true; | 174 | return true; |
175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { | 175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { |
176 | return true; | 176 | return true; |
177 | } | 177 | } |
178 | return false; | 178 | return false; |
179 | } | 179 | } |
180 | 180 | ||
181 | public function append(SiteConfig $newconfig) { | 181 | public function append(SiteConfig $newconfig) { |
182 | // check for commands where we accept multiple statements (no test_url) | 182 | // check for commands where we accept multiple statements (no test_url) |
183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { | 183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { |
184 | // append array elements for this config variable from $newconfig to this config | 184 | // append array elements for this config variable from $newconfig to this config |
185 | //$this->$var = $this->$var + $newconfig->$var; | 185 | //$this->$var = $this->$var + $newconfig->$var; |
186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); | 186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); |
187 | } | 187 | } |
188 | // check for single statement commands | 188 | // check for single statement commands |
189 | // we do not overwrite existing non null values | 189 | // we do not overwrite existing non null values |
190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { | 190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { |
191 | if ($this->$var === null) $this->$var = $newconfig->$var; | 191 | if ($this->$var === null) $this->$var = $newconfig->$var; |
192 | } | 192 | } |
193 | } | 193 | // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) |
194 | 194 | foreach (array('find_string', 'replace_string') as $var) { | |
195 | // returns SiteConfig instance if an appropriate one is found, false otherwise | 195 | // append array elements for this config variable from $newconfig to this config |
196 | // if $exact_host_match is true, we will not look for wildcard config matches | 196 | //$this->$var = $this->$var + $newconfig->$var; |
197 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists | 197 | $this->$var = array_merge($this->$var, $newconfig->$var); |
198 | public static function build($host, $exact_host_match=false) { | 198 | } |
199 | $host = strtolower($host); | 199 | } |
200 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); | 200 | |
201 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; | 201 | // returns SiteConfig instance if an appropriate one is found, false otherwise |
202 | // check for site configuration | 202 | // if $exact_host_match is true, we will not look for wildcard config matches |
203 | $try = array($host); | 203 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists |
204 | // should we look for wildcard matches | 204 | public static function build($host, $exact_host_match=false) { |
205 | if (!$exact_host_match) { | 205 | $host = strtolower($host); |
206 | $split = explode('.', $host); | 206 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); |
207 | if (count($split) > 1) { | 207 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; |
208 | array_shift($split); | 208 | // check for site configuration |
209 | $try[] = '.'.implode('.', $split); | 209 | $try = array($host); |
210 | } | 210 | // should we look for wildcard matches |
211 | } | 211 | if (!$exact_host_match) { |
212 | 212 | $split = explode('.', $host); | |
213 | // look for site config file in primary folder | 213 | if (count($split) > 1) { |
214 | self::debug(". looking for site config for $host in primary folder"); | 214 | array_shift($split); |
215 | foreach ($try as $h) { | 215 | $try[] = '.'.implode('.', $split); |
216 | if (array_key_exists($h, self::$config_cache)) { | 216 | } |
217 | self::debug("... site config for $h already loaded in this request"); | 217 | } |
218 | return self::$config_cache[$h]; | 218 | |
219 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { | 219 | // look for site config file in primary folder |
220 | self::debug("... site config for $h in APC cache"); | 220 | self::debug(". looking for site config for $host in primary folder"); |
221 | return $sconfig; | 221 | foreach ($try as $h) { |
222 | } elseif (file_exists(self::$config_path."/$h.txt")) { | 222 | if (array_key_exists($h, self::$config_cache)) { |
223 | self::debug("... found site config ($h.txt)"); | 223 | self::debug("... site config for $h already loaded in this request"); |
224 | $file_primary = self::$config_path."/$h.txt"; | 224 | return self::$config_cache[$h]; |
225 | $matched_name = $h; | 225 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { |
226 | break; | 226 | self::debug("... site config for $h in APC cache"); |
227 | } | 227 | return $sconfig; |
228 | } | 228 | } elseif (file_exists(self::$config_path."/$h.txt")) { |
229 | 229 | self::debug("... found site config ($h.txt)"); | |
230 | // if we found site config, process it | 230 | $file_primary = self::$config_path."/$h.txt"; |
231 | if (isset($file_primary)) { | 231 | $matched_name = $h; |
232 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 232 | break; |
233 | if (!$config_lines || !is_array($config_lines)) return false; | 233 | } |
234 | $config = self::build_from_array($config_lines); | 234 | } |
235 | // if APC caching is available and enabled, mark this for cache | 235 | |
236 | //$config->cache_in_apc = true; | 236 | // if we found site config, process it |
237 | $config->cache_key = $matched_name; | 237 | if (isset($file_primary)) { |
238 | 238 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | |
239 | // if autodetec on failure is off (on by default) we do not need to look | 239 | if (!$config_lines || !is_array($config_lines)) return false; |
240 | // in secondary folder | 240 | $config = self::build_from_array($config_lines); |
241 | if (!$config->autodetect_on_failure()) { | 241 | // if APC caching is available and enabled, mark this for cache |
242 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); | 242 | //$config->cache_in_apc = true; |
243 | return $config; | 243 | $config->cache_key = $matched_name; |
244 | } | 244 | |
245 | } | 245 | // if autodetec on failure is off (on by default) we do not need to look |
246 | 246 | // in secondary folder | |
247 | // look for site config file in secondary folder | 247 | if (!$config->autodetect_on_failure()) { |
248 | if (isset(self::$config_path_fallback)) { | 248 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); |
249 | self::debug(". looking for site config for $host in secondary folder"); | 249 | return $config; |
250 | foreach ($try as $h) { | 250 | } |
251 | if (file_exists(self::$config_path_fallback."/$h.txt")) { | 251 | } |
252 | self::debug("... found site config in secondary folder ($h.txt)"); | 252 | |
253 | $file_secondary = self::$config_path_fallback."/$h.txt"; | 253 | // look for site config file in secondary folder |
254 | $matched_name = $h; | 254 | if (isset(self::$config_path_fallback)) { |
255 | break; | 255 | self::debug(". looking for site config for $host in secondary folder"); |
256 | } | 256 | foreach ($try as $h) { |
257 | } | 257 | if (file_exists(self::$config_path_fallback."/$h.txt")) { |
258 | if (!isset($file_secondary)) { | 258 | self::debug("... found site config in secondary folder ($h.txt)"); |
259 | self::debug("... no site config match in secondary folder"); | 259 | $file_secondary = self::$config_path_fallback."/$h.txt"; |
260 | } | 260 | $matched_name = $h; |
261 | } | 261 | break; |
262 | 262 | } | |
263 | // return false if no config file found | 263 | } |
264 | if (!isset($file_primary) && !isset($file_secondary)) { | 264 | if (!isset($file_secondary)) { |
265 | self::debug("... no site config match for $host"); | 265 | self::debug("... no site config match in secondary folder"); |
266 | return false; | 266 | } |
267 | } | 267 | } |
268 | 268 | ||
269 | // return primary config if secondary not found | 269 | // return false if no config file found |
270 | if (!isset($file_secondary) && isset($config)) { | 270 | if (!isset($file_primary) && !isset($file_secondary)) { |
271 | return $config; | 271 | self::debug("... no site config match for $host"); |
272 | } | 272 | return false; |
273 | 273 | } | |
274 | // process secondary config file | 274 | |
275 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 275 | // return primary config if secondary not found |
276 | if (!$config_lines || !is_array($config_lines)) { | 276 | if (!isset($file_secondary) && isset($config)) { |
277 | // failed to process secondary | 277 | return $config; |
278 | if (isset($config)) { | 278 | } |
279 | // return primary config | 279 | |
280 | return $config; | 280 | // process secondary config file |
281 | } else { | 281 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
282 | return false; | 282 | if (!$config_lines || !is_array($config_lines)) { |
283 | } | 283 | // failed to process secondary |
284 | } | 284 | if (isset($config)) { |
285 | 285 | // return primary config | |
286 | // merge with primary and return | 286 | return $config; |
287 | if (isset($config)) { | 287 | } else { |
288 | self::debug('. merging config files'); | 288 | return false; |
289 | $config->append(self::build_from_array($config_lines)); | 289 | } |
290 | return $config; | 290 | } |
291 | } else { | 291 | |
292 | // return just secondary | 292 | // merge with primary and return |
293 | $config = self::build_from_array($config_lines); | 293 | if (isset($config)) { |
294 | // if APC caching is available and enabled, mark this for cache | 294 | self::debug('. merging config files'); |
295 | //$config->cache_in_apc = true; | 295 | $config->append(self::build_from_array($config_lines)); |
296 | $config->cache_key = $matched_name; | 296 | return $config; |
297 | return $config; | 297 | } else { |
298 | } | 298 | // return just secondary |
299 | } | 299 | $config = self::build_from_array($config_lines); |
300 | 300 | // if APC caching is available and enabled, mark this for cache | |
301 | public static function build_from_array(array $lines) { | 301 | //$config->cache_in_apc = true; |
302 | $config = new SiteConfig(); | 302 | $config->cache_key = $matched_name; |
303 | foreach ($lines as $line) { | 303 | return $config; |
304 | $line = trim($line); | 304 | } |
305 | 305 | } | |
306 | // skip comments, empty lines | 306 | |
307 | if ($line == '' || $line[0] == '#') continue; | 307 | public static function build_from_array(array $lines) { |
308 | 308 | $config = new SiteConfig(); | |
309 | // get command | 309 | foreach ($lines as $line) { |
310 | $command = explode(':', $line, 2); | 310 | $line = trim($line); |
311 | // if there's no colon ':', skip this line | 311 | |
312 | if (count($command) != 2) continue; | 312 | // skip comments, empty lines |
313 | $val = trim($command[1]); | 313 | if ($line == '' || $line[0] == '#') continue; |
314 | $command = trim($command[0]); | 314 | |
315 | if ($command == '' || $val == '') continue; | 315 | // get command |
316 | 316 | $command = explode(':', $line, 2); | |
317 | // check for commands where we accept multiple statements | 317 | // if there's no colon ':', skip this line |
318 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { | 318 | if (count($command) != 2) continue; |
319 | array_push($config->$command, $val); | 319 | $val = trim($command[1]); |
320 | // check for single statement commands that evaluate to true or false | 320 | $command = trim($command[0]); |
321 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { | 321 | if ($command == '' || $val == '') continue; |
322 | $config->$command = ($val == 'yes'); | 322 | |
323 | // check for single statement commands stored as strings | 323 | // check for commands where we accept multiple statements |
324 | } elseif (in_array($command, array('parser'))) { | 324 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { |
325 | $config->$command = $val; | 325 | array_push($config->$command, $val); |
326 | // check for replace_string(find): replace | 326 | // check for single statement commands that evaluate to true or false |
327 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { | 327 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { |
328 | if (in_array($match[1], array('replace_string'))) { | 328 | $config->$command = ($val == 'yes'); |
329 | $command = $match[1]; | 329 | // check for single statement commands stored as strings |
330 | array_push($config->find_string, $match[2]); | 330 | } elseif (in_array($command, array('parser'))) { |
331 | array_push($config->$command, $val); | 331 | $config->$command = $val; |
332 | } | 332 | // check for replace_string(find): replace |
333 | } | 333 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { |
334 | } | 334 | if (in_array($match[1], array('replace_string'))) { |
335 | return $config; | 335 | $command = $match[1]; |
336 | } | 336 | array_push($config->find_string, $match[2]); |
337 | } | 337 | array_push($config->$command, $val); |
338 | ?> \ No newline at end of file | 338 | } |
339 | } | ||
340 | } | ||
341 | return $config; | ||
342 | } | ||
343 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php index 54a56f22..40786598 100644..100755 --- a/inc/3rdparty/libraries/feedwriter/FeedItem.php +++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php | |||
@@ -1,7 +1,7 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Univarsel Feed Writer | 3 | * Univarsel Feed Writer |
4 | * | 4 | * |
5 | * FeedItem class - Used as feed element in FeedWriter class | 5 | * FeedItem class - Used as feed element in FeedWriter class |
6 | * | 6 | * |
7 | * @package UnivarselFeedWriter | 7 | * @package UnivarselFeedWriter |
@@ -12,20 +12,20 @@ | |||
12 | { | 12 | { |
13 | private $elements = array(); //Collection of feed elements | 13 | private $elements = array(); //Collection of feed elements |
14 | private $version; | 14 | private $version; |
15 | 15 | ||
16 | /** | 16 | /** |
17 | * Constructor | 17 | * Constructor |
18 | * | 18 | * |
19 | * @param contant (RSS1/RSS2/ATOM) RSS2 is default. | 19 | * @param contant (RSS1/RSS2/ATOM) RSS2 is default. |
20 | */ | 20 | */ |
21 | function __construct($version = RSS2) | 21 | function __construct($version = RSS2) |
22 | { | 22 | { |
23 | $this->version = $version; | 23 | $this->version = $version; |
24 | } | 24 | } |
25 | 25 | ||
26 | /** | 26 | /** |
27 | * Set element (overwrites existing elements with $elementName) | 27 | * Set element (overwrites existing elements with $elementName) |
28 | * | 28 | * |
29 | * @access public | 29 | * @access public |
30 | * @param srting The tag name of an element | 30 | * @param srting The tag name of an element |
31 | * @param srting The content of tag | 31 | * @param srting The content of tag |
@@ -38,11 +38,11 @@ | |||
38 | unset($this->elements[$elementName]); | 38 | unset($this->elements[$elementName]); |
39 | } | 39 | } |
40 | $this->addElement($elementName, $content, $attributes); | 40 | $this->addElement($elementName, $content, $attributes); |
41 | } | 41 | } |
42 | 42 | ||
43 | /** | 43 | /** |
44 | * Add an element to elements array | 44 | * Add an element to elements array |
45 | * | 45 | * |
46 | * @access public | 46 | * @access public |
47 | * @param srting The tag name of an element | 47 | * @param srting The tag name of an element |
48 | * @param srting The content of tag | 48 | * @param srting The content of tag |
@@ -61,11 +61,11 @@ | |||
61 | $this->elements[$elementName][$i]['content'] = $content; | 61 | $this->elements[$elementName][$i]['content'] = $content; |
62 | $this->elements[$elementName][$i]['attributes'] = $attributes; | 62 | $this->elements[$elementName][$i]['attributes'] = $attributes; |
63 | } | 63 | } |
64 | 64 | ||
65 | /** | 65 | /** |
66 | * Set multiple feed elements from an array. | 66 | * Set multiple feed elements from an array. |
67 | * Elements which have attributes cannot be added by this method | 67 | * Elements which have attributes cannot be added by this method |
68 | * | 68 | * |
69 | * @access public | 69 | * @access public |
70 | * @param array array of elements in 'tagName' => 'tagContent' format. | 70 | * @param array array of elements in 'tagName' => 'tagContent' format. |
71 | * @return void | 71 | * @return void |
@@ -73,15 +73,15 @@ | |||
73 | public function addElementArray($elementArray) | 73 | public function addElementArray($elementArray) |
74 | { | 74 | { |
75 | if(! is_array($elementArray)) return; | 75 | if(! is_array($elementArray)) return; |
76 | foreach ($elementArray as $elementName => $content) | 76 | foreach ($elementArray as $elementName => $content) |
77 | { | 77 | { |
78 | $this->addElement($elementName, $content); | 78 | $this->addElement($elementName, $content); |
79 | } | 79 | } |
80 | } | 80 | } |
81 | 81 | ||
82 | /** | 82 | /** |
83 | * Return the collection of elements in this feed item | 83 | * Return the collection of elements in this feed item |
84 | * | 84 | * |
85 | * @access public | 85 | * @access public |
86 | * @return array | 86 | * @return array |
87 | */ | 87 | */ |
@@ -89,68 +89,74 @@ | |||
89 | { | 89 | { |
90 | return $this->elements; | 90 | return $this->elements; |
91 | } | 91 | } |
92 | 92 | ||
93 | // Wrapper functions ------------------------------------------------------ | 93 | // Wrapper functions ------------------------------------------------------ |
94 | 94 | ||
95 | /** | 95 | /** |
96 | * Set the 'dscription' element of feed item | 96 | * Set the 'dscription' element of feed item |
97 | * | 97 | * |
98 | * @access public | 98 | * @access public |
99 | * @param string The content of 'description' element | 99 | * @param string The content of 'description' element |
100 | * @return void | 100 | * @return void |
101 | */ | 101 | */ |
102 | public function setDescription($description) | 102 | public function setDescription($description) |
103 | { | 103 | { |
104 | $this->setElement('description', $description); | 104 | $tag = ($this->version == ATOM)? 'summary' : 'description'; |
105 | $this->setElement($tag, $description); | ||
105 | } | 106 | } |
106 | 107 | ||
107 | /** | 108 | /** |
108 | * @desc Set the 'title' element of feed item | 109 | * @desc Set the 'title' element of feed item |
109 | * @access public | 110 | * @access public |
110 | * @param string The content of 'title' element | 111 | * @param string The content of 'title' element |
111 | * @return void | 112 | * @return void |
112 | */ | 113 | */ |
113 | public function setTitle($title) | 114 | public function setTitle($title) |
114 | { | 115 | { |
115 | $this->setElement('title', $title); | 116 | $this->setElement('title', $title); |
116 | } | 117 | } |
117 | 118 | ||
118 | /** | 119 | /** |
119 | * Set the 'date' element of feed item | 120 | * Set the 'date' element of feed item |
120 | * | 121 | * |
121 | * @access public | 122 | * @access public |
122 | * @param string The content of 'date' element | 123 | * @param string The content of 'date' element |
123 | * @return void | 124 | * @return void |
124 | */ | 125 | */ |
125 | public function setDate($date) | 126 | public function setDate($date) |
126 | { | 127 | { |
127 | if(! is_numeric($date)) | 128 | if(! is_numeric($date)) |
128 | { | 129 | { |
129 | $date = strtotime($date); | 130 | $date = strtotime($date); |
130 | } | 131 | } |
131 | 132 | ||
132 | if($this->version == RSS2) | 133 | if($this->version == ATOM) |
134 | { | ||
135 | $tag = 'updated'; | ||
136 | $value = date(DATE_ATOM, $date); | ||
137 | } | ||
138 | elseif($this->version == RSS2) | ||
133 | { | 139 | { |
134 | $tag = 'pubDate'; | 140 | $tag = 'pubDate'; |
135 | $value = date(DATE_RSS, $date); | 141 | $value = date(DATE_RSS, $date); |
136 | } | 142 | } |
137 | else | 143 | else |
138 | { | 144 | { |
139 | $tag = 'dc:date'; | 145 | $tag = 'dc:date'; |
140 | $value = date("Y-m-d", $date); | 146 | $value = date("Y-m-d", $date); |
141 | } | 147 | } |
142 | 148 | ||
143 | $this->setElement($tag, $value); | 149 | $this->setElement($tag, $value); |
144 | } | 150 | } |
145 | 151 | ||
146 | /** | 152 | /** |
147 | * Set the 'link' element of feed item | 153 | * Set the 'link' element of feed item |
148 | * | 154 | * |
149 | * @access public | 155 | * @access public |
150 | * @param string The content of 'link' element | 156 | * @param string The content of 'link' element |
151 | * @return void | 157 | * @return void |
152 | */ | 158 | */ |
153 | public function setLink($link) | 159 | public function setLink($link) |
154 | { | 160 | { |
155 | if($this->version == RSS2 || $this->version == RSS1) | 161 | if($this->version == RSS2 || $this->version == RSS1) |
156 | { | 162 | { |
@@ -161,27 +167,27 @@ | |||
161 | { | 167 | { |
162 | $this->setElement('link','',array('href'=>$link)); | 168 | $this->setElement('link','',array('href'=>$link)); |
163 | $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); | 169 | $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); |
164 | } | 170 | } |
165 | 171 | ||
166 | } | 172 | } |
167 | 173 | ||
168 | /** | 174 | /** |
169 | * Set the 'source' element of feed item | 175 | * Set the 'source' element of feed item |
170 | * | 176 | * |
171 | * @access public | 177 | * @access public |
172 | * @param string The content of 'source' element | 178 | * @param string The content of 'source' element |
173 | * @return void | 179 | * @return void |
174 | */ | 180 | */ |
175 | public function setSource($link) | 181 | public function setSource($link) |
176 | { | 182 | { |
177 | $attributes = array('url'=>$link); | 183 | $attributes = array('url'=>$link); |
178 | $this->setElement('source', "wallabag",$attributes); | 184 | $this->setElement('source', "wallabag",$attributes); |
179 | } | 185 | } |
180 | 186 | ||
181 | /** | 187 | /** |
182 | * Set the 'encloser' element of feed item | 188 | * Set the 'encloser' element of feed item |
183 | * For RSS 2.0 only | 189 | * For RSS 2.0 only |
184 | * | 190 | * |
185 | * @access public | 191 | * @access public |
186 | * @param string The url attribute of encloser tag | 192 | * @param string The url attribute of encloser tag |
187 | * @param string The length attribute of encloser tag | 193 | * @param string The length attribute of encloser tag |
@@ -193,6 +199,6 @@ | |||
193 | $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); | 199 | $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); |
194 | $this->setElement('enclosure','',$attributes); | 200 | $this->setElement('enclosure','',$attributes); |
195 | } | 201 | } |
196 | 202 | ||
197 | } // end of class FeedItem | 203 | } // end of class FeedItem |
198 | ?> \ No newline at end of file | 204 | ?> \ No newline at end of file |
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index d708e99b..77755690 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php | |||
@@ -97,15 +97,12 @@ define('JSONP', 3, true); | |||
97 | header('X-content-type-options: nosniff'); | 97 | header('X-content-type-options: nosniff'); |
98 | } elseif ($this->version == JSON) { | 98 | } elseif ($this->version == JSON) { |
99 | header('Content-type: application/json; charset=UTF-8'); | 99 | header('Content-type: application/json; charset=UTF-8'); |
100 | $this->json = new stdClass(); | ||
100 | } elseif ($this->version == JSONP) { | 101 | } elseif ($this->version == JSONP) { |
101 | header('Content-type: application/javascript; charset=UTF-8'); | 102 | header('Content-type: application/javascript; charset=UTF-8'); |
103 | $this->json = new stdClass(); | ||
102 | } | 104 | } |
103 | } | 105 | } |
104 | |||
105 | if ($this->version == JSON || $this->version == JSONP) { | ||
106 | $this->json = new stdClass(); | ||
107 | } | ||
108 | |||
109 | 106 | ||
110 | $this->printHead(); | 107 | $this->printHead(); |
111 | $this->printChannels(); | 108 | $this->printChannels(); |
@@ -116,6 +113,11 @@ define('JSONP', 3, true); | |||
116 | } | 113 | } |
117 | } | 114 | } |
118 | 115 | ||
116 | public function &getItems() | ||
117 | { | ||
118 | return $this->items; | ||
119 | } | ||
120 | |||
119 | /** | 121 | /** |
120 | * Create a new FeedItem. | 122 | * Create a new FeedItem. |
121 | * | 123 | * |
@@ -199,7 +201,8 @@ define('JSONP', 3, true); | |||
199 | */ | 201 | */ |
200 | public function setDescription($description) | 202 | public function setDescription($description) |
201 | { | 203 | { |
202 | $this->setChannelElement('description', $description); | 204 | $tag = ($this->version == ATOM)? 'subtitle' : 'description'; |
205 | $this->setChannelElement($tag, $desciption); | ||
203 | } | 206 | } |
204 | 207 | ||
205 | /** | 208 | /** |
@@ -244,7 +247,7 @@ define('JSONP', 3, true); | |||
244 | { | 247 | { |
245 | $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; | 248 | $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; |
246 | if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; | 249 | if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; |
247 | $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; | 250 | $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; |
248 | echo $out; | 251 | echo $out; |
249 | } | 252 | } |
250 | elseif ($this->version == JSON || $this->version == JSONP) | 253 | elseif ($this->version == JSON || $this->version == JSONP) |
diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php index 2f5244f9..c4a48b21 100644 --- a/inc/3rdparty/libraries/html5/TreeBuilder.php +++ b/inc/3rdparty/libraries/html5/TreeBuilder.php | |||
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder { | |||
134 | 134 | ||
135 | // Namespaces for foreign content | 135 | // Namespaces for foreign content |
136 | const NS_HTML = null; // to prevent DOM from requiring NS on everything | 136 | const NS_HTML = null; // to prevent DOM from requiring NS on everything |
137 | const NS_XHTML = 'http://www.w3.org/1999/xhtml'; | ||
137 | const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; | 138 | const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; |
138 | const NS_SVG = 'http://www.w3.org/2000/svg'; | 139 | const NS_SVG = 'http://www.w3.org/2000/svg'; |
139 | const NS_XLINK = 'http://www.w3.org/1999/xlink'; | 140 | const NS_XLINK = 'http://www.w3.org/1999/xlink'; |
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder { | |||
3157 | } | 3158 | } |
3158 | 3159 | ||
3159 | private function insertElement($token, $append = true) { | 3160 | private function insertElement($token, $append = true) { |
3160 | $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); | 3161 | //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']); |
3162 | $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML; | ||
3163 | $el = $this->dom->createElementNS($namespaceURI, $token['name']); | ||
3161 | 3164 | ||
3162 | if (!empty($token['attr'])) { | 3165 | if (!empty($token['attr'])) { |
3163 | foreach($token['attr'] as $attr) { | 3166 | foreach($token['attr'] as $attr) { |
3164 | if(!$el->hasAttribute($attr['name'])) { | 3167 | |
3168 | // mike@macgirvin.com 2011-11-17, check attribute name for | ||
3169 | // validity (ignoring extenders and combiners) as illegal chars in names | ||
3170 | // causes everything to abort | ||
3171 | |||
3172 | $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']); | ||
3173 | if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) { | ||
3165 | $el->setAttribute($attr['name'], $attr['value']); | 3174 | $el->setAttribute($attr['name'], $attr['value']); |
3166 | } | 3175 | } |
3167 | } | 3176 | } |
diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php index 83e94f14..e4d5f495 100644 --- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php +++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php | |||
@@ -1,404 +1,403 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Cookie Jar | 3 | * Cookie Jar |
4 | * | 4 | * |
5 | * PHP class for handling cookies, as defined by the Netscape spec: | 5 | * PHP class for handling cookies, as defined by the Netscape spec: |
6 | * <http://curl.haxx.se/rfc/cookie_spec.html> | 6 | * <http://curl.haxx.se/rfc/cookie_spec.html> |
7 | * | 7 | * |
8 | * This class should be used to handle cookies (storing cookies from HTTP response messages, and | 8 | * This class should be used to handle cookies (storing cookies from HTTP response messages, and |
9 | * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org | 9 | * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org |
10 | * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ | 10 | * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ |
11 | * | 11 | * |
12 | * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ | 12 | * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ |
13 | * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. | 13 | * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. |
14 | * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. | 14 | * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. |
15 | * | 15 | * |
16 | * @version 0.5 | 16 | * @version 0.5 |
17 | * @date 2011-03-15 | 17 | * @date 2011-03-15 |
18 | * @see http://php.net/HttpRequestPool | 18 | * @see http://php.net/HttpRequestPool |
19 | * @author Keyvan Minoukadeh | 19 | * @author Keyvan Minoukadeh |
20 | * @copyright 2011 Keyvan Minoukadeh | 20 | * @copyright 2011 Keyvan Minoukadeh |
21 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 21 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
22 | */ | 22 | */ |
23 | 23 | ||
24 | class CookieJar | 24 | class CookieJar |
25 | { | 25 | { |
26 | /** | 26 | /** |
27 | * Cookies - array containing all cookies. | 27 | * Cookies - array containing all cookies. |
28 | * | 28 | * |
29 | * <pre> | 29 | * <pre> |
30 | * Cookies are stored like this: | 30 | * Cookies are stored like this: |
31 | * [domain][path][name] = array | 31 | * [domain][path][name] = array |
32 | * where array is: | 32 | * where array is: |
33 | * 0 => value, 1 => secure, 2 => expires | 33 | * 0 => value, 1 => secure, 2 => expires |
34 | * </pre> | 34 | * </pre> |
35 | * @var array | 35 | * @var array |
36 | * @access private | 36 | * @access private |
37 | */ | 37 | */ |
38 | public $cookies = array(); | 38 | public $cookies = array(); |
39 | public $debug = false; | 39 | public $debug = false; |
40 | 40 | ||
41 | /** | 41 | /** |
42 | * Constructor | 42 | * Constructor |
43 | */ | 43 | */ |
44 | function __construct() { | 44 | function __construct() { |
45 | } | 45 | } |
46 | 46 | ||
47 | protected function debug($msg, $file=null, $line=null) { | 47 | protected function debug($msg, $file=null, $line=null) { |
48 | if ($this->debug) { | 48 | if ($this->debug) { |
49 | $mem = round(memory_get_usage()/1024, 2); | 49 | $mem = round(memory_get_usage()/1024, 2); |
50 | $memPeak = round(memory_get_peak_usage()/1024, 2); | 50 | $memPeak = round(memory_get_peak_usage()/1024, 2); |
51 | echo '* ',$msg; | 51 | echo '* ',$msg; |
52 | if (isset($file, $line)) echo " ($file line $line)"; | 52 | if (isset($file, $line)) echo " ($file line $line)"; |
53 | echo ' - mem used: ',$mem," (peak: $memPeak)\n"; | 53 | echo ' - mem used: ',$mem," (peak: $memPeak)\n"; |
54 | ob_flush(); | 54 | ob_flush(); |
55 | flush(); | 55 | flush(); |
56 | } | 56 | } |
57 | } | 57 | } |
58 | 58 | ||
59 | /** | 59 | /** |
60 | * Get matching cookies | 60 | * Get matching cookies |
61 | * | 61 | * |
62 | * Only use this method if you cannot use add_cookie_header(), for example, if you want to use | 62 | * Only use this method if you cannot use add_cookie_header(), for example, if you want to use |
63 | * this cookie jar class without using the request class. | 63 | * this cookie jar class without using the request class. |
64 | * | 64 | * |
65 | * @param array $param associative array containing 'domain', 'path', 'secure' keys | 65 | * @param array $param associative array containing 'domain', 'path', 'secure' keys |
66 | * @return string | 66 | * @return string |
67 | * @see add_cookie_header() | 67 | * @see add_cookie_header() |
68 | */ | 68 | */ |
69 | public function getMatchingCookies($url) | 69 | public function getMatchingCookies($url) |
70 | { | 70 | { |
71 | if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { | 71 | if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { |
72 | $param['domain'] = $parts['host']; | 72 | $param['domain'] = $parts['host']; |
73 | $param['path'] = $parts['path']; | 73 | $param['path'] = $parts['path']; |
74 | $param['secure'] = (strtolower($parts['scheme']) == 'https'); | 74 | $param['secure'] = (strtolower($parts['scheme']) == 'https'); |
75 | unset($parts); | 75 | unset($parts); |
76 | } else { | 76 | } else { |
77 | return false; | 77 | return false; |
78 | } | 78 | } |
79 | // RFC 2965 notes: | 79 | // RFC 2965 notes: |
80 | // If multiple cookies satisfy the criteria above, they are ordered in | 80 | // If multiple cookies satisfy the criteria above, they are ordered in |
81 | // the Cookie header such that those with more specific Path attributes | 81 | // the Cookie header such that those with more specific Path attributes |
82 | // precede those with less specific. Ordering with respect to other | 82 | // precede those with less specific. Ordering with respect to other |
83 | // attributes (e.g., Domain) is unspecified. | 83 | // attributes (e.g., Domain) is unspecified. |
84 | $domain = $param['domain']; | 84 | $domain = $param['domain']; |
85 | if (strpos($domain, '.') === false) $domain .= '.local'; | 85 | if (strpos($domain, '.') === false) $domain .= '.local'; |
86 | $request_path = $param['path']; | 86 | $request_path = $param['path']; |
87 | if ($request_path == '') $request_path = '/'; | 87 | if ($request_path == '') $request_path = '/'; |
88 | $request_secure = $param['secure']; | 88 | $request_secure = $param['secure']; |
89 | $now = time(); | 89 | $now = time(); |
90 | $matched_cookies = array(); | 90 | $matched_cookies = array(); |
91 | // domain - find matching domains | 91 | // domain - find matching domains |
92 | $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); | 92 | $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); |
93 | while (strpos($domain, '.') !== false) { | 93 | while (strpos($domain, '.') !== false) { |
94 | if (isset($this->cookies[$domain])) { | 94 | if (isset($this->cookies[$domain])) { |
95 | $this->debug(' domain match found: '.$domain); | 95 | $this->debug(' domain match found: '.$domain); |
96 | $cookies =& $this->cookies[$domain]; | 96 | $cookies =& $this->cookies[$domain]; |
97 | } else { | 97 | } else { |
98 | $domain = $this->_reduce_domain($domain); | 98 | $domain = $this->_reduce_domain($domain); |
99 | continue; | 99 | continue; |
100 | } | 100 | } |
101 | // paths - find matching paths starting from most specific | 101 | // paths - find matching paths starting from most specific |
102 | $this->debug(' - Finding matching paths for '.$request_path); | 102 | $this->debug(' - Finding matching paths for '.$request_path); |
103 | $paths = array_keys($cookies); | 103 | $paths = array_keys($cookies); |
104 | usort($paths, array($this, '_cmp_length')); | 104 | usort($paths, array($this, '_cmp_length')); |
105 | foreach ($paths as $path) { | 105 | foreach ($paths as $path) { |
106 | // continue to next cookie if request path does not path-match cookie path | 106 | // continue to next cookie if request path does not path-match cookie path |
107 | if (!$this->_path_match($request_path, $path)) continue; | 107 | if (!$this->_path_match($request_path, $path)) continue; |
108 | // loop through cookie names | 108 | // loop through cookie names |
109 | $this->debug(' path match found: '.$path); | 109 | $this->debug(' path match found: '.$path); |
110 | foreach ($cookies[$path] as $name => $values) { | 110 | foreach ($cookies[$path] as $name => $values) { |
111 | // if this cookie is secure but request isn't, continue to next cookie | 111 | // if this cookie is secure but request isn't, continue to next cookie |
112 | if ($values[1] && !$request_secure) continue; | 112 | if ($values[1] && !$request_secure) continue; |
113 | // if cookie is not a session cookie and has expired, continue to next cookie | 113 | // if cookie is not a session cookie and has expired, continue to next cookie |
114 | if (is_int($values[2]) && ($values[2] < $now)) continue; | 114 | if (is_int($values[2]) && ($values[2] < $now)) continue; |
115 | // cookie matches request | 115 | // cookie matches request |
116 | $this->debug(' cookie match: '.$name.'='.$values[0]); | 116 | $this->debug(' cookie match: '.$name.'='.$values[0]); |
117 | $matched_cookies[] = $name.'='.$values[0]; | 117 | $matched_cookies[] = $name.'='.$values[0]; |
118 | } | 118 | } |
119 | } | 119 | } |
120 | $domain = $this->_reduce_domain($domain); | 120 | $domain = $this->_reduce_domain($domain); |
121 | } | 121 | } |
122 | // return cookies | 122 | // return cookies |
123 | return implode('; ', $matched_cookies); | 123 | return implode('; ', $matched_cookies); |
124 | } | 124 | } |
125 | 125 | ||
126 | /** | 126 | /** |
127 | * Parse Set-Cookie values. | 127 | * Parse Set-Cookie values. |
128 | * | 128 | * |
129 | * Only use this method if you cannot use extract_cookies(), for example, if you want to use | 129 | * Only use this method if you cannot use extract_cookies(), for example, if you want to use |
130 | * this cookie jar class without using the response class. | 130 | * this cookie jar class without using the response class. |
131 | * | 131 | * |
132 | * @param array $set_cookies array holding 1 or more "Set-Cookie" header values | 132 | * @param array $set_cookies array holding 1 or more "Set-Cookie" header values |
133 | * @param array $param associative array containing 'host', 'path' keys | 133 | * @param array $param associative array containing 'host', 'path' keys |
134 | * @return void | 134 | * @return void |
135 | * @see extract_cookies() | 135 | * @see extract_cookies() |
136 | */ | 136 | */ |
137 | public function storeCookies($url, $set_cookies) | 137 | public function storeCookies($url, $set_cookies) |
138 | { | 138 | { |
139 | if (count($set_cookies) == 0) return; | 139 | if (count($set_cookies) == 0) return; |
140 | $param = @parse_url($url); | 140 | $param = @parse_url($url); |
141 | if (!is_array($param) || !isset($param['host'])) return; | 141 | if (!is_array($param) || !isset($param['host'])) return; |
142 | $request_host = $param['host']; | 142 | $request_host = $param['host']; |
143 | if (strpos($request_host, '.') === false) $request_host .= '.local'; | 143 | if (strpos($request_host, '.') === false) $request_host .= '.local'; |
144 | $request_path = @$param['path']; | 144 | $request_path = @$param['path']; |
145 | if ($request_path == '') $request_path = '/'; | 145 | if ($request_path == '') $request_path = '/'; |
146 | // | 146 | // |
147 | // loop through set-cookie headers | 147 | // loop through set-cookie headers |
148 | // | 148 | // |
149 | foreach ($set_cookies as $set_cookie) { | 149 | foreach ($set_cookies as $set_cookie) { |
150 | $this->debug('Parsing: '.$set_cookie); | 150 | $this->debug('Parsing: '.$set_cookie); |
151 | // temporary cookie store (before adding to jar) | 151 | // temporary cookie store (before adding to jar) |
152 | $tmp_cookie = array(); | 152 | $tmp_cookie = array(); |
153 | $param = explode(';', $set_cookie); | 153 | $param = explode(';', $set_cookie); |
154 | // loop through params | 154 | // loop through params |
155 | for ($x=0; $x<count($param); $x++) { | 155 | for ($x=0; $x<count($param); $x++) { |
156 | $key_val = explode('=', $param[$x], 2); | 156 | $key_val = explode('=', $param[$x], 2); |
157 | if (count($key_val) != 2) { | 157 | if (count($key_val) != 2) { |
158 | // if the first param isn't a name=value pair, continue to the next set-cookie | 158 | // if the first param isn't a name=value pair, continue to the next set-cookie |
159 | // header | 159 | // header |
160 | if ($x == 0) continue 2; | 160 | if ($x == 0) continue 2; |
161 | // check for secure flag | 161 | // check for secure flag |
162 | if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true; | 162 | if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true; |
163 | // continue to next param | 163 | // continue to next param |
164 | continue; | 164 | continue; |
165 | } | 165 | } |
166 | list($key, $val) = array_map('trim', $key_val); | 166 | list($key, $val) = array_map('trim', $key_val); |
167 | // first name=value pair is the cookie name and value | 167 | // first name=value pair is the cookie name and value |
168 | // the name and value are stored under 'name' and 'value' to avoid conflicts | 168 | // the name and value are stored under 'name' and 'value' to avoid conflicts |
169 | // with later parameters. | 169 | // with later parameters. |
170 | if ($x == 0) { | 170 | if ($x == 0) { |
171 | $tmp_cookie = array('name'=>$key, 'value'=>$val); | 171 | $tmp_cookie = array('name'=>$key, 'value'=>$val); |
172 | continue; | 172 | continue; |
173 | } | 173 | } |
174 | $key = strtolower($key); | 174 | $key = strtolower($key); |
175 | if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { | 175 | if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { |
176 | $tmp_cookie[$key] = $val; | 176 | $tmp_cookie[$key] = $val; |
177 | } | 177 | } |
178 | } | 178 | } |
179 | // | 179 | // |
180 | // set cookie | 180 | // set cookie |
181 | // | 181 | // |
182 | // check domain | 182 | // check domain |
183 | if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && | 183 | if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && |
184 | ($tmp_cookie['domain'] != ".$request_host")) { | 184 | ($tmp_cookie['domain'] != ".$request_host")) { |
185 | $domain = $tmp_cookie['domain']; | 185 | $domain = $tmp_cookie['domain']; |
186 | if ((strpos($domain, '.') === false) && ($domain != 'local')) { | 186 | if ((strpos($domain, '.') === false) && ($domain != 'local')) { |
187 | $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); | 187 | $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); |
188 | continue; | 188 | continue; |
189 | } | 189 | } |
190 | if (preg_match('/\.[0-9]+$/', $domain)) { | 190 | if (preg_match('/\.[0-9]+$/', $domain)) { |
191 | $this->debug(' - domain "'.$domain.'" appears to be an ip address'); | 191 | $this->debug(' - domain "'.$domain.'" appears to be an ip address'); |
192 | continue; | 192 | continue; |
193 | } | 193 | } |
194 | if (substr($domain, 0, 1) != '.') $domain = ".$domain"; | 194 | if (substr($domain, 0, 1) != '.') $domain = ".$domain"; |
195 | if (!$this->_domain_match($request_host, $domain)) { | 195 | if (!$this->_domain_match($request_host, $domain)) { |
196 | $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); | 196 | $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); |
197 | continue; | 197 | continue; |
198 | } | 198 | } |
199 | } else { | 199 | } else { |
200 | // if domain is not specified in the set-cookie header, domain will default to | 200 | // if domain is not specified in the set-cookie header, domain will default to |
201 | // the request host | 201 | // the request host |
202 | $domain = $request_host; | 202 | $domain = $request_host; |
203 | } | 203 | } |
204 | // check path | 204 | // check path |
205 | if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { | 205 | if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { |
206 | $path = urldecode($tmp_cookie['path']); | 206 | $path = urldecode($tmp_cookie['path']); |
207 | if (!$this->_path_match($request_path, $path)) { | 207 | if (!$this->_path_match($request_path, $path)) { |
208 | $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); | 208 | $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); |
209 | continue; | 209 | continue; |
210 | } | 210 | } |
211 | } else { | 211 | } else { |
212 | $path = $request_path; | 212 | $path = $request_path; |
213 | $path = substr($path, 0, strrpos($path, '/')); | 213 | $path = substr($path, 0, strrpos($path, '/')); |
214 | if ($path == '') $path = '/'; | 214 | if ($path == '') $path = '/'; |
215 | } | 215 | } |
216 | // check if secure | 216 | // check if secure |
217 | $secure = (isset($tmp_cookie['secure'])) ? true : false; | 217 | $secure = (isset($tmp_cookie['secure'])) ? true : false; |
218 | // check expiry | 218 | // check expiry |
219 | if (isset($tmp_cookie['expires'])) { | 219 | if (isset($tmp_cookie['expires'])) { |
220 | if (($expires = strtotime($tmp_cookie['expires'])) < 0) { | 220 | if (($expires = strtotime($tmp_cookie['expires'])) < 0) { |
221 | $expires = null; | 221 | $expires = null; |
222 | } | 222 | } |
223 | } else { | 223 | } else { |
224 | $expires = null; | 224 | $expires = null; |
225 | } | 225 | } |
226 | // set cookie | 226 | // set cookie |
227 | $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); | 227 | $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); |
228 | } | 228 | } |
229 | } | 229 | } |
230 | 230 | ||
231 | // return array of set-cookie values extracted from HTTP response headers (string $h) | 231 | // return array of set-cookie values extracted from HTTP response headers (string $h) |
232 | public function extractCookies($h) { | 232 | public function extractCookies($h) { |
233 | $x = 0; | 233 | $x = 0; |
234 | $lines = 0; | 234 | $lines = 0; |
235 | $headers = array(); | 235 | $headers = array(); |
236 | $last_match = false; | 236 | $last_match = false; |
237 | $h = explode("\n", $h); | 237 | $h = explode("\n", $h); |
238 | foreach ($h as $line) { | 238 | foreach ($h as $line) { |
239 | $line = rtrim($line); | 239 | $line = rtrim($line); |
240 | $lines++; | 240 | $lines++; |
241 | 241 | ||
242 | $trimmed_line = trim($line); | 242 | $trimmed_line = trim($line); |
243 | if (isset($line_last)) { | 243 | if (isset($line_last)) { |
244 | // check if we have \r\n\r\n (indicating the end of headers) | 244 | // check if we have \r\n\r\n (indicating the end of headers) |
245 | // some servers will not use CRLF (\r\n), so we make CR (\r) optional. | 245 | // some servers will not use CRLF (\r\n), so we make CR (\r) optional. |
246 | // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { | 246 | // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { |
247 | // break; | 247 | // break; |
248 | // } | 248 | // } |
249 | // As an alternative, we can check if the current trimmed line is empty | 249 | // As an alternative, we can check if the current trimmed line is empty |
250 | if ($trimmed_line == '') { | 250 | if ($trimmed_line == '') { |
251 | break; | 251 | break; |
252 | } | 252 | } |
253 | 253 | ||
254 | // check for continuation line... | 254 | // check for continuation line... |
255 | // RFC 2616 Section 2.2 "Basic Rules": | 255 | // RFC 2616 Section 2.2 "Basic Rules": |
256 | // HTTP/1.1 header field values can be folded onto multiple lines if the | 256 | // HTTP/1.1 header field values can be folded onto multiple lines if the |
257 | // continuation line begins with a space or horizontal tab. All linear | 257 | // continuation line begins with a space or horizontal tab. All linear |
258 | // white space, including folding, has the same semantics as SP. A | 258 | // white space, including folding, has the same semantics as SP. A |
259 | // recipient MAY replace any linear white space with a single SP before | 259 | // recipient MAY replace any linear white space with a single SP before |
260 | // interpreting the field value or forwarding the message downstream. | 260 | // interpreting the field value or forwarding the message downstream. |
261 | if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { | 261 | if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { |
262 | // append to previous header value | 262 | // append to previous header value |
263 | $headers[$x-1] .= ' '.rtrim($match[1]); | 263 | $headers[$x-1] .= ' '.rtrim($match[1]); |
264 | continue; | 264 | continue; |
265 | } | 265 | } |
266 | } | 266 | } |
267 | $line_last = $line; | 267 | $line_last = $line; |
268 | 268 | ||
269 | // split header name and value | 269 | // split header name and value |
270 | if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { | 270 | if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { |
271 | $headers[$x++] = rtrim($match[1]); | 271 | $headers[$x++] = rtrim($match[1]); |
272 | $last_match = true; | 272 | $last_match = true; |
273 | } else { | 273 | } else { |
274 | $last_match = false; | 274 | $last_match = false; |
275 | } | 275 | } |
276 | } | 276 | } |
277 | return $headers; | 277 | return $headers; |
278 | } | 278 | } |
279 | 279 | ||
280 | /** | 280 | /** |
281 | * Set Cookie | 281 | * Set Cookie |
282 | * @param string $domain | 282 | * @param string $domain |
283 | * @param string $path | 283 | * @param string $path |
284 | * @param string $name cookie name | 284 | * @param string $name cookie name |
285 | * @param string $value cookie value | 285 | * @param string $value cookie value |
286 | * @param bool $secure | 286 | * @param bool $secure |
287 | * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) | 287 | * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) |
288 | * @return void | 288 | * @return void |
289 | */ | 289 | */ |
290 | function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) | 290 | function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) |
291 | { | 291 | { |
292 | if ($domain == '') return; | 292 | if ($domain == '') return; |
293 | if ($path == '') return; | 293 | if ($path == '') return; |
294 | if ($name == '') return; | 294 | if ($name == '') return; |
295 | // check if cookie needs to go | 295 | // check if cookie needs to go |
296 | if (isset($expires) && ($expires <= 0)) { | 296 | if (isset($expires) && ($expires <= 0)) { |
297 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); | 297 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); |
298 | return; | 298 | return; |
299 | } | 299 | } |
300 | if ($value == '') return; | 300 | if ($value == '') return; |
301 | $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); | 301 | $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); |
302 | return; | 302 | return; |
303 | } | 303 | } |
304 | 304 | ||
305 | /** | 305 | /** |
306 | * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. | 306 | * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. |
307 | * @param string $domain | 307 | * @param string $domain |
308 | * @param string $path | 308 | * @param string $path |
309 | * @param string $name | 309 | * @param string $name |
310 | * @return void | 310 | * @return void |
311 | */ | 311 | */ |
312 | function clear($domain=null, $path=null, $name=null) | 312 | function clear($domain=null, $path=null, $name=null) |
313 | { | 313 | { |
314 | if (!isset($domain)) { | 314 | if (!isset($domain)) { |
315 | $this->cookies = array(); | 315 | $this->cookies = array(); |
316 | } elseif (!isset($path)) { | 316 | } elseif (!isset($path)) { |
317 | if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); | 317 | if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); |
318 | } elseif (!isset($name)) { | 318 | } elseif (!isset($name)) { |
319 | if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); | 319 | if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); |
320 | } elseif (isset($name)) { | 320 | } elseif (isset($name)) { |
321 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); | 321 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); |
322 | } | 322 | } |
323 | } | 323 | } |
324 | 324 | ||
325 | /** | 325 | /** |
326 | * Compare string length - used for sorting | 326 | * Compare string length - used for sorting |
327 | * @access private | 327 | * @access private |
328 | * @return int | 328 | * @return int |
329 | */ | 329 | */ |
330 | function _cmp_length($a, $b) | 330 | function _cmp_length($a, $b) |
331 | { | 331 | { |
332 | $la = strlen($a); $lb = strlen($b); | 332 | $la = strlen($a); $lb = strlen($b); |
333 | if ($la == $lb) return 0; | 333 | if ($la == $lb) return 0; |
334 | return ($la > $lb) ? -1 : 1; | 334 | return ($la > $lb) ? -1 : 1; |
335 | } | 335 | } |
336 | 336 | ||
337 | /** | 337 | /** |
338 | * Reduce domain | 338 | * Reduce domain |
339 | * @param string $domain | 339 | * @param string $domain |
340 | * @return string | 340 | * @return string |
341 | * @access private | 341 | * @access private |
342 | */ | 342 | */ |
343 | function _reduce_domain($domain) | 343 | function _reduce_domain($domain) |
344 | { | 344 | { |
345 | if ($domain == '') return ''; | 345 | if ($domain == '') return ''; |
346 | if (substr($domain, 0, 1) == '.') return substr($domain, 1); | 346 | if (substr($domain, 0, 1) == '.') return substr($domain, 1); |
347 | return substr($domain, strpos($domain, '.')); | 347 | return substr($domain, strpos($domain, '.')); |
348 | } | 348 | } |
349 | 349 | ||
350 | /** | 350 | /** |
351 | * Path match - check if path1 path-matches path2 | 351 | * Path match - check if path1 path-matches path2 |
352 | * | 352 | * |
353 | * From RFC 2965: | 353 | * From RFC 2965: |
354 | * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2 | 354 | * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2 |
355 | * if P2 is a prefix of P1 (including the case where P1 and P2 string- | 355 | * if P2 is a prefix of P1 (including the case where P1 and P2 string- |
356 | * compare equal). Thus, the string /tec/waldo path-matches /tec.</i> | 356 | * compare equal). Thus, the string /tec/waldo path-matches /tec.</i> |
357 | * @param string $path1 | 357 | * @param string $path1 |
358 | * @param string $path2 | 358 | * @param string $path2 |
359 | * @return bool | 359 | * @return bool |
360 | * @access private | 360 | * @access private |
361 | */ | 361 | */ |
362 | function _path_match($path1, $path2) | 362 | function _path_match($path1, $path2) |
363 | { | 363 | { |
364 | return (substr($path1, 0, strlen($path2)) == $path2); | 364 | return (substr($path1, 0, strlen($path2)) == $path2); |
365 | } | 365 | } |
366 | 366 | ||
367 | /** | 367 | /** |
368 | * Domain match - check if domain1 domain-matches domain2 | 368 | * Domain match - check if domain1 domain-matches domain2 |
369 | * | 369 | * |
370 | * A few extracts from RFC 2965: | 370 | * A few extracts from RFC 2965: |
371 | * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com | 371 | * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com |
372 | * would be rejected, because H is y.x and contains a dot. | 372 | * would be rejected, because H is y.x and contains a dot. |
373 | * | 373 | * |
374 | * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com | 374 | * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com |
375 | * would be accepted. | 375 | * would be accepted. |
376 | * | 376 | * |
377 | * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be | 377 | * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be |
378 | * rejected, because there is no embedded dot. | 378 | * rejected, because there is no embedded dot. |
379 | * | 379 | * |
380 | * - A Set-Cookie2 from request-host example for Domain=.local will | 380 | * - A Set-Cookie2 from request-host example for Domain=.local will |
381 | * be accepted, because the effective host name for the request- | 381 | * be accepted, because the effective host name for the request- |
382 | * host is example.local, and example.local domain-matches .local. | 382 | * host is example.local, and example.local domain-matches .local. |
383 | * | 383 | * |
384 | * I'm ignoring the first point for now (must check to see how other browsers handle | 384 | * I'm ignoring the first point for now (must check to see how other browsers handle |
385 | * this rule for Set-Cookie headers) | 385 | * this rule for Set-Cookie headers) |
386 | * | 386 | * |
387 | * @param string $domain1 | 387 | * @param string $domain1 |
388 | * @param string $domain2 | 388 | * @param string $domain2 |
389 | * @return bool | 389 | * @return bool |
390 | * @access private | 390 | * @access private |
391 | */ | 391 | */ |
392 | function _domain_match($domain1, $domain2) | 392 | function _domain_match($domain1, $domain2) |
393 | { | 393 | { |
394 | $domain1 = strtolower($domain1); | 394 | $domain1 = strtolower($domain1); |
395 | $domain2 = strtolower($domain2); | 395 | $domain2 = strtolower($domain2); |
396 | while (strpos($domain1, '.') !== false) { | 396 | while (strpos($domain1, '.') !== false) { |
397 | if ($domain1 == $domain2) return true; | 397 | if ($domain1 == $domain2) return true; |
398 | $domain1 = $this->_reduce_domain($domain1); | 398 | $domain1 = $this->_reduce_domain($domain1); |
399 | continue; | 399 | continue; |
400 | } | 400 | } |
401 | return false; | 401 | return false; |
402 | } | 402 | } |
403 | } | 403 | } \ No newline at end of file |
404 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php index e4f1b3b3..963f0c05 100644 --- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php | |||
@@ -1,779 +1,810 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Humble HTTP Agent | 3 | * Humble HTTP Agent |
4 | * | 4 | * |
5 | * This class is designed to take advantage of parallel HTTP requests | 5 | * This class is designed to take advantage of parallel HTTP requests |
6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. | 6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. |
7 | * For environments which do not have these options, it reverts to standard sequential | 7 | * For environments which do not have these options, it reverts to standard sequential |
8 | * requests (using file_get_contents()) | 8 | * requests (using file_get_contents()) |
9 | * | 9 | * |
10 | * @version 1.1 | 10 | * @version 1.4 |
11 | * @date 2012-08-20 | 11 | * @date 2013-05-10 |
12 | * @see http://php.net/HttpRequestPool | 12 | * @see http://php.net/HttpRequestPool |
13 | * @author Keyvan Minoukadeh | 13 | * @author Keyvan Minoukadeh |
14 | * @copyright 2011-2012 Keyvan Minoukadeh | 14 | * @copyright 2011-2013 Keyvan Minoukadeh |
15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
16 | */ | 16 | */ |
17 | 17 | ||
18 | class HumbleHttpAgent | 18 | class HumbleHttpAgent |
19 | { | 19 | { |
20 | const METHOD_REQUEST_POOL = 1; | 20 | const METHOD_REQUEST_POOL = 1; |
21 | const METHOD_CURL_MULTI = 2; | 21 | const METHOD_CURL_MULTI = 2; |
22 | const METHOD_FILE_GET_CONTENTS = 4; | 22 | const METHOD_FILE_GET_CONTENTS = 4; |
23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; | 23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; |
24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; | 24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; |
25 | const UA_PHP = 'PHP/5.2'; | 25 | const UA_PHP = 'PHP/5.4'; |
26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; | 26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; |
27 | 27 | ||
28 | protected $requests = array(); | 28 | protected $requests = array(); |
29 | protected $redirectQueue = array(); | 29 | protected $redirectQueue = array(); |
30 | protected $requestOptions; | 30 | protected $requestOptions; |
31 | protected $maxParallelRequests = 5; | 31 | protected $maxParallelRequests = 5; |
32 | protected $cache = null; //TODO | 32 | protected $cache = null; //TODO |
33 | protected $httpContext; | 33 | protected $httpContext; |
34 | protected $minimiseMemoryUse = false; //TODO | 34 | protected $minimiseMemoryUse = false; //TODO |
35 | protected $method; | 35 | protected $method; |
36 | protected $cookieJar; | 36 | protected $cookieJar; |
37 | public $debug = false; | 37 | public $debug = false; |
38 | public $debugVerbose = false; | 38 | public $debugVerbose = false; |
39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html | 39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html |
40 | public $maxRedirects = 5; | 40 | public $maxRedirects = 5; |
41 | public $userAgentMap = array(); | 41 | public $userAgentMap = array(); |
42 | public $rewriteUrls = array(); | 42 | public $rewriteUrls = array(); |
43 | public $userAgentDefault; | 43 | public $userAgentDefault; |
44 | public $referer; | 44 | public $referer; |
45 | //public $userAgent = 'Mozilla/5.0'; | 45 | //public $userAgent = 'Mozilla/5.0'; |
46 | 46 | ||
47 | // Prevent certain file/mime types | 47 | // Prevent certain file/mime types |
48 | // HTTP responses which match these content types will | 48 | // HTTP responses which match these content types will |
49 | // be returned without body. | 49 | // be returned without body. |
50 | public $headerOnlyTypes = array(); | 50 | public $headerOnlyTypes = array(); |
51 | // URLs ending with one of these extensions will | 51 | // URLs ending with one of these extensions will |
52 | // prompt Humble HTTP Agent to send a HEAD request first | 52 | // prompt Humble HTTP Agent to send a HEAD request first |
53 | // to see if returned content type matches $headerOnlyTypes. | 53 | // to see if returned content type matches $headerOnlyTypes. |
54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); | 54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); |
55 | // AJAX triggers to search for. | 55 | // AJAX triggers to search for. |
56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); | 57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); |
58 | 58 | ||
59 | //TODO: set max file size | 59 | //TODO: set max file size |
60 | //TODO: normalise headers | 60 | //TODO: normalise headers |
61 | 61 | ||
62 | function __construct($requestOptions=null, $method=null) { | 62 | function __construct($requestOptions=null, $method=null) { |
63 | $this->userAgentDefault = self::UA_BROWSER; | 63 | $this->userAgentDefault = self::UA_BROWSER; |
64 | $this->referer = self::REF_GOOGLE; | 64 | $this->referer = self::REF_GOOGLE; |
65 | // set the request method | 65 | // set the request method |
66 | if (in_array($method, array(1,2,4))) { | 66 | if (in_array($method, array(1,2,4))) { |
67 | $this->method = $method; | 67 | $this->method = $method; |
68 | } else { | 68 | } else { |
69 | if (class_exists('HttpRequestPool')) { | 69 | if (class_exists('HttpRequestPool')) { |
70 | $this->method = self::METHOD_REQUEST_POOL; | 70 | $this->method = self::METHOD_REQUEST_POOL; |
71 | } elseif (function_exists('curl_multi_init')) { | 71 | } elseif (function_exists('curl_multi_init')) { |
72 | $this->method = self::METHOD_CURL_MULTI; | 72 | $this->method = self::METHOD_CURL_MULTI; |
73 | } else { | 73 | } else { |
74 | $this->method = self::METHOD_FILE_GET_CONTENTS; | 74 | $this->method = self::METHOD_FILE_GET_CONTENTS; |
75 | } | 75 | } |
76 | } | 76 | } |
77 | if ($this->method == self::METHOD_CURL_MULTI) { | 77 | if ($this->method == self::METHOD_CURL_MULTI) { |
78 | require_once(dirname(__FILE__).'/RollingCurl.php'); | 78 | require_once(dirname(__FILE__).'/RollingCurl.php'); |
79 | } | 79 | } |
80 | // create cookie jar | 80 | // create cookie jar |
81 | $this->cookieJar = new CookieJar(); | 81 | $this->cookieJar = new CookieJar(); |
82 | // set request options (redirect must be 0) | 82 | // set request options (redirect must be 0) |
83 | $this->requestOptions = array( | 83 | $this->requestOptions = array( |
84 | 'timeout' => 15, | 84 | 'timeout' => 15, |
85 | 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web | 85 | 'connecttimeout' => 15, |
86 | // TODO: test onprogress? | 86 | 'dns_cache_timeout' => 300, |
87 | ); | 87 | 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web |
88 | if (is_array($requestOptions)) { | 88 | // TODO: test onprogress? |
89 | $this->requestOptions = array_merge($this->requestOptions, $requestOptions); | 89 | ); |
90 | } | 90 | if (is_array($requestOptions)) { |
91 | $this->httpContext = array( | 91 | $this->requestOptions = array_merge($this->requestOptions, $requestOptions); |
92 | 'http' => array( | 92 | } |
93 | 'ignore_errors' => true, | 93 | $this->httpContext = array( |
94 | 'timeout' => $this->requestOptions['timeout'], | 94 | 'http' => array( |
95 | 'max_redirects' => $this->requestOptions['redirect'], | 95 | 'ignore_errors' => true, |
96 | 'header' => "Accept: */*\r\n" | 96 | 'timeout' => $this->requestOptions['timeout'], |
97 | ) | 97 | 'max_redirects' => $this->requestOptions['redirect'], |
98 | ); | 98 | 'header' => "Accept: */*\r\n" |
99 | } | 99 | ) |
100 | 100 | ); | |
101 | protected function debug($msg) { | 101 | } |
102 | if ($this->debug) { | 102 | |
103 | $mem = round(memory_get_usage()/1024, 2); | 103 | protected function debug($msg) { |
104 | $memPeak = round(memory_get_peak_usage()/1024, 2); | 104 | if ($this->debug) { |
105 | echo '* ',$msg; | 105 | $mem = round(memory_get_usage()/1024, 2); |
106 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; | 106 | $memPeak = round(memory_get_peak_usage()/1024, 2); |
107 | echo "\n"; | 107 | echo '* ',$msg; |
108 | ob_flush(); | 108 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; |
109 | flush(); | 109 | echo "\n"; |
110 | } | 110 | ob_flush(); |
111 | } | 111 | flush(); |
112 | 112 | } | |
113 | protected function getUserAgent($url, $asArray=false) { | 113 | } |
114 | $host = @parse_url($url, PHP_URL_HOST); | 114 | |
115 | if (strtolower(substr($host, 0, 4)) == 'www.') { | 115 | protected function getUserAgent($url, $asArray=false) { |
116 | $host = substr($host, 4); | 116 | $host = @parse_url($url, PHP_URL_HOST); |
117 | } | 117 | if (strtolower(substr($host, 0, 4)) == 'www.') { |
118 | if ($host) { | 118 | $host = substr($host, 4); |
119 | $try = array($host); | 119 | } |
120 | $split = explode('.', $host); | 120 | if ($host) { |
121 | if (count($split) > 1) { | 121 | $try = array($host); |
122 | array_shift($split); | 122 | $split = explode('.', $host); |
123 | $try[] = '.'.implode('.', $split); | 123 | if (count($split) > 1) { |
124 | } | 124 | array_shift($split); |
125 | foreach ($try as $h) { | 125 | $try[] = '.'.implode('.', $split); |
126 | if (isset($this->userAgentMap[$h])) { | 126 | } |
127 | $ua = $this->userAgentMap[$h]; | 127 | foreach ($try as $h) { |
128 | break; | 128 | if (isset($this->userAgentMap[$h])) { |
129 | } | 129 | $ua = $this->userAgentMap[$h]; |
130 | } | 130 | break; |
131 | } | 131 | } |
132 | if (!isset($ua)) $ua = $this->userAgentDefault; | 132 | } |
133 | if ($asArray) { | 133 | } |
134 | return array('User-Agent' => $ua); | 134 | if (!isset($ua)) $ua = $this->userAgentDefault; |
135 | } else { | 135 | if ($asArray) { |
136 | return 'User-Agent: '.$ua; | 136 | return array('User-Agent' => $ua); |
137 | } | 137 | } else { |
138 | } | 138 | return 'User-Agent: '.$ua; |
139 | 139 | } | |
140 | public function rewriteHashbangFragment($url) { | 140 | } |
141 | // return $url if there's no '#!' | 141 | |
142 | if (strpos($url, '#!') === false) return $url; | 142 | public function rewriteHashbangFragment($url) { |
143 | // split $url and rewrite | 143 | // return $url if there's no '#!' |
144 | // TODO: is SimplePie_IRI included? | 144 | if (strpos($url, '#!') === false) return $url; |
145 | $iri = new SimplePie_IRI($url); | 145 | // split $url and rewrite |
146 | $fragment = substr($iri->fragment, 1); // strip '!' | 146 | // TODO: is SimplePie_IRI included? |
147 | $iri->fragment = null; | 147 | $iri = new SimplePie_IRI($url); |
148 | if (isset($iri->query)) { | 148 | $fragment = substr($iri->fragment, 1); // strip '!' |
149 | parse_str($iri->query, $query); | 149 | $iri->fragment = null; |
150 | } else { | 150 | if (isset($iri->query)) { |
151 | $query = array(); | 151 | parse_str($iri->query, $query); |
152 | } | 152 | } else { |
153 | $query['_escaped_fragment_'] = (string)$fragment; | 153 | $query = array(); |
154 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites | 154 | } |
155 | return $iri->get_iri(); | 155 | $query['_escaped_fragment_'] = (string)$fragment; |
156 | } | 156 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites |
157 | 157 | return $iri->get_iri(); | |
158 | public function getUglyURL($url, $html) { | 158 | } |
159 | if ($html == '') return false; | 159 | |
160 | $found = false; | 160 | public function getRedirectURLfromHTML($url, $html) { |
161 | foreach ($this->ajaxTriggers as $string) { | 161 | $redirect_url = $this->getMetaRefreshURL($url, $html); |
162 | if (stripos($html, $string)) { | 162 | if (!$redirect_url) { |
163 | $found = true; | 163 | $redirect_url = $this->getUglyURL($url, $html); |
164 | break; | 164 | } |
165 | } | 165 | return $redirect_url; |
166 | } | 166 | } |
167 | if (!$found) return false; | 167 | |
168 | $iri = new SimplePie_IRI($url); | 168 | public function getMetaRefreshURL($url, $html) { |
169 | if (isset($iri->query)) { | 169 | if ($html == '') return false; |
170 | parse_str($iri->query, $query); | 170 | // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513"> |
171 | } else { | 171 | if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) { |
172 | $query = array(); | 172 | return false; |
173 | } | 173 | } |
174 | $query['_escaped_fragment_'] = ''; | 174 | $redirect_url = $match[1]; |
175 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites | 175 | if (preg_match('!^https?://!i', $redirect_url)) { |
176 | return $iri->get_iri(); | 176 | // already absolute |
177 | } | 177 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); |
178 | 178 | return $redirect_url; | |
179 | public function removeFragment($url) { | 179 | } |
180 | $pos = strpos($url, '#'); | 180 | // absolutize redirect URL |
181 | if ($pos === false) { | 181 | $base = new SimplePie_IRI($url); |
182 | return $url; | 182 | // remove '//' in URL path (causes URLs not to resolve properly) |
183 | } else { | 183 | if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); |
184 | return substr($url, 0, $pos); | 184 | if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { |
185 | } | 185 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); |
186 | } | 186 | return $absolute; |
187 | 187 | } | |
188 | public function rewriteUrls($url) { | 188 | return false; |
189 | foreach ($this->rewriteUrls as $find => $action) { | 189 | } |
190 | if (strpos($url, $find) !== false) { | 190 | |
191 | if (is_array($action)) { | 191 | public function getUglyURL($url, $html) { |
192 | return strtr($url, $action); | 192 | if ($html == '') return false; |
193 | } | 193 | $found = false; |
194 | } | 194 | foreach ($this->ajaxTriggers as $string) { |
195 | } | 195 | if (stripos($html, $string)) { |
196 | return $url; | 196 | $found = true; |
197 | } | 197 | break; |
198 | 198 | } | |
199 | public function enableDebug($bool=true) { | 199 | } |
200 | $this->debug = (bool)$bool; | 200 | if (!$found) return false; |
201 | } | 201 | $iri = new SimplePie_IRI($url); |
202 | 202 | if (isset($iri->query)) { | |
203 | public function minimiseMemoryUse($bool = true) { | 203 | parse_str($iri->query, $query); |
204 | $this->minimiseMemoryUse = $bool; | 204 | } else { |
205 | } | 205 | $query = array(); |
206 | 206 | } | |
207 | public function setMaxParallelRequests($max) { | 207 | $query['_escaped_fragment_'] = ''; |
208 | $this->maxParallelRequests = $max; | 208 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites |
209 | } | 209 | $ugly_url = $iri->get_iri(); |
210 | 210 | $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url); | |
211 | public function validateUrl($url) { | 211 | return $ugly_url; |
212 | $url = filter_var($url, FILTER_SANITIZE_URL); | 212 | } |
213 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 213 | |
214 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) | 214 | public function removeFragment($url) { |
215 | if ($test === false) { | 215 | $pos = strpos($url, '#'); |
216 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 216 | if ($pos === false) { |
217 | } | 217 | return $url; |
218 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { | 218 | } else { |
219 | return $url; | 219 | return substr($url, 0, $pos); |
220 | } else { | 220 | } |
221 | return false; | 221 | } |
222 | } | 222 | |
223 | } | 223 | public function rewriteUrls($url) { |
224 | 224 | foreach ($this->rewriteUrls as $find => $action) { | |
225 | public function fetchAll(array $urls) { | 225 | if (strpos($url, $find) !== false) { |
226 | $this->fetchAllOnce($urls, $isRedirect=false); | 226 | if (is_array($action)) { |
227 | $redirects = 0; | 227 | return strtr($url, $action); |
228 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { | 228 | } |
229 | $this->debug("Following redirects #$redirects..."); | 229 | } |
230 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); | 230 | } |
231 | } | 231 | return $url; |
232 | } | 232 | } |
233 | 233 | ||
234 | // fetch all URLs without following redirects | 234 | public function enableDebug($bool=true) { |
235 | public function fetchAllOnce(array $urls, $isRedirect=false) { | 235 | $this->debug = (bool)$bool; |
236 | if (!$isRedirect) $urls = array_unique($urls); | 236 | } |
237 | if (empty($urls)) return; | 237 | |
238 | 238 | public function minimiseMemoryUse($bool = true) { | |
239 | ////////////////////////////////////////////////////// | 239 | $this->minimiseMemoryUse = $bool; |
240 | // parallel (HttpRequestPool) | 240 | } |
241 | if ($this->method == self::METHOD_REQUEST_POOL) { | 241 | |
242 | $this->debug('Starting parallel fetch (HttpRequestPool)'); | 242 | public function setMaxParallelRequests($max) { |
243 | try { | 243 | $this->maxParallelRequests = $max; |
244 | while (count($urls) > 0) { | 244 | } |
245 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 245 | |
246 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 246 | public function validateUrl($url) { |
247 | $pool = new HttpRequestPool(); | 247 | $url = filter_var($url, FILTER_SANITIZE_URL); |
248 | foreach ($subset as $orig => $url) { | 248 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
249 | if (!$isRedirect) $orig = $url; | 249 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) |
250 | unset($this->redirectQueue[$orig]); | 250 | if ($test === false) { |
251 | $this->debug("...$url"); | 251 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
252 | if (!$isRedirect && isset($this->requests[$url])) { | 252 | } |
253 | $this->debug("......in memory"); | 253 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { |
254 | /* | 254 | return $url; |
255 | } elseif ($this->isCached($url)) { | 255 | } else { |
256 | $this->debug("......is cached"); | 256 | return false; |
257 | if (!$this->minimiseMemoryUse) { | 257 | } |
258 | $this->requests[$url] = $this->getCached($url); | 258 | } |
259 | } | 259 | |
260 | */ | 260 | public function fetchAll(array $urls) { |
261 | } else { | 261 | $this->fetchAllOnce($urls, $isRedirect=false); |
262 | $this->debug("......adding to pool"); | 262 | $redirects = 0; |
263 | $req_url = $this->rewriteUrls($url); | 263 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { |
264 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 264 | $this->debug("Following redirects #$redirects..."); |
265 | $req_url = $this->removeFragment($req_url); | 265 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); |
266 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 266 | } |
267 | $_meth = HttpRequest::METH_HEAD; | 267 | } |
268 | } else { | 268 | |
269 | $_meth = HttpRequest::METH_GET; | 269 | // fetch all URLs without following redirects |
270 | unset($this->requests[$orig]['wrongGuess']); | 270 | public function fetchAllOnce(array $urls, $isRedirect=false) { |
271 | } | 271 | if (!$isRedirect) $urls = array_unique($urls); |
272 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); | 272 | if (empty($urls)) return; |
273 | // send cookies, if we have any | 273 | |
274 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 274 | ////////////////////////////////////////////////////// |
275 | $this->debug("......sending cookies: $cookies"); | 275 | // parallel (HttpRequestPool) |
276 | $httpRequest->addHeaders(array('Cookie' => $cookies)); | 276 | if ($this->method == self::METHOD_REQUEST_POOL) { |
277 | } | 277 | $this->debug('Starting parallel fetch (HttpRequestPool)'); |
278 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); | 278 | try { |
279 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); | 279 | while (count($urls) > 0) { |
280 | // add referer for picky sites | 280 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
281 | $httpRequest->addheaders(array('Referer' => $this->referer)); | 281 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
282 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 282 | $pool = new HttpRequestPool(); |
283 | $this->requests[$orig]['original_url'] = $orig; | 283 | foreach ($subset as $orig => $url) { |
284 | $pool->attach($httpRequest); | 284 | if (!$isRedirect) $orig = $url; |
285 | } | 285 | unset($this->redirectQueue[$orig]); |
286 | } | 286 | $this->debug("...$url"); |
287 | // did we get anything into the pool? | 287 | if (!$isRedirect && isset($this->requests[$url])) { |
288 | if (count($pool) > 0) { | 288 | $this->debug("......in memory"); |
289 | $this->debug('Sending request...'); | 289 | /* |
290 | try { | 290 | } elseif ($this->isCached($url)) { |
291 | $pool->send(); | 291 | $this->debug("......is cached"); |
292 | } catch (HttpRequestPoolException $e) { | 292 | if (!$this->minimiseMemoryUse) { |
293 | // do nothing | 293 | $this->requests[$url] = $this->getCached($url); |
294 | } | 294 | } |
295 | $this->debug('Received responses'); | 295 | */ |
296 | foreach($subset as $orig => $url) { | 296 | } else { |
297 | if (!$isRedirect) $orig = $url; | 297 | $this->debug("......adding to pool"); |
298 | $request = $this->requests[$orig]['httpRequest']; | 298 | $req_url = $this->rewriteUrls($url); |
299 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); | 299 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
300 | // getResponseHeader() doesn't return status line, so, for consistency... | 300 | $req_url = $this->removeFragment($req_url); |
301 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); | 301 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
302 | // check content type | 302 | $_meth = HttpRequest::METH_HEAD; |
303 | // TODO: use getResponseHeader('content-type') or getResponseInfo() | 303 | } else { |
304 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 304 | $_meth = HttpRequest::METH_GET; |
305 | $this->requests[$orig]['body'] = ''; | 305 | unset($this->requests[$orig]['wrongGuess']); |
306 | $_header_only_type = true; | 306 | } |
307 | $this->debug('Header only type returned'); | 307 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); |
308 | } else { | 308 | // send cookies, if we have any |
309 | $this->requests[$orig]['body'] = $request->getResponseBody(); | 309 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
310 | $_header_only_type = false; | 310 | $this->debug("......sending cookies: $cookies"); |
311 | } | 311 | $httpRequest->addHeaders(array('Cookie' => $cookies)); |
312 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); | 312 | } |
313 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); | 313 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); |
314 | // is redirect? | 314 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); |
315 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { | 315 | // add referer for picky sites |
316 | $redirectURL = $request->getResponseHeader('location'); | 316 | $httpRequest->addheaders(array('Referer' => $this->referer)); |
317 | if (!preg_match('!^https?://!i', $redirectURL)) { | 317 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
318 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 318 | $this->requests[$orig]['original_url'] = $orig; |
319 | } | 319 | $pool->attach($httpRequest); |
320 | if ($this->validateURL($redirectURL)) { | 320 | } |
321 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 321 | } |
322 | // store any cookies | 322 | // did we get anything into the pool? |
323 | $cookies = $request->getResponseHeader('set-cookie'); | 323 | if (count($pool) > 0) { |
324 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); | 324 | $this->debug('Sending request...'); |
325 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); | 325 | try { |
326 | $this->redirectQueue[$orig] = $redirectURL; | 326 | $pool->send(); |
327 | } else { | 327 | } catch (HttpRequestPoolException $e) { |
328 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 328 | // do nothing |
329 | } | 329 | } |
330 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { | 330 | $this->debug('Received responses'); |
331 | // the response content-type did not match our 'header only' types, | 331 | foreach($subset as $orig => $url) { |
332 | // but we'd issues a HEAD request because we assumed it would. So | 332 | if (!$isRedirect) $orig = $url; |
333 | // let's queue a proper GET request for this item... | 333 | $request = $this->requests[$orig]['httpRequest']; |
334 | $this->debug('Wrong guess at content-type, queing GET request'); | 334 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); |
335 | $this->requests[$orig]['wrongGuess'] = true; | 335 | // getResponseHeader() doesn't return status line, so, for consistency... |
336 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 336 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); |
337 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 337 | // check content type |
338 | // check for <meta name='fragment' content='!'/> | 338 | // TODO: use getResponseHeader('content-type') or getResponseInfo() |
339 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 339 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
340 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 340 | $this->requests[$orig]['body'] = ''; |
341 | if (isset($this->requests[$orig]['body'])) { | 341 | $_header_only_type = true; |
342 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 342 | $this->debug('Header only type returned'); |
343 | if ($redirectURL) { | 343 | } else { |
344 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 344 | $this->requests[$orig]['body'] = $request->getResponseBody(); |
345 | $this->redirectQueue[$orig] = $redirectURL; | 345 | $_header_only_type = false; |
346 | } | 346 | } |
347 | } | 347 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); |
348 | } | 348 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); |
349 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 349 | // is redirect? |
350 | $pool->detach($request); | 350 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { |
351 | unset($this->requests[$orig]['httpRequest'], $request); | 351 | $redirectURL = $request->getResponseHeader('location'); |
352 | /* | 352 | if (!preg_match('!^https?://!i', $redirectURL)) { |
353 | if ($this->minimiseMemoryUse) { | 353 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
354 | if ($this->cache($url)) { | 354 | } |
355 | unset($this->requests[$url]); | 355 | if ($this->validateURL($redirectURL)) { |
356 | } | 356 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
357 | } | 357 | // store any cookies |
358 | */ | 358 | $cookies = $request->getResponseHeader('set-cookie'); |
359 | } | 359 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); |
360 | } | 360 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); |
361 | } | 361 | $this->redirectQueue[$orig] = $redirectURL; |
362 | } catch (HttpException $e) { | 362 | } else { |
363 | $this->debug($e); | 363 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
364 | return false; | 364 | } |
365 | } | 365 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { |
366 | } | 366 | // the response content-type did not match our 'header only' types, |
367 | 367 | // but we'd issues a HEAD request because we assumed it would. So | |
368 | ////////////////////////////////////////////////////////// | 368 | // let's queue a proper GET request for this item... |
369 | // parallel (curl_multi_*) | 369 | $this->debug('Wrong guess at content-type, queing GET request'); |
370 | elseif ($this->method == self::METHOD_CURL_MULTI) { | 370 | $this->requests[$orig]['wrongGuess'] = true; |
371 | $this->debug('Starting parallel fetch (curl_multi_*)'); | 371 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
372 | while (count($urls) > 0) { | 372 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
373 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 373 | // check for <meta name='fragment' content='!'/> |
374 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 374 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
375 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); | 375 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
376 | $pool->window_size = count($subset); | 376 | if (isset($this->requests[$orig]['body'])) { |
377 | 377 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | |
378 | foreach ($subset as $orig => $url) { | 378 | if ($redirectURL) { |
379 | if (!$isRedirect) $orig = $url; | 379 | $this->redirectQueue[$orig] = $redirectURL; |
380 | unset($this->redirectQueue[$orig]); | 380 | } |
381 | $this->debug("...$url"); | 381 | } |
382 | if (!$isRedirect && isset($this->requests[$url])) { | 382 | } |
383 | $this->debug("......in memory"); | 383 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
384 | /* | 384 | $pool->detach($request); |
385 | } elseif ($this->isCached($url)) { | 385 | unset($this->requests[$orig]['httpRequest'], $request); |
386 | $this->debug("......is cached"); | 386 | /* |
387 | if (!$this->minimiseMemoryUse) { | 387 | if ($this->minimiseMemoryUse) { |
388 | $this->requests[$url] = $this->getCached($url); | 388 | if ($this->cache($url)) { |
389 | } | 389 | unset($this->requests[$url]); |
390 | */ | 390 | } |
391 | } else { | 391 | } |
392 | $this->debug("......adding to pool"); | 392 | */ |
393 | $req_url = $this->rewriteUrls($url); | 393 | } |
394 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 394 | } |
395 | $req_url = $this->removeFragment($req_url); | 395 | } |
396 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 396 | } catch (HttpException $e) { |
397 | $_meth = 'HEAD'; | 397 | $this->debug($e); |
398 | } else { | 398 | return false; |
399 | $_meth = 'GET'; | 399 | } |
400 | unset($this->requests[$orig]['wrongGuess']); | 400 | } |
401 | } | 401 | |
402 | $headers = array(); | 402 | ////////////////////////////////////////////////////////// |
403 | //$headers[] = 'User-Agent: '.$this->userAgent; | 403 | // parallel (curl_multi_*) |
404 | $headers[] = $this->getUserAgent($req_url); | 404 | elseif ($this->method == self::METHOD_CURL_MULTI) { |
405 | // add referer for picky sites | 405 | $this->debug('Starting parallel fetch (curl_multi_*)'); |
406 | $headers[] = 'Referer: '.$this->referer; | 406 | while (count($urls) > 0) { |
407 | // send cookies, if we have any | 407 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
408 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 408 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
409 | $this->debug("......sending cookies: $cookies"); | 409 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); |
410 | $headers[] = 'Cookie: '.$cookies; | 410 | $pool->window_size = count($subset); |
411 | } | 411 | |
412 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( | 412 | foreach ($subset as $orig => $url) { |
413 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], | 413 | if (!$isRedirect) $orig = $url; |
414 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] | 414 | unset($this->redirectQueue[$orig]); |
415 | )); | 415 | $this->debug("...$url"); |
416 | $httpRequest->set_original_url($orig); | 416 | if (!$isRedirect && isset($this->requests[$url])) { |
417 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 417 | $this->debug("......in memory"); |
418 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? | 418 | /* |
419 | $pool->add($httpRequest); | 419 | } elseif ($this->isCached($url)) { |
420 | } | 420 | $this->debug("......is cached"); |
421 | } | 421 | if (!$this->minimiseMemoryUse) { |
422 | // did we get anything into the pool? | 422 | $this->requests[$url] = $this->getCached($url); |
423 | if (count($pool) > 0) { | 423 | } |
424 | $this->debug('Sending request...'); | 424 | */ |
425 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] | 425 | } else { |
426 | $this->debug('Received responses'); | 426 | $this->debug("......adding to pool"); |
427 | foreach($subset as $orig => $url) { | 427 | $req_url = $this->rewriteUrls($url); |
428 | if (!$isRedirect) $orig = $url; | 428 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
429 | // $this->requests[$orig]['headers'] | 429 | $req_url = $this->removeFragment($req_url); |
430 | // $this->requests[$orig]['body'] | 430 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
431 | // $this->requests[$orig]['effective_url'] | 431 | $_meth = 'HEAD'; |
432 | // check content type | 432 | } else { |
433 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 433 | $_meth = 'GET'; |
434 | $this->requests[$orig]['body'] = ''; | 434 | unset($this->requests[$orig]['wrongGuess']); |
435 | $_header_only_type = true; | 435 | } |
436 | $this->debug('Header only type returned'); | 436 | $headers = array(); |
437 | } else { | 437 | //$headers[] = 'User-Agent: '.$this->userAgent; |
438 | $_header_only_type = false; | 438 | $headers[] = $this->getUserAgent($req_url); |
439 | } | 439 | // add referer for picky sites |
440 | $status_code = $this->requests[$orig]['status_code']; | 440 | $headers[] = 'Referer: '.$this->referer; |
441 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 441 | // send cookies, if we have any |
442 | $redirectURL = $this->requests[$orig]['location']; | 442 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
443 | if (!preg_match('!^https?://!i', $redirectURL)) { | 443 | $this->debug("......sending cookies: $cookies"); |
444 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 444 | $headers[] = 'Cookie: '.$cookies; |
445 | } | 445 | } |
446 | if ($this->validateURL($redirectURL)) { | 446 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( |
447 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 447 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], |
448 | // store any cookies | 448 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] |
449 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 449 | )); |
450 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 450 | $httpRequest->set_original_url($orig); |
451 | $this->redirectQueue[$orig] = $redirectURL; | 451 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
452 | } else { | 452 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? |
453 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 453 | $pool->add($httpRequest); |
454 | } | 454 | } |
455 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { | 455 | } |
456 | // the response content-type did not match our 'header only' types, | 456 | // did we get anything into the pool? |
457 | // but we'd issues a HEAD request because we assumed it would. So | 457 | if (count($pool) > 0) { |
458 | // let's queue a proper GET request for this item... | 458 | $this->debug('Sending request...'); |
459 | $this->debug('Wrong guess at content-type, queing GET request'); | 459 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] |
460 | $this->requests[$orig]['wrongGuess'] = true; | 460 | $this->debug('Received responses'); |
461 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 461 | foreach($subset as $orig => $url) { |
462 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 462 | if (!$isRedirect) $orig = $url; |
463 | // check for <meta name='fragment' content='!'/> | 463 | // $this->requests[$orig]['headers'] |
464 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 464 | // $this->requests[$orig]['body'] |
465 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 465 | // $this->requests[$orig]['effective_url'] |
466 | if (isset($this->requests[$orig]['body'])) { | 466 | // check content type |
467 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 467 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
468 | if ($redirectURL) { | 468 | $this->requests[$orig]['body'] = ''; |
469 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 469 | $_header_only_type = true; |
470 | $this->redirectQueue[$orig] = $redirectURL; | 470 | $this->debug('Header only type returned'); |
471 | } | 471 | } else { |
472 | } | 472 | $_header_only_type = false; |
473 | } | 473 | } |
474 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 474 | $status_code = $this->requests[$orig]['status_code']; |
475 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); | 475 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
476 | } | 476 | $redirectURL = $this->requests[$orig]['location']; |
477 | } | 477 | if (!preg_match('!^https?://!i', $redirectURL)) { |
478 | } | 478 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
479 | } | 479 | } |
480 | 480 | if ($this->validateURL($redirectURL)) { | |
481 | ////////////////////////////////////////////////////// | 481 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
482 | // sequential (file_get_contents) | 482 | // store any cookies |
483 | else { | 483 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
484 | $this->debug('Starting sequential fetch (file_get_contents)'); | 484 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
485 | $this->debug('Processing set of '.count($urls)); | 485 | $this->redirectQueue[$orig] = $redirectURL; |
486 | foreach ($urls as $orig => $url) { | 486 | } else { |
487 | if (!$isRedirect) $orig = $url; | 487 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
488 | unset($this->redirectQueue[$orig]); | 488 | } |
489 | $this->debug("...$url"); | 489 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { |
490 | if (!$isRedirect && isset($this->requests[$url])) { | 490 | // the response content-type did not match our 'header only' types, |
491 | $this->debug("......in memory"); | 491 | // but we'd issues a HEAD request because we assumed it would. So |
492 | /* | 492 | // let's queue a proper GET request for this item... |
493 | } elseif ($this->isCached($url)) { | 493 | $this->debug('Wrong guess at content-type, queing GET request'); |
494 | $this->debug("......is cached"); | 494 | $this->requests[$orig]['wrongGuess'] = true; |
495 | if (!$this->minimiseMemoryUse) { | 495 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
496 | $this->requests[$url] = $this->getCached($url); | 496 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
497 | } | 497 | // check for <meta name='fragment' content='!'/> |
498 | */ | 498 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
499 | } else { | 499 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
500 | $this->debug("Sending request for $url"); | 500 | if (isset($this->requests[$orig]['body'])) { |
501 | $this->requests[$orig]['original_url'] = $orig; | 501 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
502 | $req_url = $this->rewriteUrls($url); | 502 | if ($redirectURL) { |
503 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 503 | $this->redirectQueue[$orig] = $redirectURL; |
504 | $req_url = $this->removeFragment($req_url); | 504 | } |
505 | // send cookies, if we have any | 505 | } |
506 | $httpContext = $this->httpContext; | 506 | } |
507 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; | 507 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
508 | // add referer for picky sites | 508 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); |
509 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; | 509 | } |
510 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 510 | } |
511 | $this->debug("......sending cookies: $cookies"); | 511 | } |
512 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; | 512 | } |
513 | } | 513 | |
514 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { | 514 | ////////////////////////////////////////////////////// |
515 | $this->debug('Received response'); | 515 | // sequential (file_get_contents) |
516 | // get status code | 516 | else { |
517 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { | 517 | $this->debug('Starting sequential fetch (file_get_contents)'); |
518 | $this->debug('Error: no status code found'); | 518 | $this->debug('Processing set of '.count($urls)); |
519 | // TODO: handle error - no status code | 519 | foreach ($urls as $orig => $url) { |
520 | } else { | 520 | if (!$isRedirect) $orig = $url; |
521 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); | 521 | unset($this->redirectQueue[$orig]); |
522 | // check content type | 522 | $this->debug("...$url"); |
523 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 523 | if (!$isRedirect && isset($this->requests[$url])) { |
524 | $this->requests[$orig]['body'] = ''; | 524 | $this->debug("......in memory"); |
525 | } else { | 525 | /* |
526 | $this->requests[$orig]['body'] = $html; | 526 | } elseif ($this->isCached($url)) { |
527 | } | 527 | $this->debug("......is cached"); |
528 | $this->requests[$orig]['effective_url'] = $req_url; | 528 | if (!$this->minimiseMemoryUse) { |
529 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; | 529 | $this->requests[$url] = $this->getCached($url); |
530 | unset($match); | 530 | } |
531 | // handle redirect | 531 | */ |
532 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 532 | } else { |
533 | $this->requests[$orig]['location'] = trim($match[1]); | 533 | $this->debug("Sending request for $url"); |
534 | } | 534 | $this->requests[$orig]['original_url'] = $orig; |
535 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 535 | $req_url = $this->rewriteUrls($url); |
536 | $redirectURL = $this->requests[$orig]['location']; | 536 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
537 | if (!preg_match('!^https?://!i', $redirectURL)) { | 537 | $req_url = $this->removeFragment($req_url); |
538 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 538 | // send cookies, if we have any |
539 | } | 539 | $httpContext = $this->httpContext; |
540 | if ($this->validateURL($redirectURL)) { | 540 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; |
541 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 541 | // add referer for picky sites |
542 | // store any cookies | 542 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; |
543 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 543 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
544 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 544 | $this->debug("......sending cookies: $cookies"); |
545 | $this->redirectQueue[$orig] = $redirectURL; | 545 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; |
546 | } else { | 546 | } |
547 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 547 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { |
548 | } | 548 | $this->debug('Received response'); |
549 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 549 | // get status code |
550 | // check for <meta name='fragment' content='!'/> | 550 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { |
551 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 551 | $this->debug('Error: no status code found'); |
552 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 552 | // TODO: handle error - no status code |
553 | if (isset($this->requests[$orig]['body'])) { | 553 | } else { |
554 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 554 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); |
555 | if ($redirectURL) { | 555 | // check content type |
556 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 556 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
557 | $this->redirectQueue[$orig] = $redirectURL; | 557 | $this->requests[$orig]['body'] = ''; |
558 | } | 558 | } else { |
559 | } | 559 | $this->requests[$orig]['body'] = $html; |
560 | } | 560 | } |
561 | } | 561 | $this->requests[$orig]['effective_url'] = $req_url; |
562 | } else { | 562 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; |
563 | $this->debug('Error retrieving URL'); | 563 | unset($match); |
564 | //print_r($req_url); | 564 | // handle redirect |
565 | //print_r($http_response_header); | 565 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
566 | //print_r($html); | 566 | $this->requests[$orig]['location'] = trim($match[1]); |
567 | 567 | } | |
568 | // TODO: handle error - failed to retrieve URL | 568 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
569 | } | 569 | $redirectURL = $this->requests[$orig]['location']; |
570 | } | 570 | if (!preg_match('!^https?://!i', $redirectURL)) { |
571 | } | 571 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
572 | } | 572 | } |
573 | } | 573 | if ($this->validateURL($redirectURL)) { |
574 | 574 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | |
575 | public function handleCurlResponse($response, $info, $request) { | 575 | // store any cookies |
576 | $orig = $request->url_original; | 576 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
577 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); | 577 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
578 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); | 578 | $this->redirectQueue[$orig] = $redirectURL; |
579 | $this->requests[$orig]['method'] = $request->method; | 579 | } else { |
580 | $this->requests[$orig]['effective_url'] = $info['url']; | 580 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
581 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; | 581 | } |
582 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 582 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
583 | $this->requests[$orig]['location'] = trim($match[1]); | 583 | // check for <meta name='fragment' content='!'/> |
584 | } | 584 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
585 | } | 585 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
586 | 586 | if (isset($this->requests[$orig]['body'])) { | |
587 | protected function headersToString(array $headers, $associative=true) { | 587 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
588 | if (!$associative) { | 588 | if ($redirectURL) { |
589 | return implode("\n", $headers); | 589 | $this->redirectQueue[$orig] = $redirectURL; |
590 | } else { | 590 | } |
591 | $str = ''; | 591 | } |
592 | foreach ($headers as $key => $val) { | 592 | } |
593 | if (is_array($val)) { | 593 | } |
594 | foreach ($val as $v) $str .= "$key: $v\n"; | 594 | } else { |
595 | } else { | 595 | $this->debug('Error retrieving URL'); |
596 | $str .= "$key: $val\n"; | 596 | //print_r($req_url); |
597 | } | 597 | //print_r($http_response_header); |
598 | } | 598 | //print_r($html); |
599 | return rtrim($str); | 599 | |
600 | } | 600 | // TODO: handle error - failed to retrieve URL |
601 | } | 601 | } |
602 | 602 | } | |
603 | public function get($url, $remove=false, $gzdecode=true) { | 603 | } |
604 | $url = "$url"; | 604 | } |
605 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 605 | } |
606 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | 606 | |
607 | $response = $this->requests[$url]; | 607 | public function handleCurlResponse($response, $info, $request) { |
608 | /* | 608 | $orig = $request->url_original; |
609 | } elseif ($this->isCached($url)) { | 609 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); |
610 | $this->debug("URL already fetched - in disk cache ($url)"); | 610 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); |
611 | $response = $this->getCached($url); | 611 | $this->requests[$orig]['method'] = $request->method; |
612 | $this->requests[$url] = $response; | 612 | $this->requests[$orig]['effective_url'] = $info['url']; |
613 | */ | 613 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; |
614 | } else { | 614 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
615 | $this->debug("Fetching URL ($url)"); | 615 | $this->requests[$orig]['location'] = trim($match[1]); |
616 | $this->fetchAll(array($url)); | 616 | } |
617 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 617 | } |
618 | $response = $this->requests[$url]; | 618 | |
619 | } else { | 619 | protected function headersToString(array $headers, $associative=true) { |
620 | $this->debug("Request failed"); | 620 | if (!$associative) { |
621 | $response = false; | 621 | return implode("\n", $headers); |
622 | } | 622 | } else { |
623 | } | 623 | $str = ''; |
624 | /* | 624 | foreach ($headers as $key => $val) { |
625 | if ($this->minimiseMemoryUse && $response) { | 625 | if (is_array($val)) { |
626 | $this->cache($url); | 626 | foreach ($val as $v) $str .= "$key: $v\n"; |
627 | unset($this->requests[$url]); | 627 | } else { |
628 | } | 628 | $str .= "$key: $val\n"; |
629 | */ | 629 | } |
630 | if ($remove && $response) unset($this->requests[$url]); | 630 | } |
631 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { | 631 | return rtrim($str); |
632 | if ($html = gzdecode($response['body'])) { | 632 | } |
633 | $response['body'] = $html; | 633 | } |
634 | } | 634 | |
635 | } | 635 | public function get($url, $remove=false, $gzdecode=true) { |
636 | return $response; | 636 | $url = "$url"; |
637 | } | 637 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
638 | 638 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | |
639 | public function parallelSupport() { | 639 | $response = $this->requests[$url]; |
640 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); | 640 | /* |
641 | } | 641 | } elseif ($this->isCached($url)) { |
642 | 642 | $this->debug("URL already fetched - in disk cache ($url)"); | |
643 | private function headerOnlyType($headers) { | 643 | $response = $this->getCached($url); |
644 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { | 644 | $this->requests[$url] = $response; |
645 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) | 645 | */ |
646 | $match[1] = strtolower(trim($match[1])); | 646 | } else { |
647 | $match[2] = strtolower(trim($match[2])); | 647 | $this->debug("Fetching URL ($url)"); |
648 | foreach (array($match[1], $match[2]) as $mime) { | 648 | $this->fetchAll(array($url)); |
649 | if (in_array($mime, $this->headerOnlyTypes)) return true; | 649 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
650 | } | 650 | $response = $this->requests[$url]; |
651 | } | 651 | } else { |
652 | return false; | 652 | $this->debug("Request failed"); |
653 | } | 653 | $response = false; |
654 | 654 | } | |
655 | private function possibleUnsupportedType($url) { | 655 | } |
656 | $path = @parse_url($url, PHP_URL_PATH); | 656 | /* |
657 | if ($path && strpos($path, '.') !== false) { | 657 | if ($this->minimiseMemoryUse && $response) { |
658 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); | 658 | $this->cache($url); |
659 | return in_array($ext, $this->headerOnlyClues); | 659 | unset($this->requests[$url]); |
660 | } | 660 | } |
661 | return false; | 661 | */ |
662 | } | 662 | if ($remove && $response) unset($this->requests[$url]); |
663 | } | 663 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { |
664 | 664 | if ($html = gzdecode($response['body'])) { | |
665 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 | 665 | $response['body'] = $html; |
666 | if (!function_exists('gzdecode')) { | 666 | } |
667 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) | 667 | } |
668 | { | 668 | return $response; |
669 | $len = strlen($data); | 669 | } |
670 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { | 670 | |
671 | $error = "Not in GZIP format."; | 671 | public function parallelSupport() { |
672 | return null; // Not GZIP format (See RFC 1952) | 672 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); |
673 | } | 673 | } |
674 | $method = ord(substr($data,2,1)); // Compression method | 674 | |
675 | $flags = ord(substr($data,3,1)); // Flags | 675 | private function headerOnlyType($headers) { |
676 | if ($flags & 31 != $flags) { | 676 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { |
677 | $error = "Reserved bits not allowed."; | 677 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) |
678 | return null; | 678 | $match[1] = strtolower(trim($match[1])); |
679 | } | 679 | $match[2] = strtolower(trim($match[2])); |
680 | // NOTE: $mtime may be negative (PHP integer limitations) | 680 | foreach (array($match[1], $match[2]) as $mime) { |
681 | $mtime = unpack("V", substr($data,4,4)); | 681 | if (in_array($mime, $this->headerOnlyTypes)) return true; |
682 | $mtime = $mtime[1]; | 682 | } |
683 | $xfl = substr($data,8,1); | 683 | } |
684 | $os = substr($data,8,1); | 684 | return false; |
685 | $headerlen = 10; | 685 | } |
686 | $extralen = 0; | 686 | |
687 | $extra = ""; | 687 | private function possibleUnsupportedType($url) { |
688 | if ($flags & 4) { | 688 | $path = @parse_url($url, PHP_URL_PATH); |
689 | // 2-byte length prefixed EXTRA data in header | 689 | if ($path && strpos($path, '.') !== false) { |
690 | if ($len - $headerlen - 2 < 8) { | 690 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); |
691 | return false; // invalid | 691 | return in_array($ext, $this->headerOnlyClues); |
692 | } | 692 | } |
693 | $extralen = unpack("v",substr($data,8,2)); | 693 | return false; |
694 | $extralen = $extralen[1]; | 694 | } |
695 | if ($len - $headerlen - 2 - $extralen < 8) { | 695 | } |
696 | return false; // invalid | 696 | |
697 | } | 697 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 |
698 | $extra = substr($data,10,$extralen); | 698 | if (!function_exists('gzdecode')) { |
699 | $headerlen += 2 + $extralen; | 699 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) |
700 | } | 700 | { |
701 | $filenamelen = 0; | 701 | $len = strlen($data); |
702 | $filename = ""; | 702 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { |
703 | if ($flags & 8) { | 703 | $error = "Not in GZIP format."; |
704 | // C-style string | 704 | return null; // Not GZIP format (See RFC 1952) |
705 | if ($len - $headerlen - 1 < 8) { | 705 | } |
706 | return false; // invalid | 706 | $method = ord(substr($data,2,1)); // Compression method |
707 | } | 707 | $flags = ord(substr($data,3,1)); // Flags |
708 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); | 708 | if ($flags & 31 != $flags) { |
709 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { | 709 | $error = "Reserved bits not allowed."; |
710 | return false; // invalid | 710 | return null; |
711 | } | 711 | } |
712 | $filename = substr($data,$headerlen,$filenamelen); | 712 | // NOTE: $mtime may be negative (PHP integer limitations) |
713 | $headerlen += $filenamelen + 1; | 713 | $mtime = unpack("V", substr($data,4,4)); |
714 | } | 714 | $mtime = $mtime[1]; |
715 | $commentlen = 0; | 715 | $xfl = substr($data,8,1); |
716 | $comment = ""; | 716 | $os = substr($data,8,1); |
717 | if ($flags & 16) { | 717 | $headerlen = 10; |
718 | // C-style string COMMENT data in header | 718 | $extralen = 0; |
719 | if ($len - $headerlen - 1 < 8) { | 719 | $extra = ""; |
720 | return false; // invalid | 720 | if ($flags & 4) { |
721 | } | 721 | // 2-byte length prefixed EXTRA data in header |
722 | $commentlen = strpos(substr($data,$headerlen),chr(0)); | 722 | if ($len - $headerlen - 2 < 8) { |
723 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { | 723 | return false; // invalid |
724 | return false; // Invalid header format | 724 | } |
725 | } | 725 | $extralen = unpack("v",substr($data,8,2)); |
726 | $comment = substr($data,$headerlen,$commentlen); | 726 | $extralen = $extralen[1]; |
727 | $headerlen += $commentlen + 1; | 727 | if ($len - $headerlen - 2 - $extralen < 8) { |
728 | } | 728 | return false; // invalid |
729 | $headercrc = ""; | 729 | } |
730 | if ($flags & 2) { | 730 | $extra = substr($data,10,$extralen); |
731 | // 2-bytes (lowest order) of CRC32 on header present | 731 | $headerlen += 2 + $extralen; |
732 | if ($len - $headerlen - 2 < 8) { | 732 | } |
733 | return false; // invalid | 733 | $filenamelen = 0; |
734 | } | 734 | $filename = ""; |
735 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; | 735 | if ($flags & 8) { |
736 | $headercrc = unpack("v", substr($data,$headerlen,2)); | 736 | // C-style string |
737 | $headercrc = $headercrc[1]; | 737 | if ($len - $headerlen - 1 < 8) { |
738 | if ($headercrc != $calccrc) { | 738 | return false; // invalid |
739 | $error = "Header checksum failed."; | 739 | } |
740 | return false; // Bad header CRC | 740 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); |
741 | } | 741 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { |
742 | $headerlen += 2; | 742 | return false; // invalid |
743 | } | 743 | } |
744 | // GZIP FOOTER | 744 | $filename = substr($data,$headerlen,$filenamelen); |
745 | $datacrc = unpack("V",substr($data,-8,4)); | 745 | $headerlen += $filenamelen + 1; |
746 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); | 746 | } |
747 | $isize = unpack("V",substr($data,-4)); | 747 | $commentlen = 0; |
748 | $isize = $isize[1]; | 748 | $comment = ""; |
749 | // decompression: | 749 | if ($flags & 16) { |
750 | $bodylen = $len-$headerlen-8; | 750 | // C-style string COMMENT data in header |
751 | if ($bodylen < 1) { | 751 | if ($len - $headerlen - 1 < 8) { |
752 | // IMPLEMENTATION BUG! | 752 | return false; // invalid |
753 | return null; | 753 | } |
754 | } | 754 | $commentlen = strpos(substr($data,$headerlen),chr(0)); |
755 | $body = substr($data,$headerlen,$bodylen); | 755 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { |
756 | $data = ""; | 756 | return false; // Invalid header format |
757 | if ($bodylen > 0) { | 757 | } |
758 | switch ($method) { | 758 | $comment = substr($data,$headerlen,$commentlen); |
759 | case 8: | 759 | $headerlen += $commentlen + 1; |
760 | // Currently the only supported compression method: | 760 | } |
761 | $data = gzinflate($body,$maxlength); | 761 | $headercrc = ""; |
762 | break; | 762 | if ($flags & 2) { |
763 | default: | 763 | // 2-bytes (lowest order) of CRC32 on header present |
764 | $error = "Unknown compression method."; | 764 | if ($len - $headerlen - 2 < 8) { |
765 | return false; | 765 | return false; // invalid |
766 | } | 766 | } |
767 | } // zero-byte body content is allowed | 767 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; |
768 | // Verifiy CRC32 | 768 | $headercrc = unpack("v", substr($data,$headerlen,2)); |
769 | $crc = sprintf("%u",crc32($data)); | 769 | $headercrc = $headercrc[1]; |
770 | $crcOK = $crc == $datacrc; | 770 | if ($headercrc != $calccrc) { |
771 | $lenOK = $isize == strlen($data); | 771 | $error = "Header checksum failed."; |
772 | if (!$lenOK || !$crcOK) { | 772 | return false; // Bad header CRC |
773 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | 773 | } |
774 | return false; | 774 | $headerlen += 2; |
775 | } | 775 | } |
776 | return $data; | 776 | // GZIP FOOTER |
777 | } | 777 | $datacrc = unpack("V",substr($data,-8,4)); |
778 | } | 778 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); |
779 | ?> \ No newline at end of file | 779 | $isize = unpack("V",substr($data,-4)); |
780 | $isize = $isize[1]; | ||
781 | // decompression: | ||
782 | $bodylen = $len-$headerlen-8; | ||
783 | if ($bodylen < 1) { | ||
784 | // IMPLEMENTATION BUG! | ||
785 | return null; | ||
786 | } | ||
787 | $body = substr($data,$headerlen,$bodylen); | ||
788 | $data = ""; | ||
789 | if ($bodylen > 0) { | ||
790 | switch ($method) { | ||
791 | case 8: | ||
792 | // Currently the only supported compression method: | ||
793 | $data = gzinflate($body,$maxlength); | ||
794 | break; | ||
795 | default: | ||
796 | $error = "Unknown compression method."; | ||
797 | return false; | ||
798 | } | ||
799 | } // zero-byte body content is allowed | ||
800 | // Verifiy CRC32 | ||
801 | $crc = sprintf("%u",crc32($data)); | ||
802 | $crcOK = $crc == $datacrc; | ||
803 | $lenOK = $isize == strlen($data); | ||
804 | if (!$lenOK || !$crcOK) { | ||
805 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | ||
806 | return false; | ||
807 | } | ||
808 | return $data; | ||
809 | } | ||
810 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php index ecd46d5f..c524a1ee 100644 --- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php | |||
@@ -1,79 +1,78 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Humble HTTP Agent extension for SimplePie_File | 3 | * Humble HTTP Agent extension for SimplePie_File |
4 | * | 4 | * |
5 | * This class is designed to extend and override SimplePie_File | 5 | * This class is designed to extend and override SimplePie_File |
6 | * in order to prevent duplicate HTTP requests being sent out. | 6 | * in order to prevent duplicate HTTP requests being sent out. |
7 | * The idea is to initialise an instance of Humble HTTP Agent | 7 | * The idea is to initialise an instance of Humble HTTP Agent |
8 | * and attach it, to a static class variable, of this class. | 8 | * and attach it, to a static class variable, of this class. |
9 | * SimplePie will then automatically initialise this class | 9 | * SimplePie will then automatically initialise this class |
10 | * | 10 | * |
11 | * @date 2011-02-28 | 11 | * @date 2011-02-28 |
12 | */ | 12 | */ |
13 | 13 | ||
14 | class SimplePie_HumbleHttpAgent extends SimplePie_File | 14 | class SimplePie_HumbleHttpAgent extends SimplePie_File |
15 | { | 15 | { |
16 | protected static $agent; | 16 | protected static $agent; |
17 | var $url; | 17 | var $url; |
18 | var $useragent; | 18 | var $useragent; |
19 | var $success = true; | 19 | var $success = true; |
20 | var $headers = array(); | 20 | var $headers = array(); |
21 | var $body; | 21 | var $body; |
22 | var $status_code; | 22 | var $status_code; |
23 | var $redirects = 0; | 23 | var $redirects = 0; |
24 | var $error; | 24 | var $error; |
25 | var $method = SIMPLEPIE_FILE_SOURCE_NONE; | 25 | var $method = SIMPLEPIE_FILE_SOURCE_NONE; |
26 | 26 | ||
27 | public static function set_agent(HumbleHttpAgent $agent) { | 27 | public static function set_agent(HumbleHttpAgent $agent) { |
28 | self::$agent = $agent; | 28 | self::$agent = $agent; |
29 | } | 29 | } |
30 | 30 | ||
31 | public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { | 31 | public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { |
32 | if (class_exists('idna_convert')) | 32 | if (class_exists('idna_convert')) |
33 | { | 33 | { |
34 | $idn = new idna_convert(); | 34 | $idn = new idna_convert(); |
35 | $parsed = SimplePie_Misc::parse_url($url); | 35 | $parsed = SimplePie_Misc::parse_url($url); |
36 | $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); | 36 | $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); |
37 | } | 37 | } |
38 | $this->url = $url; | 38 | $this->url = $url; |
39 | $this->useragent = $useragent; | 39 | $this->useragent = $useragent; |
40 | if (preg_match('/^http(s)?:\/\//i', $url)) | 40 | if (preg_match('/^http(s)?:\/\//i', $url)) |
41 | { | 41 | { |
42 | if (!is_array($headers)) | 42 | if (!is_array($headers)) |
43 | { | 43 | { |
44 | $headers = array(); | 44 | $headers = array(); |
45 | } | 45 | } |
46 | $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; | 46 | $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; |
47 | $headers2 = array(); | 47 | $headers2 = array(); |
48 | foreach ($headers as $key => $value) { | 48 | foreach ($headers as $key => $value) { |
49 | $headers2[] = "$key: $value"; | 49 | $headers2[] = "$key: $value"; |
50 | } | 50 | } |
51 | //TODO: allow for HTTP headers | 51 | //TODO: allow for HTTP headers |
52 | // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); | 52 | // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); |
53 | 53 | ||
54 | $response = self::$agent->get($url); | 54 | $response = self::$agent->get($url); |
55 | 55 | ||
56 | if ($response === false || !isset($response['status_code'])) { | 56 | if ($response === false || !isset($response['status_code'])) { |
57 | $this->error = 'failed to fetch URL'; | 57 | $this->error = 'failed to fetch URL'; |
58 | $this->success = false; | 58 | $this->success = false; |
59 | } else { | 59 | } else { |
60 | // The extra lines at the end are there to satisfy SimplePie's HTTP parser. | 60 | // The extra lines at the end are there to satisfy SimplePie's HTTP parser. |
61 | // The class expects a full HTTP message, whereas we're giving it only | 61 | // The class expects a full HTTP message, whereas we're giving it only |
62 | // headers - the new lines indicate the start of the body. | 62 | // headers - the new lines indicate the start of the body. |
63 | $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); | 63 | $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); |
64 | if ($parser->parse()) { | 64 | if ($parser->parse()) { |
65 | $this->headers = $parser->headers; | 65 | $this->headers = $parser->headers; |
66 | //$this->body = $parser->body; | 66 | //$this->body = $parser->body; |
67 | $this->body = $response['body']; | 67 | $this->body = $response['body']; |
68 | $this->status_code = $parser->status_code; | 68 | $this->status_code = $parser->status_code; |
69 | } | 69 | } |
70 | } | 70 | } |
71 | } | 71 | } |
72 | else | 72 | else |
73 | { | 73 | { |
74 | $this->error = 'invalid URL'; | 74 | $this->error = 'invalid URL'; |
75 | $this->success = false; | 75 | $this->success = false; |
76 | } | 76 | } |
77 | } | 77 | } |
78 | } | 78 | } \ No newline at end of file |
79 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php index 09b11546..382d869c 100644 --- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php | |||
@@ -6,23 +6,24 @@ | |||
6 | * Attempts to detect the language of a sample of text by correlating ranked | 6 | * Attempts to detect the language of a sample of text by correlating ranked |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | 7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. |
8 | * | 8 | * |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | 9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle |
10 | * (1994): "N-Gram-Based Text Categorization" | 10 | * (1994): "N-Gram-Based Text Categorization" |
11 | * | 11 | * |
12 | * PHP versions 4 and 5 | 12 | * PHP version 5 |
13 | * | 13 | * |
14 | * @category Text | 14 | * @category Text |
15 | * @package Text_LanguageDetect | 15 | * @package Text_LanguageDetect |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
17 | * @copyright 2005-2006 Nicholas Pisarro | 17 | * @copyright 2005-2006 Nicholas Pisarro |
18 | * @license http://www.debian.org/misc/bsd.license BSD | 18 | * @license http://www.debian.org/misc/bsd.license BSD |
19 | * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ | 19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | 20 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
21 | * @link http://langdetect.blogspot.com/ | 21 | * @link http://langdetect.blogspot.com/ |
22 | */ | 22 | */ |
23 | 23 | ||
24 | //require_once 'PEAR.php'; | 24 | require_once 'LanguageDetect/Exception.php'; |
25 | require_once 'Parser.php'; | 25 | require_once 'LanguageDetect/Parser.php'; |
26 | require_once 'LanguageDetect/ISO639.php'; | ||
26 | 27 | ||
27 | /** | 28 | /** |
28 | * Language detection class | 29 | * Language detection class |
@@ -41,9 +42,10 @@ require_once 'Parser.php'; | |||
41 | * | 42 | * |
42 | * echo "Supported languages:\n"; | 43 | * echo "Supported languages:\n"; |
43 | * | 44 | * |
44 | * $langs = $l->getLanguages(); | 45 | * try { |
45 | * if (PEAR::isError($langs)) { | 46 | * $langs = $l->getLanguages(); |
46 | * die($langs->getMessage()); | 47 | * } catch (Text_LanguageDetect_Exception $e) { |
48 | * die($e->getMessage()); | ||
47 | * } | 49 | * } |
48 | * | 50 | * |
49 | * sort($langs); | 51 | * sort($langs); |
@@ -54,38 +56,38 @@ require_once 'Parser.php'; | |||
54 | * } | 56 | * } |
55 | * </code> | 57 | * </code> |
56 | * | 58 | * |
57 | * @category Text | 59 | * @category Text |
58 | * @package Text_LanguageDetect | 60 | * @package Text_LanguageDetect |
59 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
60 | * @copyright 2005 Nicholas Pisarro | 62 | * @copyright 2005 Nicholas Pisarro |
61 | * @license http://www.debian.org/misc/bsd.license BSD | 63 | * @license http://www.debian.org/misc/bsd.license BSD |
62 | * @version Release: @package_version@ | 64 | * @version Release: @package_version@ |
63 | * @todo allow users to generate their own language models | 65 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
66 | * @todo allow users to generate their own language models | ||
64 | */ | 67 | */ |
65 | |||
66 | class Text_LanguageDetect | 68 | class Text_LanguageDetect |
67 | { | 69 | { |
68 | /** | 70 | /** |
69 | * The filename that stores the trigram data for the detector | 71 | * The filename that stores the trigram data for the detector |
70 | * | 72 | * |
71 | * If this value starts with a slash (/) or a dot (.) the value of | 73 | * If this value starts with a slash (/) or a dot (.) the value of |
72 | * $this->_data_dir will be ignored | 74 | * $this->_data_dir will be ignored |
73 | * | 75 | * |
74 | * @var string | 76 | * @var string |
75 | * @access private | 77 | * @access private |
76 | */ | 78 | */ |
77 | var $_db_filename = './lang.dat'; | 79 | var $_db_filename = 'lang.dat'; |
78 | 80 | ||
79 | /** | 81 | /** |
80 | * The filename that stores the unicode block definitions | 82 | * The filename that stores the unicode block definitions |
81 | * | 83 | * |
82 | * If this value starts with a slash (/) or a dot (.) the value of | 84 | * If this value starts with a slash (/) or a dot (.) the value of |
83 | * $this->_data_dir will be ignored | 85 | * $this->_data_dir will be ignored |
84 | * | 86 | * |
85 | * @var string | 87 | * @var string |
86 | * @access private | 88 | * @access private |
87 | */ | 89 | */ |
88 | var $_unicode_db_filename = './unicode_blocks.dat'; | 90 | var $_unicode_db_filename = 'unicode_blocks.dat'; |
89 | 91 | ||
90 | /** | 92 | /** |
91 | * The data directory | 93 | * The data directory |
@@ -99,11 +101,8 @@ class Text_LanguageDetect | |||
99 | 101 | ||
100 | /** | 102 | /** |
101 | * The trigram data for comparison | 103 | * The trigram data for comparison |
102 | * | ||
103 | * Will be loaded on start from $this->_db_filename | ||
104 | * | 104 | * |
105 | * May be set to a PEAR_Error object if there is an error during its | 105 | * Will be loaded on start from $this->_db_filename |
106 | * initialization | ||
107 | * | 106 | * |
108 | * @var array | 107 | * @var array |
109 | * @access private | 108 | * @access private |
@@ -120,7 +119,7 @@ class Text_LanguageDetect | |||
120 | 119 | ||
121 | /** | 120 | /** |
122 | * The size of the trigram data arrays | 121 | * The size of the trigram data arrays |
123 | * | 122 | * |
124 | * @var int | 123 | * @var int |
125 | * @access private | 124 | * @access private |
126 | */ | 125 | */ |
@@ -140,7 +139,7 @@ class Text_LanguageDetect | |||
140 | 139 | ||
141 | /** | 140 | /** |
142 | * Whether or not to simulate perl's Language::Guess exactly | 141 | * Whether or not to simulate perl's Language::Guess exactly |
143 | * | 142 | * |
144 | * @access private | 143 | * @access private |
145 | * @var bool | 144 | * @var bool |
146 | * @see setPerlCompatible() | 145 | * @see setPerlCompatible() |
@@ -165,18 +164,24 @@ class Text_LanguageDetect | |||
165 | var $_clusters; | 164 | var $_clusters; |
166 | 165 | ||
167 | /** | 166 | /** |
167 | * Which type of "language names" are accepted and returned: | ||
168 | * | ||
169 | * 0 - language name ("english") | ||
170 | * 2 - 2-letter ISO 639-1 code ("en") | ||
171 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
172 | */ | ||
173 | var $_name_mode = 0; | ||
174 | |||
175 | /** | ||
168 | * Constructor | 176 | * Constructor |
169 | * | 177 | * |
170 | * Will attempt to load the language database. If it fails, you will get | 178 | * Will attempt to load the language database. If it fails, you will get |
171 | * a PEAR_Error object returned when you try to use detect() | 179 | * an exception. |
172 | * | ||
173 | */ | 180 | */ |
174 | function Text_LanguageDetect($db=null, $unicode_db=null) | 181 | function __construct() |
175 | { | 182 | { |
176 | if (isset($db)) $this->_db_filename = $db; | ||
177 | if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; | ||
178 | |||
179 | $data = $this->_readdb($this->_db_filename); | 183 | $data = $this->_readdb($this->_db_filename); |
184 | $this->_checkTrigram($data['trigram']); | ||
180 | $this->_lang_db = $data['trigram']; | 185 | $this->_lang_db = $data['trigram']; |
181 | 186 | ||
182 | if (isset($data['trigram-unicodemap'])) { | 187 | if (isset($data['trigram-unicodemap'])) { |
@@ -186,29 +191,32 @@ class Text_LanguageDetect | |||
186 | // Not yet implemented: | 191 | // Not yet implemented: |
187 | if (isset($data['trigram-clusters'])) { | 192 | if (isset($data['trigram-clusters'])) { |
188 | $this->_clusters = $data['trigram-clusters']; | 193 | $this->_clusters = $data['trigram-clusters']; |
189 | } | 194 | } |
190 | } | 195 | } |
191 | 196 | ||
192 | /** | 197 | /** |
193 | * Returns the path to the location of the database | 198 | * Returns the path to the location of the database |
194 | * | 199 | * |
195 | * @access private | 200 | * @param string $fname File name to load |
196 | * @return string expected path to the language model database | 201 | * |
202 | * @return string expected path to the language model database | ||
203 | * @access private | ||
197 | */ | 204 | */ |
198 | function _get_data_loc($fname) | 205 | function _get_data_loc($fname) |
199 | { | 206 | { |
200 | return $fname; | 207 | return dirname(__FILE__).'/'.$fname; |
201 | } | 208 | } |
202 | 209 | ||
203 | /** | 210 | /** |
204 | * Loads the language trigram database from filename | 211 | * Loads the language trigram database from filename |
205 | * | 212 | * |
206 | * Trigram datbase should be a serialize()'d array | 213 | * Trigram datbase should be a serialize()'d array |
207 | * | 214 | * |
208 | * @access private | 215 | * @param string $fname the filename where the data is stored |
209 | * @param string $fname the filename where the data is stored | 216 | * |
210 | * @return array the language model data | 217 | * @return array the language model data |
211 | * @throws PEAR_Error | 218 | * @throws Text_LanguageDetect_Exception |
219 | * @access private | ||
212 | */ | 220 | */ |
213 | function _readdb($fname) | 221 | function _readdb($fname) |
214 | { | 222 | { |
@@ -217,79 +225,74 @@ class Text_LanguageDetect | |||
217 | 225 | ||
218 | // input check | 226 | // input check |
219 | if (!file_exists($fname)) { | 227 | if (!file_exists($fname)) { |
220 | throw new Exception('Language database does not exist.'); | 228 | throw new Text_LanguageDetect_Exception( |
229 | 'Language database does not exist: ' . $fname, | ||
230 | Text_LanguageDetect_Exception::DB_NOT_FOUND | ||
231 | ); | ||
221 | } elseif (!is_readable($fname)) { | 232 | } elseif (!is_readable($fname)) { |
222 | throw new Exception('Language database is not readable.'); | 233 | throw new Text_LanguageDetect_Exception( |
234 | 'Language database is not readable: ' . $fname, | ||
235 | Text_LanguageDetect_Exception::DB_NOT_READABLE | ||
236 | ); | ||
223 | } | 237 | } |
224 | 238 | ||
225 | if (function_exists('file_get_contents')) { | 239 | return unserialize(file_get_contents($fname)); |
226 | return unserialize(file_get_contents($fname)); | ||
227 | } else { | ||
228 | // if you don't have file_get_contents(), | ||
229 | // then this is the next fastest way | ||
230 | ob_start(); | ||
231 | readfile($fname); | ||
232 | $contents = ob_get_contents(); | ||
233 | ob_end_clean(); | ||
234 | return unserialize($contents); | ||
235 | } | ||
236 | } | 240 | } |
237 | 241 | ||
238 | 242 | ||
239 | /** | 243 | /** |
240 | * Checks if this object is ready to detect languages | 244 | * Checks if this object is ready to detect languages |
241 | * | 245 | * |
242 | * @access private | 246 | * @param array $trigram Trigram data from database |
243 | * @param mixed &$err error object to be returned by reference, if any | 247 | * |
244 | * @return bool true if no errors | 248 | * @return void |
249 | * @access private | ||
245 | */ | 250 | */ |
246 | function _setup_ok(&$err) | 251 | function _checkTrigram($trigram) |
247 | { | 252 | { |
248 | if (!is_array($this->_lang_db)) { | 253 | if (!is_array($trigram)) { |
249 | if (ini_get('magic_quotes_runtime')) { | 254 | if (ini_get('magic_quotes_runtime')) { |
250 | throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); | 255 | throw new Text_LanguageDetect_Exception( |
251 | } else { | 256 | 'Error loading database. Try turning magic_quotes_runtime off.', |
252 | throw new Exception('Language database is not an array.'); | 257 | Text_LanguageDetect_Exception::MAGIC_QUOTES |
258 | ); | ||
253 | } | 259 | } |
254 | return false; | 260 | throw new Text_LanguageDetect_Exception( |
255 | 261 | 'Language database is not an array.', | |
256 | } elseif (empty($this->_lang_db)) { | 262 | Text_LanguageDetect_Exception::DB_NOT_ARRAY |
257 | throw new Exception('Language database has no elements.'); | 263 | ); |
258 | return false; | 264 | } elseif (empty($trigram)) { |
259 | 265 | throw new Text_LanguageDetect_Exception( | |
260 | } else { | 266 | 'Language database has no elements.', |
261 | return true; | 267 | Text_LanguageDetect_Exception::DB_EMPTY |
268 | ); | ||
262 | } | 269 | } |
263 | } | 270 | } |
264 | 271 | ||
265 | /** | 272 | /** |
266 | * Omits languages | 273 | * Omits languages |
267 | * | 274 | * |
268 | * Pass this function the name of or an array of names of | 275 | * Pass this function the name of or an array of names of |
269 | * languages that you don't want considered | 276 | * languages that you don't want considered |
270 | * | 277 | * |
271 | * If you're only expecting a limited set of languages, this can greatly | 278 | * If you're only expecting a limited set of languages, this can greatly |
272 | * speed up processing | 279 | * speed up processing |
273 | * | 280 | * |
274 | * @access public | 281 | * @param mixed $omit_list language name or array of names to omit |
275 | * @param mixed $omit_list language name or array of names to omit | 282 | * @param bool $include_only if true will include (rather than |
276 | * @param bool $include_only if true will include (rather than | 283 | * exclude) only those in the list |
277 | * exclude) only those in the list | 284 | * |
278 | * @return int number of languages successfully deleted | 285 | * @return int number of languages successfully deleted |
279 | * @throws PEAR_Error | 286 | * @throws Text_LanguageDetect_Exception |
280 | */ | 287 | */ |
281 | function omitLanguages($omit_list, $include_only = false) | 288 | public function omitLanguages($omit_list, $include_only = false) |
282 | { | 289 | { |
283 | |||
284 | // setup check | ||
285 | if (!$this->_setup_ok($err)) { | ||
286 | return $err; | ||
287 | } | ||
288 | |||
289 | $deleted = 0; | 290 | $deleted = 0; |
290 | 291 | ||
291 | // deleting the given languages | 292 | $omit_list = $this->_convertFromNameMode($omit_list); |
293 | |||
292 | if (!$include_only) { | 294 | if (!$include_only) { |
295 | // deleting the given languages | ||
293 | if (!is_array($omit_list)) { | 296 | if (!is_array($omit_list)) { |
294 | $omit_list = strtolower($omit_list); // case desensitize | 297 | $omit_list = strtolower($omit_list); // case desensitize |
295 | if (isset($this->_lang_db[$omit_list])) { | 298 | if (isset($this->_lang_db[$omit_list])) { |
@@ -301,12 +304,12 @@ class Text_LanguageDetect | |||
301 | if (isset($this->_lang_db[$omit_lang])) { | 304 | if (isset($this->_lang_db[$omit_lang])) { |
302 | unset($this->_lang_db[$omit_lang]); | 305 | unset($this->_lang_db[$omit_lang]); |
303 | $deleted++; | 306 | $deleted++; |
304 | } | 307 | } |
305 | } | 308 | } |
306 | } | 309 | } |
307 | 310 | ||
308 | // deleting all except the given languages | ||
309 | } else { | 311 | } else { |
312 | // deleting all except the given languages | ||
310 | if (!is_array($omit_list)) { | 313 | if (!is_array($omit_list)) { |
311 | $omit_list = array($omit_list); | 314 | $omit_list = array($omit_list); |
312 | } | 315 | } |
@@ -327,7 +330,7 @@ class Text_LanguageDetect | |||
327 | // reset the cluster cache if the number of languages changes | 330 | // reset the cluster cache if the number of languages changes |
328 | // this will then have to be recalculated | 331 | // this will then have to be recalculated |
329 | if (isset($this->_clusters) && $deleted > 0) { | 332 | if (isset($this->_clusters) && $deleted > 0) { |
330 | unset($this->_clusters); | 333 | $this->_clusters = null; |
331 | } | 334 | } |
332 | 335 | ||
333 | return $deleted; | 336 | return $deleted; |
@@ -339,49 +342,40 @@ class Text_LanguageDetect | |||
339 | * | 342 | * |
340 | * @access public | 343 | * @access public |
341 | * @return int the number of languages | 344 | * @return int the number of languages |
342 | * @throws PEAR_Error | 345 | * @throws Text_LanguageDetect_Exception |
343 | */ | 346 | */ |
344 | function getLanguageCount() | 347 | function getLanguageCount() |
345 | { | 348 | { |
346 | if (!$this->_setup_ok($err)) { | 349 | return count($this->_lang_db); |
347 | return $err; | ||
348 | } else { | ||
349 | return count($this->_lang_db); | ||
350 | } | ||
351 | } | 350 | } |
352 | 351 | ||
353 | /** | 352 | /** |
354 | * Returns true if a given language exists | 353 | * Checks if the language with the given name exists in the database |
355 | * | 354 | * |
356 | * If passed an array of names, will return true only if all exist | 355 | * @param mixed $lang Language name or array of language names |
357 | * | 356 | * |
358 | * @access public | 357 | * @return bool true if language model exists |
359 | * @param mixed $lang language name or array of language names | ||
360 | * @return bool true if language model exists | ||
361 | * @throws PEAR_Error | ||
362 | */ | 358 | */ |
363 | function languageExists($lang) | 359 | public function languageExists($lang) |
364 | { | 360 | { |
365 | if (!$this->_setup_ok($err)) { | 361 | $lang = $this->_convertFromNameMode($lang); |
366 | return $err; | ||
367 | } else { | ||
368 | // string | ||
369 | if (is_string($lang)) { | ||
370 | return isset($this->_lang_db[strtolower($lang)]); | ||
371 | |||
372 | // array | ||
373 | } elseif (is_array($lang)) { | ||
374 | foreach ($lang as $test_lang) { | ||
375 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
376 | return false; | ||
377 | } | ||
378 | } | ||
379 | return true; | ||
380 | 362 | ||
381 | // other (error) | 363 | if (is_string($lang)) { |
382 | } else { | 364 | return isset($this->_lang_db[strtolower($lang)]); |
383 | throw new Exception('Unknown type passed to languageExists()'); | 365 | |
366 | } elseif (is_array($lang)) { | ||
367 | foreach ($lang as $test_lang) { | ||
368 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
369 | return false; | ||
370 | } | ||
384 | } | 371 | } |
372 | return true; | ||
373 | |||
374 | } else { | ||
375 | throw new Text_LanguageDetect_Exception( | ||
376 | 'Unsupported parameter type passed to languageExists()', | ||
377 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
378 | ); | ||
385 | } | 379 | } |
386 | } | 380 | } |
387 | 381 | ||
@@ -389,25 +383,24 @@ class Text_LanguageDetect | |||
389 | * Returns the list of detectable languages | 383 | * Returns the list of detectable languages |
390 | * | 384 | * |
391 | * @access public | 385 | * @access public |
392 | * @return array the names of the languages known to this object | 386 | * @return array the names of the languages known to this object<<<<<<< |
393 | * @throws PEAR_Error | 387 | * @throws Text_LanguageDetect_Exception |
394 | */ | 388 | */ |
395 | function getLanguages() | 389 | function getLanguages() |
396 | { | 390 | { |
397 | if (!$this->_setup_ok($err)) { | 391 | return $this->_convertToNameMode( |
398 | return $err; | 392 | array_keys($this->_lang_db) |
399 | } else { | 393 | ); |
400 | return array_keys($this->_lang_db); | ||
401 | } | ||
402 | } | 394 | } |
403 | 395 | ||
404 | /** | 396 | /** |
405 | * Make this object behave like Language::Guess | 397 | * Make this object behave like Language::Guess |
406 | * | 398 | * |
407 | * @access public | 399 | * @param bool $setting false to turn off perl compatibility |
408 | * @param bool $setting false to turn off perl compatibility | 400 | * |
401 | * @return void | ||
409 | */ | 402 | */ |
410 | function setPerlCompatible($setting = true) | 403 | public function setPerlCompatible($setting = true) |
411 | { | 404 | { |
412 | if (is_bool($setting)) { // input check | 405 | if (is_bool($setting)) { // input check |
413 | $this->_perl_compatible = $setting; | 406 | $this->_perl_compatible = $setting; |
@@ -422,6 +415,21 @@ class Text_LanguageDetect | |||
422 | } | 415 | } |
423 | 416 | ||
424 | /** | 417 | /** |
418 | * Sets the way how language names are accepted and returned. | ||
419 | * | ||
420 | * @param integer $name_mode One of the following modes: | ||
421 | * 0 - language name ("english") | ||
422 | * 2 - 2-letter ISO 639-1 code ("en") | ||
423 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
424 | * | ||
425 | * @return void | ||
426 | */ | ||
427 | function setNameMode($name_mode) | ||
428 | { | ||
429 | $this->_name_mode = $name_mode; | ||
430 | } | ||
431 | |||
432 | /** | ||
425 | * Whether to use unicode block ranges in detection | 433 | * Whether to use unicode block ranges in detection |
426 | * | 434 | * |
427 | * Should speed up most detections if turned on (detault is on). In some | 435 | * Should speed up most detections if turned on (detault is on). In some |
@@ -429,10 +437,11 @@ class Text_LanguageDetect | |||
429 | * in languages that use latin scripts. In other cases it should speed up | 437 | * in languages that use latin scripts. In other cases it should speed up |
430 | * detection noticeably. | 438 | * detection noticeably. |
431 | * | 439 | * |
432 | * @access public | 440 | * @param bool $setting false to turn off |
433 | * @param bool $setting false to turn off | 441 | * |
442 | * @return void | ||
434 | */ | 443 | */ |
435 | function useUnicodeBlocks($setting = true) | 444 | public function useUnicodeBlocks($setting = true) |
436 | { | 445 | { |
437 | if (is_bool($setting)) { | 446 | if (is_bool($setting)) { |
438 | $this->_use_unicode_narrowing = $setting; | 447 | $this->_use_unicode_narrowing = $setting; |
@@ -442,15 +451,15 @@ class Text_LanguageDetect | |||
442 | /** | 451 | /** |
443 | * Converts a piece of text into trigrams | 452 | * Converts a piece of text into trigrams |
444 | * | 453 | * |
445 | * Superceded by the Text_LanguageDetect_Parser class | 454 | * @param string $text text to convert |
446 | * | 455 | * |
447 | * @access private | 456 | * @return array array of trigram frequencies |
448 | * @param string $text text to convert | 457 | * @access private |
449 | * @return array array of trigram frequencies | 458 | * @deprecated Superceded by the Text_LanguageDetect_Parser class |
450 | */ | 459 | */ |
451 | function _trigram($text) | 460 | function _trigram($text) |
452 | { | 461 | { |
453 | $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); | 462 | $s = new Text_LanguageDetect_Parser($text); |
454 | $s->prepareTrigram(); | 463 | $s->prepareTrigram(); |
455 | $s->prepareUnicode(false); | 464 | $s->prepareUnicode(false); |
456 | $s->setPadStart(!$this->_perl_compatible); | 465 | $s->setPadStart(!$this->_perl_compatible); |
@@ -463,11 +472,12 @@ class Text_LanguageDetect | |||
463 | * | 472 | * |
464 | * Thresholds (cuts off) the list at $this->_threshold | 473 | * Thresholds (cuts off) the list at $this->_threshold |
465 | * | 474 | * |
466 | * @access protected | 475 | * @param array $arr array of trigram |
467 | * @param array $arr array of trgram | 476 | * |
468 | * @return array ranks of trigrams | 477 | * @return array ranks of trigrams |
478 | * @access protected | ||
469 | */ | 479 | */ |
470 | function _arr_rank(&$arr) | 480 | function _arr_rank($arr) |
471 | { | 481 | { |
472 | 482 | ||
473 | // sorts alphabetically first as a standard way of breaking rank ties | 483 | // sorts alphabetically first as a standard way of breaking rank ties |
@@ -494,14 +504,17 @@ class Text_LanguageDetect | |||
494 | 504 | ||
495 | /** | 505 | /** |
496 | * Sorts an array by value breaking ties alphabetically | 506 | * Sorts an array by value breaking ties alphabetically |
497 | * | 507 | * |
498 | * @access private | 508 | * @param array &$arr the array to sort |
499 | * @param array &$arr the array to sort | 509 | * |
510 | * @return void | ||
511 | * @access private | ||
500 | */ | 512 | */ |
501 | function _bub_sort(&$arr) | 513 | function _bub_sort(&$arr) |
502 | { | 514 | { |
503 | // should do the same as this perl statement: | 515 | // should do the same as this perl statement: |
504 | // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | 516 | // sort { $trigrams{$b} == $trigrams{$a} |
517 | // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | ||
505 | 518 | ||
506 | // needs to sort by both key and value at once | 519 | // needs to sort by both key and value at once |
507 | // using the key to break ties for the value | 520 | // using the key to break ties for the value |
@@ -528,13 +541,14 @@ class Text_LanguageDetect | |||
528 | /** | 541 | /** |
529 | * Sort function used by bubble sort | 542 | * Sort function used by bubble sort |
530 | * | 543 | * |
531 | * Callback function for usort(). | 544 | * Callback function for usort(). |
532 | * | 545 | * |
533 | * @access private | 546 | * @param array $a first param passed by usort() |
534 | * @param array first param passed by usort() | 547 | * @param array $b second param passed by usort() |
535 | * @param array second param passed by usort() | 548 | * |
536 | * @return int 1 if $a is greater, -1 if not | 549 | * @return int 1 if $a is greater, -1 if not |
537 | * @see _bub_sort() | 550 | * @see _bub_sort() |
551 | * @access private | ||
538 | */ | 552 | */ |
539 | function _sort_func($a, $b) | 553 | function _sort_func($a, $b) |
540 | { | 554 | { |
@@ -542,12 +556,12 @@ class Text_LanguageDetect | |||
542 | list($a_key, $a_value) = $a; | 556 | list($a_key, $a_value) = $a; |
543 | list($b_key, $b_value) = $b; | 557 | list($b_key, $b_value) = $b; |
544 | 558 | ||
545 | // if the values are the same, break ties using the key | ||
546 | if ($a_value == $b_value) { | 559 | if ($a_value == $b_value) { |
560 | // if the values are the same, break ties using the key | ||
547 | return strcmp($a_key, $b_key); | 561 | return strcmp($a_key, $b_key); |
548 | 562 | ||
549 | // if not, just sort normally | ||
550 | } else { | 563 | } else { |
564 | // if not, just sort normally | ||
551 | if ($a_value > $b_value) { | 565 | if ($a_value > $b_value) { |
552 | return -1; | 566 | return -1; |
553 | } else { | 567 | } else { |
@@ -559,23 +573,24 @@ class Text_LanguageDetect | |||
559 | } | 573 | } |
560 | 574 | ||
561 | /** | 575 | /** |
562 | * Calculates a linear rank-order distance statistic between two sets of | 576 | * Calculates a linear rank-order distance statistic between two sets of |
563 | * ranked trigrams | 577 | * ranked trigrams |
564 | * | 578 | * |
565 | * Sums the differences in rank for each trigram. If the trigram does not | 579 | * Sums the differences in rank for each trigram. If the trigram does not |
566 | * appear in both, consider it a difference of $this->_threshold. | 580 | * appear in both, consider it a difference of $this->_threshold. |
567 | * | 581 | * |
568 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | 582 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite |
569 | * its simplicity it has been shown to be highly accurate for language | 583 | * its simplicity it has been shown to be highly accurate for language |
570 | * identification tasks. | 584 | * identification tasks. |
571 | * | 585 | * |
572 | * @access private | 586 | * @param array $arr1 the reference set of trigram ranks |
573 | * @param array $arr1 the reference set of trigram ranks | 587 | * @param array $arr2 the target set of trigram ranks |
574 | * @param array $arr2 the target set of trigram ranks | 588 | * |
575 | * @return int the sum of the differences between the ranks of | 589 | * @return int the sum of the differences between the ranks of |
576 | * the two trigram sets | 590 | * the two trigram sets |
591 | * @access private | ||
577 | */ | 592 | */ |
578 | function _distance(&$arr1, &$arr2) | 593 | function _distance($arr1, $arr2) |
579 | { | 594 | { |
580 | $sumdist = 0; | 595 | $sumdist = 0; |
581 | 596 | ||
@@ -598,14 +613,15 @@ class Text_LanguageDetect | |||
598 | 613 | ||
599 | /** | 614 | /** |
600 | * Normalizes the score returned by _distance() | 615 | * Normalizes the score returned by _distance() |
601 | * | 616 | * |
602 | * Different if perl compatible or not | 617 | * Different if perl compatible or not |
603 | * | 618 | * |
604 | * @access private | 619 | * @param int $score the score from _distance() |
605 | * @param int $score the score from _distance() | 620 | * @param int $base_count the number of trigrams being considered |
606 | * @param int $base_count the number of trigrams being considered | 621 | * |
607 | * @return float the normalized score | 622 | * @return float the normalized score |
608 | * @see _distance() | 623 | * @see _distance() |
624 | * @access private | ||
609 | */ | 625 | */ |
610 | function _normalize_score($score, $base_count = null) | 626 | function _normalize_score($score, $base_count = null) |
611 | { | 627 | { |
@@ -630,29 +646,24 @@ class Text_LanguageDetect | |||
630 | * | 646 | * |
631 | * If perl compatible, the score is 300-0, 0 being most similar. | 647 | * If perl compatible, the score is 300-0, 0 being most similar. |
632 | * Otherwise, it's 0-1 with 1 being most similar. | 648 | * Otherwise, it's 0-1 with 1 being most similar. |
633 | * | 649 | * |
634 | * The $sample text should be at least a few sentences in length; | 650 | * The $sample text should be at least a few sentences in length; |
635 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension | 651 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension |
636 | * is present it will try to detect and convert. However, experience has | 652 | * is present it will try to detect and convert. However, experience has |
637 | * shown that mb_detect_encoding() *does not work very well* with at least | 653 | * shown that mb_detect_encoding() *does not work very well* with at least |
638 | * some types of encoding. | 654 | * some types of encoding. |
639 | * | 655 | * |
640 | * @access public | 656 | * @param string $sample a sample of text to compare. |
641 | * @param string $sample a sample of text to compare. | 657 | * @param int $limit if specified, return an array of the most likely |
642 | * @param int $limit if specified, return an array of the most likely | 658 | * $limit languages and their scores. |
643 | * $limit languages and their scores. | 659 | * |
644 | * @return mixed sorted array of language scores, blank array if no | 660 | * @return mixed sorted array of language scores, blank array if no |
645 | * useable text was found, or PEAR_Error if error | 661 | * useable text was found |
646 | * with the object setup | 662 | * @see _distance() |
647 | * @see _distance() | 663 | * @throws Text_LanguageDetect_Exception |
648 | * @throws PEAR_Error | ||
649 | */ | 664 | */ |
650 | function detect($sample, $limit = 0) | 665 | public function detect($sample, $limit = 0) |
651 | { | 666 | { |
652 | if (!$this->_setup_ok($err)) { | ||
653 | return $err; | ||
654 | } | ||
655 | |||
656 | // input check | 667 | // input check |
657 | if (!Text_LanguageDetect_Parser::validateString($sample)) { | 668 | if (!Text_LanguageDetect_Parser::validateString($sample)) { |
658 | return array(); | 669 | return array(); |
@@ -660,36 +671,27 @@ class Text_LanguageDetect | |||
660 | 671 | ||
661 | // check char encoding | 672 | // check char encoding |
662 | // (only if mbstring extension is compiled and PHP > 4.0.6) | 673 | // (only if mbstring extension is compiled and PHP > 4.0.6) |
663 | if (function_exists('mb_detect_encoding') | 674 | if (function_exists('mb_detect_encoding') |
664 | && function_exists('mb_convert_encoding')) { | 675 | && function_exists('mb_convert_encoding') |
665 | 676 | ) { | |
666 | // mb_detect_encoding isn't very reliable, to say the least | 677 | // mb_detect_encoding isn't very reliable, to say the least |
667 | // detection should still work with a sufficient sample of ascii characters | 678 | // detection should still work with a sufficient sample |
679 | // of ascii characters | ||
668 | $encoding = mb_detect_encoding($sample); | 680 | $encoding = mb_detect_encoding($sample); |
669 | 681 | ||
670 | // mb_detect_encoding() will return FALSE if detection fails | 682 | // mb_detect_encoding() will return FALSE if detection fails |
671 | // don't attempt conversion if that's the case | 683 | // don't attempt conversion if that's the case |
672 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { | 684 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' |
673 | 685 | && $encoding !== false | |
674 | if (function_exists('mb_list_encodings')) { | 686 | ) { |
675 | 687 | // verify the encoding exists in mb_list_encodings | |
676 | // verify the encoding exists in mb_list_encodings | 688 | if (in_array($encoding, mb_list_encodings())) { |
677 | if (in_array($encoding, mb_list_encodings())) { | 689 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); |
678 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
679 | } | ||
680 | |||
681 | // if the previous condition failed: | ||
682 | // somehow we detected an encoding that also we don't support | ||
683 | |||
684 | } else { | ||
685 | // php 4 doesnt have mb_list_encodings() | ||
686 | // so attempt with error suppression | ||
687 | $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
688 | } | 690 | } |
689 | } | 691 | } |
690 | } | 692 | } |
691 | 693 | ||
692 | $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); | 694 | $sample_obj = new Text_LanguageDetect_Parser($sample); |
693 | $sample_obj->prepareTrigram(); | 695 | $sample_obj->prepareTrigram(); |
694 | if ($this->_use_unicode_narrowing) { | 696 | if ($this->_use_unicode_narrowing) { |
695 | $sample_obj->prepareUnicode(); | 697 | $sample_obj->prepareUnicode(); |
@@ -713,7 +715,10 @@ class Text_LanguageDetect | |||
713 | if (is_array($blocks)) { | 715 | if (is_array($blocks)) { |
714 | $present_blocks = array_keys($blocks); | 716 | $present_blocks = array_keys($blocks); |
715 | } else { | 717 | } else { |
716 | throw new Exception('Error during block detection'); | 718 | throw new Text_LanguageDetect_Exception( |
719 | 'Error during block detection', | ||
720 | Text_LanguageDetect_Exception::BLOCK_DETECTION | ||
721 | ); | ||
717 | } | 722 | } |
718 | 723 | ||
719 | $possible_langs = array(); | 724 | $possible_langs = array(); |
@@ -731,30 +736,30 @@ class Text_LanguageDetect | |||
731 | } | 736 | } |
732 | 737 | ||
733 | // could also try an intersect operation rather than a union | 738 | // could also try an intersect operation rather than a union |
734 | // in other words, choose languages whose trigrams contain | 739 | // in other words, choose languages whose trigrams contain |
735 | // ALL of the unicode blocks found in this sample | 740 | // ALL of the unicode blocks found in this sample |
736 | // would improve speed but would be completely thrown off by an | 741 | // would improve speed but would be completely thrown off by an |
737 | // unexpected character, like an umlaut appearing in english text | 742 | // unexpected character, like an umlaut appearing in english text |
738 | 743 | ||
739 | $possible_langs = array_intersect( | 744 | $possible_langs = array_intersect( |
740 | array_keys($this->_lang_db), | 745 | array_keys($this->_lang_db), |
741 | array_unique($possible_langs) | 746 | array_unique($possible_langs) |
742 | ); | 747 | ); |
743 | 748 | ||
744 | // needs to intersect it with the keys of _lang_db in case | 749 | // needs to intersect it with the keys of _lang_db in case |
745 | // languages have been omitted | 750 | // languages have been omitted |
746 | 751 | ||
747 | // or just try 'em all | ||
748 | } else { | 752 | } else { |
753 | // or just try 'em all | ||
749 | $possible_langs = array_keys($this->_lang_db); | 754 | $possible_langs = array_keys($this->_lang_db); |
750 | } | 755 | } |
751 | 756 | ||
752 | 757 | ||
753 | foreach ($possible_langs as $lang) { | 758 | foreach ($possible_langs as $lang) { |
754 | $scores[$lang] = | 759 | $scores[$lang] = $this->_normalize_score( |
755 | $this->_normalize_score( | 760 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), |
756 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), | 761 | $trigram_count |
757 | $trigram_count); | 762 | ); |
758 | } | 763 | } |
759 | 764 | ||
760 | unset($sample_obj); | 765 | unset($sample_obj); |
@@ -772,7 +777,6 @@ class Text_LanguageDetect | |||
772 | $limited_scores = array(); | 777 | $limited_scores = array(); |
773 | 778 | ||
774 | $i = 0; | 779 | $i = 0; |
775 | |||
776 | foreach ($scores as $key => $value) { | 780 | foreach ($scores as $key => $value) { |
777 | if ($i++ >= $limit) { | 781 | if ($i++ >= $limit) { |
778 | break; | 782 | break; |
@@ -781,9 +785,9 @@ class Text_LanguageDetect | |||
781 | $limited_scores[$key] = $value; | 785 | $limited_scores[$key] = $value; |
782 | } | 786 | } |
783 | 787 | ||
784 | return $limited_scores; | 788 | return $this->_convertToNameMode($limited_scores, true); |
785 | } else { | 789 | } else { |
786 | return $scores; | 790 | return $this->_convertToNameMode($scores, true); |
787 | } | 791 | } |
788 | } | 792 | } |
789 | 793 | ||
@@ -791,35 +795,33 @@ class Text_LanguageDetect | |||
791 | * Returns only the most similar language to the text sample | 795 | * Returns only the most similar language to the text sample |
792 | * | 796 | * |
793 | * Calls $this->detect() and returns only the top result | 797 | * Calls $this->detect() and returns only the top result |
794 | * | 798 | * |
795 | * @access public | 799 | * @param string $sample text to detect the language of |
796 | * @param string $sample text to detect the language of | 800 | * |
797 | * @return string the name of the most likely language | 801 | * @return string the name of the most likely language |
798 | * or null if no language is similar | 802 | * or null if no language is similar |
799 | * @see detect() | 803 | * @see detect() |
800 | * @throws PEAR_Error | 804 | * @throws Text_LanguageDetect_Exception |
801 | */ | 805 | */ |
802 | function detectSimple($sample) | 806 | public function detectSimple($sample) |
803 | { | 807 | { |
804 | $scores = $this->detect($sample, 1); | 808 | $scores = $this->detect($sample, 1); |
805 | 809 | ||
806 | // if top language has the maximum possible score, | 810 | // if top language has the maximum possible score, |
807 | // then the top score will have been picked at random | 811 | // then the top score will have been picked at random |
808 | if ( !is_array($scores) | 812 | if (!is_array($scores) || empty($scores) |
809 | || empty($scores) | 813 | || current($scores) == $this->_max_score |
810 | || current($scores) == $this->_max_score) { | 814 | ) { |
811 | |||
812 | return null; | 815 | return null; |
813 | |||
814 | } else { | 816 | } else { |
815 | return ucfirst(key($scores)); | 817 | return key($scores); |
816 | } | 818 | } |
817 | } | 819 | } |
818 | 820 | ||
819 | /** | 821 | /** |
820 | * Returns an array containing the most similar language and a confidence | 822 | * Returns an array containing the most similar language and a confidence |
821 | * rating | 823 | * rating |
822 | * | 824 | * |
823 | * Confidence is a simple measure calculated from the similarity score | 825 | * Confidence is a simple measure calculated from the similarity score |
824 | * minus the similarity score from the next most similar language | 826 | * minus the similarity score from the next most similar language |
825 | * divided by the highest possible score. Languages that have closely | 827 | * divided by the highest possible score. Languages that have closely |
@@ -827,46 +829,43 @@ class Text_LanguageDetect | |||
827 | * confidence scores. | 829 | * confidence scores. |
828 | * | 830 | * |
829 | * The similarity score answers the question "How likely is the text the | 831 | * The similarity score answers the question "How likely is the text the |
830 | * returned language regardless of the other languages considered?" The | 832 | * returned language regardless of the other languages considered?" The |
831 | * confidence score is one way of answering the question "how likely is the | 833 | * confidence score is one way of answering the question "how likely is the |
832 | * text the detected language relative to the rest of the language model | 834 | * text the detected language relative to the rest of the language model |
833 | * set?" | 835 | * set?" |
834 | * | 836 | * |
835 | * To see how similar languages are a priori, see languageSimilarity() | 837 | * To see how similar languages are a priori, see languageSimilarity() |
836 | * | 838 | * |
837 | * @access public | 839 | * @param string $sample text for which language will be detected |
838 | * @param string $sample text for which language will be detected | 840 | * |
839 | * @return array most similar language, score and confidence rating | 841 | * @return array most similar language, score and confidence rating |
840 | * or null if no language is similar | 842 | * or null if no language is similar |
841 | * @see detect() | 843 | * @see detect() |
842 | * @throws PEAR_Error | 844 | * @throws Text_LanguageDetect_Exception |
843 | */ | 845 | */ |
844 | function detectConfidence($sample) | 846 | public function detectConfidence($sample) |
845 | { | 847 | { |
846 | $scores = $this->detect($sample, 2); | 848 | $scores = $this->detect($sample, 2); |
847 | 849 | ||
848 | // if most similar language has the max score, it | 850 | // if most similar language has the max score, it |
849 | // will have been picked at random | 851 | // will have been picked at random |
850 | if ( !is_array($scores) | 852 | if (!is_array($scores) || empty($scores) |
851 | || empty($scores) | 853 | || current($scores) == $this->_max_score |
852 | || current($scores) == $this->_max_score) { | 854 | ) { |
853 | |||
854 | return null; | 855 | return null; |
855 | } | 856 | } |
856 | 857 | ||
857 | $arr['language'] = ucfirst(key($scores)); | 858 | $arr['language'] = key($scores); |
858 | $arr['similarity'] = current($scores); | 859 | $arr['similarity'] = current($scores); |
859 | if (next($scores) !== false) { // if false then no next element | 860 | if (next($scores) !== false) { // if false then no next element |
860 | // the goal is to return a higher value if the distance between | 861 | // the goal is to return a higher value if the distance between |
861 | // the similarity of the first score and the second score is high | 862 | // the similarity of the first score and the second score is high |
862 | 863 | ||
863 | if ($this->_perl_compatible) { | 864 | if ($this->_perl_compatible) { |
864 | 865 | $arr['confidence'] = (current($scores) - $arr['similarity']) | |
865 | $arr['confidence'] = | 866 | / $this->_max_score; |
866 | (current($scores) - $arr['similarity']) / $this->_max_score; | ||
867 | 867 | ||
868 | } else { | 868 | } else { |
869 | |||
870 | $arr['confidence'] = $arr['similarity'] - current($scores); | 869 | $arr['confidence'] = $arr['similarity'] - current($scores); |
871 | 870 | ||
872 | } | 871 | } |
@@ -882,32 +881,26 @@ class Text_LanguageDetect | |||
882 | * Returns the distribution of unicode blocks in a given utf8 string | 881 | * Returns the distribution of unicode blocks in a given utf8 string |
883 | * | 882 | * |
884 | * For the block name of a single char, use unicodeBlockName() | 883 | * For the block name of a single char, use unicodeBlockName() |
885 | * | 884 | * |
886 | * @access public | 885 | * @param string $str input string. Must be ascii or utf8 |
887 | * @param string $str input string. Must be ascii or utf8 | 886 | * @param bool $skip_symbols if true, skip ascii digits, symbols and |
888 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | 887 | * non-printing characters. Includes spaces, |
889 | * non-printing characters. Includes spaces, | 888 | * newlines and common punctutation characters. |
890 | * newlines and common punctutation characters. | 889 | * |
891 | * @return array | 890 | * @return array |
892 | * @throws PEAR_Error | 891 | * @throws Text_LanguageDetect_Exception |
893 | */ | 892 | */ |
894 | function detectUnicodeBlocks($str, $skip_symbols) | 893 | public function detectUnicodeBlocks($str, $skip_symbols) |
895 | { | 894 | { |
896 | // input check | 895 | $skip_symbols = (bool)$skip_symbols; |
897 | if (!is_bool($skip_symbols)) { | 896 | $str = (string)$str; |
898 | throw new Exception('Second parameter must be boolean'); | ||
899 | } | ||
900 | |||
901 | if (!is_string($str)) { | ||
902 | throw new Exception('First parameter was not a string'); | ||
903 | } | ||
904 | 897 | ||
905 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 898 | $sample_obj = new Text_LanguageDetect_Parser($str); |
906 | $sample_obj->prepareUnicode(); | 899 | $sample_obj->prepareUnicode(); |
907 | $sample_obj->prepareTrigram(false); | 900 | $sample_obj->prepareTrigram(false); |
908 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); | 901 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); |
909 | $sample_obj->analyze(); | 902 | $sample_obj->analyze(); |
910 | $blocks =& $sample_obj->getUnicodeBlocks(); | 903 | $blocks = $sample_obj->getUnicodeBlocks(); |
911 | unset($sample_obj); | 904 | unset($sample_obj); |
912 | return $blocks; | 905 | return $blocks; |
913 | } | 906 | } |
@@ -915,38 +908,37 @@ class Text_LanguageDetect | |||
915 | /** | 908 | /** |
916 | * Returns the block name for a given unicode value | 909 | * Returns the block name for a given unicode value |
917 | * | 910 | * |
918 | * If passed a string, will assume it is being passed a UTF8-formatted | 911 | * If passed a string, will assume it is being passed a UTF8-formatted |
919 | * character and will automatically convert. Otherwise it will assume it | 912 | * character and will automatically convert. Otherwise it will assume it |
920 | * is being passed a numeric unicode value. | 913 | * is being passed a numeric unicode value. |
921 | * | 914 | * |
922 | * Make sure input is of the correct type! | 915 | * Make sure input is of the correct type! |
923 | * | 916 | * |
924 | * @access public | ||
925 | * @param mixed $unicode unicode value or utf8 char | 917 | * @param mixed $unicode unicode value or utf8 char |
918 | * | ||
926 | * @return mixed the block name string or false if not found | 919 | * @return mixed the block name string or false if not found |
927 | * @throws PEAR_Error | 920 | * @throws Text_LanguageDetect_Exception |
928 | */ | 921 | */ |
929 | function unicodeBlockName($unicode) { | 922 | public function unicodeBlockName($unicode) |
923 | { | ||
930 | if (is_string($unicode)) { | 924 | if (is_string($unicode)) { |
931 | // assume it is being passed a utf8 char, so convert it | 925 | // assume it is being passed a utf8 char, so convert it |
932 | 926 | if (self::utf8strlen($unicode) > 1) { | |
933 | // input check | 927 | throw new Text_LanguageDetect_Exception( |
934 | if ($this->utf8strlen($unicode) > 1) { | 928 | 'Pass a single char only to this method', |
935 | throw new Exception('Pass this function only a single char'); | 929 | Text_LanguageDetect_Exception::PARAM_TYPE |
930 | ); | ||
936 | } | 931 | } |
937 | |||
938 | $unicode = $this->_utf8char2unicode($unicode); | 932 | $unicode = $this->_utf8char2unicode($unicode); |
939 | 933 | ||
940 | if ($unicode == -1) { | ||
941 | throw new Exception('Malformatted char'); | ||
942 | } | ||
943 | |||
944 | // input check | ||
945 | } elseif (!is_int($unicode)) { | 934 | } elseif (!is_int($unicode)) { |
946 | throw new Exception('Input must be of type string or int.'); | 935 | throw new Text_LanguageDetect_Exception( |
936 | 'Input must be of type string or int.', | ||
937 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
938 | ); | ||
947 | } | 939 | } |
948 | 940 | ||
949 | $blocks =& $this->_read_unicode_block_db(); | 941 | $blocks = $this->_read_unicode_block_db(); |
950 | 942 | ||
951 | $result = $this->_unicode_block_name($unicode, $blocks); | 943 | $result = $this->_unicode_block_name($unicode, $blocks); |
952 | 944 | ||
@@ -964,14 +956,17 @@ class Text_LanguageDetect | |||
964 | * the public interface for this function, which does input checks which | 956 | * the public interface for this function, which does input checks which |
965 | * this function omits for speed. | 957 | * this function omits for speed. |
966 | * | 958 | * |
967 | * @access protected | 959 | * @param int $unicode the unicode value |
968 | * @param int $unicode the unicode value | 960 | * @param array $blocks the block database |
969 | * @param array &$blocks the block database | 961 | * @param int $block_count the number of defined blocks in the database |
970 | * @param int $block_count the number of defined blocks in the database | 962 | * |
971 | * @see unicodeBlockName() | 963 | * @return mixed Block name, -1 if it failed |
964 | * @see unicodeBlockName() | ||
965 | * @access protected | ||
972 | */ | 966 | */ |
973 | function _unicode_block_name($unicode, &$blocks, $block_count = -1) { | 967 | function _unicode_block_name($unicode, $blocks, $block_count = -1) |
974 | // for a reference, see | 968 | { |
969 | // for a reference, see | ||
975 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt | 970 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt |
976 | 971 | ||
977 | // assume that ascii characters are the most common | 972 | // assume that ascii characters are the most common |
@@ -994,35 +989,36 @@ class Text_LanguageDetect | |||
994 | while ($low <= $high) { | 989 | while ($low <= $high) { |
995 | $mid = floor(($low + $high) / 2); | 990 | $mid = floor(($low + $high) / 2); |
996 | 991 | ||
997 | // if it's lower than the lower bound | ||
998 | if ($unicode < $blocks[$mid][0]) { | 992 | if ($unicode < $blocks[$mid][0]) { |
993 | // if it's lower than the lower bound | ||
999 | $high = $mid - 1; | 994 | $high = $mid - 1; |
1000 | 995 | ||
1001 | // if it's higher than the upper bound | ||
1002 | } elseif ($unicode > $blocks[$mid][1]) { | 996 | } elseif ($unicode > $blocks[$mid][1]) { |
997 | // if it's higher than the upper bound | ||
1003 | $low = $mid + 1; | 998 | $low = $mid + 1; |
1004 | 999 | ||
1005 | // found it | ||
1006 | } else { | 1000 | } else { |
1001 | // found it | ||
1007 | return $blocks[$mid]; | 1002 | return $blocks[$mid]; |
1008 | } | 1003 | } |
1009 | } | 1004 | } |
1010 | 1005 | ||
1011 | // failed to find the block | 1006 | // failed to find the block |
1012 | return -1; | 1007 | return -1; |
1013 | 1008 | ||
1014 | // todo: differentiate when it's out of range or when it falls | 1009 | // todo: differentiate when it's out of range or when it falls |
1015 | // into an unassigned range? | 1010 | // into an unassigned range? |
1016 | } | 1011 | } |
1017 | 1012 | ||
1018 | /** | 1013 | /** |
1019 | * Brings up the unicode block database | 1014 | * Brings up the unicode block database |
1020 | * | 1015 | * |
1021 | * @access protected | ||
1022 | * @return array the database of unicode block definitions | 1016 | * @return array the database of unicode block definitions |
1023 | * @throws PEAR_Error | 1017 | * @throws Text_LanguageDetect_Exception |
1018 | * @access protected | ||
1024 | */ | 1019 | */ |
1025 | function &_read_unicode_block_db() { | 1020 | function _read_unicode_block_db() |
1021 | { | ||
1026 | // since the unicode definitions are always going to be the same, | 1022 | // since the unicode definitions are always going to be the same, |
1027 | // might as well share the memory for the db with all other instances | 1023 | // might as well share the memory for the db with all other instances |
1028 | // of this class | 1024 | // of this class |
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect | |||
1037 | 1033 | ||
1038 | /** | 1034 | /** |
1039 | * Calculate the similarities between the language models | 1035 | * Calculate the similarities between the language models |
1040 | * | 1036 | * |
1041 | * Use this function to see how similar languages are to each other. | 1037 | * Use this function to see how similar languages are to each other. |
1042 | * | 1038 | * |
1043 | * If passed 2 language names, will return just those languages compared. | 1039 | * If passed 2 language names, will return just those languages compared. |
1044 | * If passed 1 language name, will return that language compared to | 1040 | * If passed 1 language name, will return that language compared to |
1045 | * all others. | 1041 | * all others. |
1046 | * If passed none, will return an array of every language model compared | 1042 | * If passed none, will return an array of every language model compared |
1047 | * to every other one. | 1043 | * to every other one. |
1048 | * | 1044 | * |
1049 | * @access public | 1045 | * @param string $lang1 the name of the first language to be compared |
1050 | * @param string $lang1 the name of the first language to be compared | 1046 | * @param string $lang2 the name of the second language to be compared |
1051 | * @param string $lang2 the name of the second language to be compared | 1047 | * |
1052 | * @return array scores of every language compared | 1048 | * @return array scores of every language compared |
1053 | * or the score of just the provided languages | 1049 | * or the score of just the provided languages |
1054 | * or null if one of the supplied languages does not exist | 1050 | * or null if one of the supplied languages does not exist |
1055 | * @throws PEAR_Error | 1051 | * @throws Text_LanguageDetect_Exception |
1056 | */ | 1052 | */ |
1057 | function languageSimilarity($lang1 = null, $lang2 = null) | 1053 | public function languageSimilarity($lang1 = null, $lang2 = null) |
1058 | { | 1054 | { |
1059 | if (!$this->_setup_ok($err)) { | 1055 | $lang1 = $this->_convertFromNameMode($lang1); |
1060 | return $err; | 1056 | $lang2 = $this->_convertFromNameMode($lang2); |
1061 | } | ||
1062 | |||
1063 | if ($lang1 != null) { | 1057 | if ($lang1 != null) { |
1064 | $lang1 = strtolower($lang1); | 1058 | $lang1 = strtolower($lang1); |
1065 | 1059 | ||
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect | |||
1069 | } | 1063 | } |
1070 | 1064 | ||
1071 | if ($lang2 != null) { | 1065 | if ($lang2 != null) { |
1072 | 1066 | if (!isset($this->_lang_db[$lang2])) { | |
1073 | // can't only set the second param | 1067 | // check if language model exists |
1074 | if ($lang1 == null) { | ||
1075 | return null; | ||
1076 | // check if language model exists | ||
1077 | } elseif (!isset($this->_lang_db[$lang2])) { | ||
1078 | return null; | 1068 | return null; |
1079 | } | 1069 | } |
1080 | 1070 | ||
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect | |||
1088 | ) | 1078 | ) |
1089 | ); | 1079 | ); |
1090 | 1080 | ||
1091 | |||
1092 | // compare just $lang1 to all languages | ||
1093 | } else { | 1081 | } else { |
1082 | // compare just $lang1 to all languages | ||
1094 | $return_arr = array(); | 1083 | $return_arr = array(); |
1095 | foreach ($this->_lang_db as $key => $value) { | 1084 | foreach ($this->_lang_db as $key => $value) { |
1096 | if ($key != $lang1) { // don't compare a language to itself | 1085 | if ($key != $lang1) { |
1086 | // don't compare a language to itself | ||
1097 | $return_arr[$key] = $this->_normalize_score( | 1087 | $return_arr[$key] = $this->_normalize_score( |
1098 | $this->_distance($this->_lang_db[$lang1], $value)); | 1088 | $this->_distance($this->_lang_db[$lang1], $value) |
1089 | ); | ||
1099 | } | 1090 | } |
1100 | } | 1091 | } |
1101 | asort($return_arr); | 1092 | asort($return_arr); |
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect | |||
1104 | } | 1095 | } |
1105 | 1096 | ||
1106 | 1097 | ||
1107 | // compare all languages to each other | ||
1108 | } else { | 1098 | } else { |
1099 | // compare all languages to each other | ||
1109 | $return_arr = array(); | 1100 | $return_arr = array(); |
1110 | foreach (array_keys($this->_lang_db) as $lang1) { | 1101 | foreach (array_keys($this->_lang_db) as $lang1) { |
1111 | foreach (array_keys($this->_lang_db) as $lang2) { | 1102 | foreach (array_keys($this->_lang_db) as $lang2) { |
1112 | |||
1113 | // skip comparing languages to themselves | 1103 | // skip comparing languages to themselves |
1114 | if ($lang1 != $lang2) { | 1104 | if ($lang1 != $lang2) { |
1115 | |||
1116 | // don't re-calculate what's already been done | ||
1117 | if (isset($return_arr[$lang2][$lang1])) { | ||
1118 | 1105 | ||
1119 | $return_arr[$lang1][$lang2] = | 1106 | if (isset($return_arr[$lang2][$lang1])) { |
1120 | $return_arr[$lang2][$lang1]; | 1107 | // don't re-calculate what's already been done |
1108 | $return_arr[$lang1][$lang2] | ||
1109 | = $return_arr[$lang2][$lang1]; | ||
1121 | 1110 | ||
1122 | // calculate | ||
1123 | } else { | 1111 | } else { |
1124 | 1112 | // calculate | |
1125 | $return_arr[$lang1][$lang2] = | 1113 | $return_arr[$lang1][$lang2] |
1126 | $this->_normalize_score( | 1114 | = $this->_normalize_score( |
1127 | $this->_distance( | 1115 | $this->_distance( |
1128 | $this->_lang_db[$lang1], | 1116 | $this->_lang_db[$lang1], |
1129 | $this->_lang_db[$lang2] | 1117 | $this->_lang_db[$lang2] |
1130 | ) | 1118 | ) |
1131 | ); | 1119 | ); |
1132 | 1120 | ||
1133 | } | 1121 | } |
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect | |||
1150 | * | 1138 | * |
1151 | * @access public | 1139 | * @access public |
1152 | * @return array language cluster data | 1140 | * @return array language cluster data |
1153 | * @throws PEAR_Error | 1141 | * @throws Text_LanguageDetect_Exception |
1154 | * @see languageSimilarity() | 1142 | * @see languageSimilarity() |
1155 | * @deprecated this function will eventually be removed and placed into | 1143 | * @deprecated this function will eventually be removed and placed into |
1156 | * the model generation class | 1144 | * the model generation class |
1157 | */ | 1145 | */ |
1158 | function clusterLanguages() | 1146 | function clusterLanguages() |
1159 | { | 1147 | { |
1160 | // todo: set the maximum number of clusters | 1148 | // todo: set the maximum number of clusters |
1161 | |||
1162 | // setup check | ||
1163 | if (!$this->_setup_ok($err)) { | ||
1164 | return $err; | ||
1165 | } | ||
1166 | |||
1167 | // return cached result, if any | 1149 | // return cached result, if any |
1168 | if (isset($this->_clusters)) { | 1150 | if (isset($this->_clusters)) { |
1169 | return $this->_clusters; | 1151 | return $this->_clusters; |
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect | |||
1177 | 1159 | ||
1178 | foreach ($langs as $lang) { | 1160 | foreach ($langs as $lang) { |
1179 | if (!isset($this->_lang_db[$lang])) { | 1161 | if (!isset($this->_lang_db[$lang])) { |
1180 | throw new Exception("missing $lang!\n"); | 1162 | throw new Text_LanguageDetect_Exception( |
1163 | "missing $lang!", | ||
1164 | Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE | ||
1165 | ); | ||
1181 | } | 1166 | } |
1182 | } | 1167 | } |
1183 | 1168 | ||
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect | |||
1186 | $langs[$lang1] = $lang1; | 1171 | $langs[$lang1] = $lang1; |
1187 | unset($langs[$old_key]); | 1172 | unset($langs[$old_key]); |
1188 | } | 1173 | } |
1189 | 1174 | ||
1175 | $result_data = $really_map = array(); | ||
1176 | |||
1190 | $i = 0; | 1177 | $i = 0; |
1191 | while (count($langs) > 2 && $i++ < 200) { | 1178 | while (count($langs) > 2 && $i++ < 200) { |
1192 | $highest_score = -1; | 1179 | $highest_score = -1; |
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect | |||
1194 | $highest_key2 = ''; | 1181 | $highest_key2 = ''; |
1195 | foreach ($langs as $lang1) { | 1182 | foreach ($langs as $lang1) { |
1196 | foreach ($langs as $lang2) { | 1183 | foreach ($langs as $lang2) { |
1197 | if ( $lang1 != $lang2 | 1184 | if ($lang1 != $lang2 |
1198 | && $arr[$lang1][$lang2] > $highest_score) { | 1185 | && $arr[$lang1][$lang2] > $highest_score |
1186 | ) { | ||
1199 | $highest_score = $arr[$lang1][$lang2]; | 1187 | $highest_score = $arr[$lang1][$lang2]; |
1200 | $highest_key1 = $lang1; | 1188 | $highest_key1 = $lang1; |
1201 | $highest_key2 = $lang2; | 1189 | $highest_key2 = $lang2; |
1202 | } | 1190 | } |
1203 | } | 1191 | } |
1204 | } | 1192 | } |
1205 | 1193 | ||
1206 | if (!$highest_key1) { | 1194 | if (!$highest_key1) { |
1207 | // should not ever happen | 1195 | // should not ever happen |
1208 | throw new Exception("no highest key? (step: $i)"); | 1196 | throw new Text_LanguageDetect_Exception( |
1197 | "no highest key? (step: $i)", | ||
1198 | Text_LanguageDetect_Exception::NO_HIGHEST_KEY | ||
1199 | ); | ||
1209 | } | 1200 | } |
1210 | 1201 | ||
1211 | if ($highest_score == 0) { | 1202 | if ($highest_score == 0) { |
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect | |||
1217 | $sum1 = array_sum($arr[$highest_key1]); | 1208 | $sum1 = array_sum($arr[$highest_key1]); |
1218 | $sum2 = array_sum($arr[$highest_key2]); | 1209 | $sum2 = array_sum($arr[$highest_key2]); |
1219 | 1210 | ||
1220 | // use the score for the one that is most similar to the rest of | 1211 | // use the score for the one that is most similar to the rest of |
1221 | // the field as the score for the group | 1212 | // the field as the score for the group |
1222 | // todo: could try averaging or "centroid" method instead | 1213 | // todo: could try averaging or "centroid" method instead |
1223 | // seems like that might make more sense | 1214 | // seems like that might make more sense |
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect | |||
1248 | $really_lang = $replaceme; | 1239 | $really_lang = $replaceme; |
1249 | while (isset($really_map[$really_lang])) { | 1240 | while (isset($really_map[$really_lang])) { |
1250 | $really_lang = $really_map[$really_lang]; | 1241 | $really_lang = $really_map[$really_lang]; |
1251 | } | 1242 | } |
1252 | $really_map[$newkey] = $really_lang; | 1243 | $really_map[$newkey] = $really_lang; |
1253 | 1244 | ||
1254 | 1245 | ||
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect | |||
1259 | $arr[$key1][$newkey] = $arr[$key1][$key2]; | 1250 | $arr[$key1][$newkey] = $arr[$key1][$key2]; |
1260 | unset($arr[$key1][$key2]); | 1251 | unset($arr[$key1][$key2]); |
1261 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] | 1252 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] |
1262 | } | 1253 | } |
1263 | 1254 | ||
1264 | if ($key1 == $replaceme) { | 1255 | if ($key1 == $replaceme) { |
1265 | $arr[$newkey][$key2] = $arr[$key1][$key2]; | 1256 | $arr[$newkey][$key2] = $arr[$key1][$key2]; |
1266 | unset($arr[$key1][$key2]); | 1257 | unset($arr[$key1][$key2]); |
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect | |||
1273 | } | 1264 | } |
1274 | } | 1265 | } |
1275 | } | 1266 | } |
1276 | 1267 | ||
1277 | 1268 | ||
1278 | unset($langs[$highest_key1]); | 1269 | unset($langs[$highest_key1]); |
1279 | unset($langs[$highest_key2]); | 1270 | unset($langs[$highest_key2]); |
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect | |||
1293 | } | 1284 | } |
1294 | 1285 | ||
1295 | $return_val = array( | 1286 | $return_val = array( |
1296 | 'open_forks' => $langs, | 1287 | 'open_forks' => $langs, |
1297 | // the top level of clusters | 1288 | // the top level of clusters |
1298 | // clusters that are mutually exclusive | 1289 | // clusters that are mutually exclusive |
1299 | // or specified by a specific maximum | 1290 | // or specified by a specific maximum |
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect | |||
1323 | * use, and it may disappear or its functionality may change in future | 1314 | * use, and it may disappear or its functionality may change in future |
1324 | * releases without notice. | 1315 | * releases without notice. |
1325 | * | 1316 | * |
1326 | * This compares the sample text to top the top level of clusters. If the | 1317 | * This compares the sample text to top the top level of clusters. If the |
1327 | * sample is similar to the cluster it will drop down and compare it to the | 1318 | * sample is similar to the cluster it will drop down and compare it to the |
1328 | * languages in the cluster, and so on until it hits a leaf node. | 1319 | * languages in the cluster, and so on until it hits a leaf node. |
1329 | * | 1320 | * |
1330 | * this should find the language in considerably fewer compares | 1321 | * this should find the language in considerably fewer compares |
1331 | * (the equivalent of a binary search), however clusterLanguages() is costly | 1322 | * (the equivalent of a binary search), however clusterLanguages() is costly |
1332 | * and the loss of accuracy from this technique is significant. | 1323 | * and the loss of accuracy from this technique is significant. |
1333 | * | 1324 | * |
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect | |||
1337 | * was very large, however in such cases some method of Bayesian inference | 1328 | * was very large, however in such cases some method of Bayesian inference |
1338 | * might be more helpful. | 1329 | * might be more helpful. |
1339 | * | 1330 | * |
1340 | * @see clusterLanguages() | 1331 | * @param string $str input string |
1341 | * @access public | 1332 | * |
1342 | * @param string $str input string | 1333 | * @return array language scores (only those compared) |
1343 | * @return array language scores (only those compared) | 1334 | * @throws Text_LanguageDetect_Exception |
1344 | * @throws PEAR_Error | 1335 | * @see clusterLanguages() |
1345 | */ | 1336 | */ |
1346 | function clusteredSearch($str) | 1337 | public function clusteredSearch($str) |
1347 | { | 1338 | { |
1348 | |||
1349 | // input check | 1339 | // input check |
1350 | if (!Text_LanguageDetect_Parser::validateString($str)) { | 1340 | if (!Text_LanguageDetect_Parser::validateString($str)) { |
1351 | return array(); | 1341 | return array(); |
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect | |||
1359 | $dendogram_data = $result['fork_data']; | 1349 | $dendogram_data = $result['fork_data']; |
1360 | $dendogram_alias = $result['name_map']; | 1350 | $dendogram_alias = $result['name_map']; |
1361 | 1351 | ||
1362 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 1352 | $sample_obj = new Text_LanguageDetect_Parser($str); |
1363 | $sample_obj->prepareTrigram(); | 1353 | $sample_obj->prepareTrigram(); |
1364 | $sample_obj->setPadStart(!$this->_perl_compatible); | 1354 | $sample_obj->setPadStart(!$this->_perl_compatible); |
1365 | $sample_obj->analyze(); | 1355 | $sample_obj->analyze(); |
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect | |||
1372 | } | 1362 | } |
1373 | 1363 | ||
1374 | $i = 0; // counts the number of steps | 1364 | $i = 0; // counts the number of steps |
1375 | 1365 | ||
1376 | foreach ($dendogram_start as $lang) { | 1366 | foreach ($dendogram_start as $lang) { |
1377 | if (isset($dendogram_alias[$lang])) { | 1367 | if (isset($dendogram_alias[$lang])) { |
1378 | $lang_key = $dendogram_alias[$lang]; | 1368 | $lang_key = $dendogram_alias[$lang]; |
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect | |||
1382 | 1372 | ||
1383 | $scores[$lang] = $this->_normalize_score( | 1373 | $scores[$lang] = $this->_normalize_score( |
1384 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1374 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1385 | $sample_count); | 1375 | $sample_count |
1376 | ); | ||
1386 | 1377 | ||
1387 | $i++; | 1378 | $i++; |
1388 | } | 1379 | } |
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect | |||
1411 | 1402 | ||
1412 | $scores[$lang] = $this->_normalize_score( | 1403 | $scores[$lang] = $this->_normalize_score( |
1413 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1404 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1414 | $sample_count); | 1405 | $sample_count |
1406 | ); | ||
1415 | 1407 | ||
1416 | //todo: does not need to do same comparison again | 1408 | //todo: does not need to do same comparison again |
1417 | } | 1409 | } |
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect | |||
1428 | 1420 | ||
1429 | $diff = $scores[$cur_key] - $scores[$loser_key]; | 1421 | $diff = $scores[$cur_key] - $scores[$loser_key]; |
1430 | 1422 | ||
1431 | // $cur_key ({$dendogram_alias[$cur_key]}) wins | 1423 | // $cur_key ({$dendogram_alias[$cur_key]}) wins |
1432 | // over $loser_key ({$dendogram_alias[$loser_key]}) | 1424 | // over $loser_key ({$dendogram_alias[$loser_key]}) |
1433 | // with a difference of $diff | 1425 | // with a difference of $diff |
1434 | } | 1426 | } |
1435 | 1427 | ||
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect | |||
1439 | // which paths the algorithm decided to take along the tree | 1431 | // which paths the algorithm decided to take along the tree |
1440 | 1432 | ||
1441 | // but sometimes the last item is only the second highest | 1433 | // but sometimes the last item is only the second highest |
1442 | if ( ($this->_perl_compatible && (end($scores) > prev($scores))) | 1434 | if (($this->_perl_compatible && (end($scores) > prev($scores))) |
1443 | || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { | 1435 | || (!$this->_perl_compatible && (end($scores) < prev($scores))) |
1444 | 1436 | ) { | |
1445 | $real_last_score = current($scores); | 1437 | $real_last_score = current($scores); |
1446 | $real_last_key = key($scores); | 1438 | $real_last_key = key($scores); |
1447 | 1439 | ||
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect | |||
1449 | unset($scores[$real_last_key]); | 1441 | unset($scores[$real_last_key]); |
1450 | $scores[$real_last_key] = $real_last_score; | 1442 | $scores[$real_last_key] = $real_last_score; |
1451 | } | 1443 | } |
1452 | 1444 | ||
1453 | 1445 | ||
1454 | if (!$this->_perl_compatible) { | 1446 | if (!$this->_perl_compatible) { |
1455 | $scores = array_reverse($scores, true); | 1447 | $scores = array_reverse($scores, true); |
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect | |||
1464 | * | 1456 | * |
1465 | * Returns the numbers of characters (not bytes) in a utf8 string | 1457 | * Returns the numbers of characters (not bytes) in a utf8 string |
1466 | * | 1458 | * |
1467 | * @static | 1459 | * @param string $str string to get the length of |
1468 | * @access public | 1460 | * |
1469 | * @param string $str string to get the length of | 1461 | * @return int number of chars |
1470 | * @return int number of chars | ||
1471 | */ | 1462 | */ |
1472 | function utf8strlen($str) | 1463 | public static function utf8strlen($str) |
1473 | { | 1464 | { |
1474 | // utf8_decode() will convert unknown chars to '?', which is actually | 1465 | // utf8_decode() will convert unknown chars to '?', which is actually |
1475 | // ideal for counting. | 1466 | // ideal for counting. |
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect | |||
1482 | /** | 1473 | /** |
1483 | * Returns the unicode value of a utf8 char | 1474 | * Returns the unicode value of a utf8 char |
1484 | * | 1475 | * |
1485 | * @access protected | 1476 | * @param string $char a utf8 (possibly multi-byte) char |
1486 | * @param string $char a utf8 (possibly multi-byte) char | 1477 | * |
1487 | * @return int unicode value or -1 if malformatted | 1478 | * @return int unicode value |
1479 | * @access protected | ||
1480 | * @link http://en.wikipedia.org/wiki/UTF-8 | ||
1488 | */ | 1481 | */ |
1489 | function _utf8char2unicode($char) { | 1482 | function _utf8char2unicode($char) |
1490 | 1483 | { | |
1491 | // strlen() here will actually get the binary length of a single char | 1484 | // strlen() here will actually get the binary length of a single char |
1492 | switch (strlen($char)) { | 1485 | switch (strlen($char)) { |
1493 | 1486 | case 1: | |
1494 | // for a reference, see http://en.wikipedia.org/wiki/UTF-8 | 1487 | // normal ASCII-7 byte |
1495 | 1488 | // 0xxxxxxx --> 0xxxxxxx | |
1496 | case 1: | 1489 | return ord($char{0}); |
1497 | // normal ASCII-7 byte | 1490 | |
1498 | // 0xxxxxxx --> 0xxxxxxx | 1491 | case 2: |
1499 | return ord($char{0}); | 1492 | // 2 byte unicode |
1500 | 1493 | // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx | |
1501 | case 2: | 1494 | $z = (ord($char{0}) & 0x000001F) << 6; |
1502 | // 2 byte unicode | 1495 | $x = (ord($char{1}) & 0x0000003F); |
1503 | // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx | 1496 | return ($z | $x); |
1504 | $z = (ord($char{0}) & 0x000001F) << 6; | 1497 | |
1505 | $x = (ord($char{1}) & 0x0000003F); | 1498 | case 3: |
1506 | 1499 | // 3 byte unicode | |
1507 | return ($z | $x); | 1500 | // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx |
1508 | 1501 | $z = (ord($char{0}) & 0x0000000F) << 12; | |
1509 | case 3: | 1502 | $x1 = (ord($char{1}) & 0x0000003F) << 6; |
1510 | // 3 byte unicode | 1503 | $x2 = (ord($char{2}) & 0x0000003F); |
1511 | // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx | 1504 | return ($z | $x1 | $x2); |
1512 | $z = (ord($char{0}) & 0x0000000F) << 12; | 1505 | |
1513 | $x1 = (ord($char{1}) & 0x0000003F) << 6; | 1506 | case 4: |
1514 | $x2 = (ord($char{2}) & 0x0000003F); | 1507 | // 4 byte unicode |
1515 | 1508 | // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> | |
1516 | return ($z | $x1 | $x2); | 1509 | // 000zzzzz xxxxxxxx xxxxxxxx |
1517 | 1510 | $z1 = (ord($char{0}) & 0x00000007) << 18; | |
1518 | case 4: | 1511 | $z2 = (ord($char{1}) & 0x0000003F) << 12; |
1519 | // 4 byte unicode | 1512 | $x1 = (ord($char{2}) & 0x0000003F) << 6; |
1520 | // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> | 1513 | $x2 = (ord($char{3}) & 0x0000003F); |
1521 | // 000zzzzz xxxxxxxx xxxxxxxx | 1514 | return ($z1 | $z2 | $x1 | $x2); |
1522 | $z1 = (ord($char{0}) & 0x00000007) << 18; | ||
1523 | $z2 = (ord($char{1}) & 0x0000003F) << 12; | ||
1524 | $x1 = (ord($char{2}) & 0x0000003F) << 6; | ||
1525 | $x2 = (ord($char{3}) & 0x0000003F); | ||
1526 | |||
1527 | return ($z1 | $z2 | $x1 | $x2); | ||
1528 | |||
1529 | default: | ||
1530 | // error: malformatted char? | ||
1531 | return -1; | ||
1532 | } | 1515 | } |
1533 | } | 1516 | } |
1534 | 1517 | ||
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect | |||
1536 | * utf8-safe fast character iterator | 1519 | * utf8-safe fast character iterator |
1537 | * | 1520 | * |
1538 | * Will get the next character starting from $counter, which will then be | 1521 | * Will get the next character starting from $counter, which will then be |
1539 | * incremented. If a multi-byte char the bytes will be concatenated and | 1522 | * incremented. If a multi-byte char the bytes will be concatenated and |
1540 | * $counter will be incremeted by the number of bytes in the char. | 1523 | * $counter will be incremeted by the number of bytes in the char. |
1541 | * | 1524 | * |
1542 | * @access private | 1525 | * @param string $str the string being iterated over |
1543 | * @param string &$str the string being iterated over | 1526 | * @param int &$counter the iterator, will increment by reference |
1544 | * @param int &$counter the iterator, will increment by reference | 1527 | * @param bool $special_convert whether to do special conversions |
1545 | * @param bool $special_convert whether to do special conversions | 1528 | * |
1546 | * @return char the next (possibly multi-byte) char from $counter | 1529 | * @return char the next (possibly multi-byte) char from $counter |
1530 | * @access private | ||
1547 | */ | 1531 | */ |
1548 | function _next_char(&$str, &$counter, $special_convert = false) | 1532 | static function _next_char($str, &$counter, $special_convert = false) |
1549 | { | 1533 | { |
1550 | |||
1551 | $char = $str{$counter++}; | 1534 | $char = $str{$counter++}; |
1552 | $ord = ord($char); | 1535 | $ord = ord($char); |
1553 | 1536 | ||
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect | |||
1556 | 1539 | ||
1557 | // normal ascii one byte char | 1540 | // normal ascii one byte char |
1558 | if ($ord <= 127) { | 1541 | if ($ord <= 127) { |
1559 | |||
1560 | // special conversions needed for this package | 1542 | // special conversions needed for this package |
1561 | // (that only apply to regular ascii characters) | 1543 | // (that only apply to regular ascii characters) |
1562 | // lower case, and convert all non-alphanumeric characters | 1544 | // lower case, and convert all non-alphanumeric characters |
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect | |||
1571 | 1553 | ||
1572 | return $char; | 1554 | return $char; |
1573 | 1555 | ||
1574 | // multi-byte chars | ||
1575 | } elseif ($ord >> 5 == 6) { // two-byte char | 1556 | } elseif ($ord >> 5 == 6) { // two-byte char |
1557 | // multi-byte chars | ||
1576 | $nextchar = $str{$counter++}; // get next byte | 1558 | $nextchar = $str{$counter++}; // get next byte |
1577 | 1559 | ||
1578 | // lower-casing of non-ascii characters is still incomplete | 1560 | // lower-casing of non-ascii characters is still incomplete |
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect | |||
1582 | if ($ord == 195) { | 1564 | if ($ord == 195) { |
1583 | $nextord = ord($nextchar); | 1565 | $nextord = ord($nextchar); |
1584 | $nextord_adj = $nextord + 64; | 1566 | $nextord_adj = $nextord + 64; |
1585 | // for a reference, see | 1567 | // for a reference, see |
1586 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html | 1568 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html |
1587 | 1569 | ||
1588 | // À - Þ but not × | 1570 | // À - Þ but not × |
1589 | if ( $nextord_adj >= 192 | 1571 | if ($nextord_adj >= 192 |
1590 | && $nextord_adj <= 222 | 1572 | && $nextord_adj <= 222 |
1591 | && $nextord_adj != 215) { | 1573 | && $nextord_adj != 215 |
1592 | 1574 | ) { | |
1593 | $nextchar = chr($nextord + 32); | 1575 | $nextchar = chr($nextord + 32); |
1594 | } | 1576 | } |
1595 | 1577 | ||
1596 | // lower case cyrillic alphabet | ||
1597 | } elseif ($ord == 208) { | 1578 | } elseif ($ord == 208) { |
1579 | // lower case cyrillic alphabet | ||
1598 | $nextord = ord($nextchar); | 1580 | $nextord = ord($nextchar); |
1599 | // if A - Pe | 1581 | // if A - Pe |
1600 | if ($nextord >= 144 && $nextord <= 159) { | 1582 | if ($nextord >= 144 && $nextord <= 159) { |
1601 | // lower case | 1583 | // lower case |
1602 | $nextchar = chr($nextord + 32); | 1584 | $nextchar = chr($nextord + 32); |
1603 | 1585 | ||
1604 | // if Er - Ya | ||
1605 | } elseif ($nextord >= 160 && $nextord <= 175) { | 1586 | } elseif ($nextord >= 160 && $nextord <= 175) { |
1587 | // if Er - Ya | ||
1606 | // lower case | 1588 | // lower case |
1607 | $char = chr(209); // == $ord++ | 1589 | $char = chr(209); // == $ord++ |
1608 | $nextchar = chr($nextord - 32); | 1590 | $nextchar = chr($nextord - 32); |
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect | |||
1611 | } | 1593 | } |
1612 | 1594 | ||
1613 | // tag on next byte | 1595 | // tag on next byte |
1614 | return $char . $nextchar; | 1596 | return $char . $nextchar; |
1615 | |||
1616 | } elseif ($ord >> 4 == 14) { // three-byte char | 1597 | } elseif ($ord >> 4 == 14) { // three-byte char |
1617 | 1598 | ||
1618 | // tag on next 2 bytes | 1599 | // tag on next 2 bytes |
1619 | return $char . $str{$counter++} . $str{$counter++}; | 1600 | return $char . $str{$counter++} . $str{$counter++}; |
1620 | 1601 | ||
1621 | } elseif ($ord >> 3 == 30) { // four-byte char | 1602 | } elseif ($ord >> 3 == 30) { // four-byte char |
1622 | 1603 | ||
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect | |||
1628 | } | 1609 | } |
1629 | } | 1610 | } |
1630 | 1611 | ||
1631 | } | 1612 | /** |
1613 | * Converts an $language input parameter from the configured mode | ||
1614 | * to the language name that is used internally. | ||
1615 | * | ||
1616 | * Works for strings and arrays. | ||
1617 | * | ||
1618 | * @param string|array $lang A language description ("english"/"en"/"eng") | ||
1619 | * @param boolean $convertKey If $lang is an array, setting $key | ||
1620 | * converts the keys to the language name. | ||
1621 | * | ||
1622 | * @return string|array Language name | ||
1623 | */ | ||
1624 | function _convertFromNameMode($lang, $convertKey = false) | ||
1625 | { | ||
1626 | if ($this->_name_mode == 0) { | ||
1627 | return $lang; | ||
1628 | } | ||
1629 | |||
1630 | if ($this->_name_mode == 2) { | ||
1631 | $method = 'code2ToName'; | ||
1632 | } else { | ||
1633 | $method = 'code3ToName'; | ||
1634 | } | ||
1635 | |||
1636 | if (is_string($lang)) { | ||
1637 | return (string)Text_LanguageDetect_ISO639::$method($lang); | ||
1638 | } | ||
1639 | |||
1640 | $newlang = array(); | ||
1641 | foreach ($lang as $key => $val) { | ||
1642 | if ($convertKey) { | ||
1643 | $newkey = (string)Text_LanguageDetect_ISO639::$method($key); | ||
1644 | $newlang[$newkey] = $val; | ||
1645 | } else { | ||
1646 | $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val); | ||
1647 | } | ||
1648 | } | ||
1649 | return $newlang; | ||
1650 | } | ||
1632 | 1651 | ||
1633 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | 1652 | /** |
1653 | * Converts an $language output parameter from the language name that is | ||
1654 | * used internally to the configured mode. | ||
1655 | * | ||
1656 | * Works for strings and arrays. | ||
1657 | * | ||
1658 | * @param string|array $lang A language description ("english"/"en"/"eng") | ||
1659 | * @param boolean $convertKey If $lang is an array, setting $key | ||
1660 | * converts the keys to the language name. | ||
1661 | * | ||
1662 | * @return string|array Language name | ||
1663 | */ | ||
1664 | function _convertToNameMode($lang, $convertKey = false) | ||
1665 | { | ||
1666 | if ($this->_name_mode == 0) { | ||
1667 | return $lang; | ||
1668 | } | ||
1669 | |||
1670 | if ($this->_name_mode == 2) { | ||
1671 | $method = 'nameToCode2'; | ||
1672 | } else { | ||
1673 | $method = 'nameToCode3'; | ||
1674 | } | ||
1675 | |||
1676 | if (is_string($lang)) { | ||
1677 | return Text_LanguageDetect_ISO639::$method($lang); | ||
1678 | } | ||
1679 | |||
1680 | $newlang = array(); | ||
1681 | foreach ($lang as $key => $val) { | ||
1682 | if ($convertKey) { | ||
1683 | $newkey = Text_LanguageDetect_ISO639::$method($key); | ||
1684 | $newlang[$newkey] = $val; | ||
1685 | } else { | ||
1686 | $newlang[$key] = Text_LanguageDetect_ISO639::$method($val); | ||
1687 | } | ||
1688 | } | ||
1689 | return $newlang; | ||
1690 | } | ||
1691 | } | ||
1634 | 1692 | ||
1635 | ?> | 1693 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file |
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php index 2e8991cc..d0f09d74 100644 --- a/inc/3rdparty/libraries/readability/Readability.php +++ b/inc/3rdparty/libraries/readability/Readability.php | |||
@@ -1,1138 +1,1138 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Arc90's Readability ported to PHP for FiveFilters.org | 3 | * Arc90's Readability ported to PHP for FiveFilters.org |
4 | * Based on readability.js version 1.7.1 (without multi-page support) | 4 | * Based on readability.js version 1.7.1 (without multi-page support) |
5 | * Updated to allow HTML5 parsing with html5lib | 5 | * Updated to allow HTML5 parsing with html5lib |
6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds | 6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds |
7 | * ------------------------------------------------------ | 7 | * ------------------------------------------------------ |
8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js | 8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js |
9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ | 9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ |
10 | * JS Source: http://code.google.com/p/arc90labs-readability | 10 | * JS Source: http://code.google.com/p/arc90labs-readability |
11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net | 11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net |
12 | * More information: http://fivefilters.org/content-only/ | 12 | * More information: http://fivefilters.org/content-only/ |
13 | * License: Apache License, Version 2.0 | 13 | * License: Apache License, Version 2.0 |
14 | * Requires: PHP5 | 14 | * Requires: PHP5 |
15 | * Date: 2012-09-19 | 15 | * Date: 2012-09-19 |
16 | * | 16 | * |
17 | * Differences between the PHP port and the original | 17 | * Differences between the PHP port and the original |
18 | * ------------------------------------------------------ | 18 | * ------------------------------------------------------ |
19 | * Arc90's Readability is designed to run in the browser. It works on the DOM | 19 | * Arc90's Readability is designed to run in the browser. It works on the DOM |
20 | * tree (the parsed HTML) after the page's CSS styles have been applied and | 20 | * tree (the parsed HTML) after the page's CSS styles have been applied and |
21 | * Javascript code executed. This PHP port does not run inside a browser. | 21 | * Javascript code executed. This PHP port does not run inside a browser. |
22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot | 22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot |
23 | * rely on CSS or Javascript support. As such, the results will not always | 23 | * rely on CSS or Javascript support. As such, the results will not always |
24 | * match Arc90's Readability. (For example, if a web page contains CSS style | 24 | * match Arc90's Readability. (For example, if a web page contains CSS style |
25 | * rules or Javascript code which hide certain HTML elements from display, | 25 | * rules or Javascript code which hide certain HTML elements from display, |
26 | * Arc90's Readability will dismiss those from consideration but our PHP port, | 26 | * Arc90's Readability will dismiss those from consideration but our PHP port, |
27 | * unable to understand CSS or Javascript, will not know any better.) | 27 | * unable to understand CSS or Javascript, will not know any better.) |
28 | * | 28 | * |
29 | * Another significant difference is that the aim of Arc90's Readability is | 29 | * Another significant difference is that the aim of Arc90's Readability is |
30 | * to re-present the main content block of a given web page so users can | 30 | * to re-present the main content block of a given web page so users can |
31 | * read it more easily in their browsers. Correct identification, clean up, | 31 | * read it more easily in their browsers. Correct identification, clean up, |
32 | * and separation of the content block is only a part of this process. | 32 | * and separation of the content block is only a part of this process. |
33 | * This PHP port is only concerned with this part, it does not include code | 33 | * This PHP port is only concerned with this part, it does not include code |
34 | * that relates to presentation in the browser - Arc90 already do | 34 | * that relates to presentation in the browser - Arc90 already do |
35 | * that extremely well, and for PDF output there's FiveFilters.org's | 35 | * that extremely well, and for PDF output there's FiveFilters.org's |
36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. | 36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. |
37 | * | 37 | * |
38 | * Finally, this class contains methods that might be useful for developers | 38 | * Finally, this class contains methods that might be useful for developers |
39 | * working on HTML document fragments. So without deviating too much from | 39 | * working on HTML document fragments. So without deviating too much from |
40 | * the original code (which I don't want to do because it makes debugging | 40 | * the original code (which I don't want to do because it makes debugging |
41 | * and updating more difficult), I've tried to make it a little more | 41 | * and updating more difficult), I've tried to make it a little more |
42 | * developer friendly. You should be able to use the methods here on | 42 | * developer friendly. You should be able to use the methods here on |
43 | * existing DOMElement objects without passing an entire HTML document to | 43 | * existing DOMElement objects without passing an entire HTML document to |
44 | * be parsed. | 44 | * be parsed. |
45 | */ | 45 | */ |
46 | 46 | ||
47 | // This class allows us to do JavaScript like assignements to innerHTML | 47 | // This class allows us to do JavaScript like assignements to innerHTML |
48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); | 48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); |
49 | 49 | ||
50 | // Alternative usage (for testing only!) | 50 | // Alternative usage (for testing only!) |
51 | // uncomment the lines below and call Readability.php in your browser | 51 | // uncomment the lines below and call Readability.php in your browser |
52 | // passing it the URL of the page you'd like content from, e.g.: | 52 | // passing it the URL of the page you'd like content from, e.g.: |
53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php | 53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php |
54 | 54 | ||
55 | /* | 55 | /* |
56 | if (!isset($_GET['url']) || $_GET['url'] == '') { | 56 | if (!isset($_GET['url']) || $_GET['url'] == '') { |
57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); | 57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); |
58 | } | 58 | } |
59 | $url = $_GET['url']; | 59 | $url = $_GET['url']; |
60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; | 60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; |
61 | $html = file_get_contents($url); | 61 | $html = file_get_contents($url); |
62 | $r = new Readability($html, $url); | 62 | $r = new Readability($html, $url); |
63 | $r->init(); | 63 | $r->init(); |
64 | echo $r->articleContent->innerHTML; | 64 | echo $r->articleContent->innerHTML; |
65 | */ | 65 | */ |
66 | 66 | ||
67 | class Readability | 67 | class Readability |
68 | { | 68 | { |
69 | public $version = '1.7.1-without-multi-page'; | 69 | public $version = '1.7.1-without-multi-page'; |
70 | public $convertLinksToFootnotes = false; | 70 | public $convertLinksToFootnotes = false; |
71 | public $revertForcedParagraphElements = true; | 71 | public $revertForcedParagraphElements = true; |
72 | public $articleTitle; | 72 | public $articleTitle; |
73 | public $articleContent; | 73 | public $articleContent; |
74 | public $dom; | 74 | public $dom; |
75 | public $url = null; // optional - URL where HTML was retrieved | 75 | public $url = null; // optional - URL where HTML was retrieved |
76 | public $debug = false; | 76 | public $debug = false; |
77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 | 77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 |
78 | protected $body = null; // | 78 | protected $body = null; // |
79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later | 79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later |
80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. | 80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. |
81 | protected $success = false; // indicates whether we were able to extract or not | 81 | protected $success = false; // indicates whether we were able to extract or not |
82 | 82 | ||
83 | /** | 83 | /** |
84 | * All of the regular expressions in use within readability. | 84 | * All of the regular expressions in use within readability. |
85 | * Defined up here so we don't instantiate them repeatedly in loops. | 85 | * Defined up here so we don't instantiate them repeatedly in loops. |
86 | **/ | 86 | **/ |
87 | public $regexps = array( | 87 | public $regexps = array( |
88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', | 88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', |
89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', | 89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', | 90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', |
91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', | 91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', |
92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', | 92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', |
93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', | 93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', |
94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', | 94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', |
95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() | 95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() |
96 | 'normalize' => '/\s{2,}/', | 96 | 'normalize' => '/\s{2,}/', |
97 | 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/', | 97 | 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/', |
98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', | 98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', |
99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' | 99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' |
100 | ); | 100 | ); |
101 | 101 | ||
102 | /* constants */ | 102 | /* constants */ |
103 | const FLAG_STRIP_UNLIKELYS = 1; | 103 | const FLAG_STRIP_UNLIKELYS = 1; |
104 | const FLAG_WEIGHT_CLASSES = 2; | 104 | const FLAG_WEIGHT_CLASSES = 2; |
105 | const FLAG_CLEAN_CONDITIONALLY = 4; | 105 | const FLAG_CLEAN_CONDITIONALLY = 4; |
106 | 106 | ||
107 | /** | 107 | /** |
108 | * Create instance of Readability | 108 | * Create instance of Readability |
109 | * @param string UTF-8 encoded string | 109 | * @param string UTF-8 encoded string |
110 | * @param string (optional) URL associated with HTML (used for footnotes) | 110 | * @param string (optional) URL associated with HTML (used for footnotes) |
111 | * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') | 111 | * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') |
112 | */ | 112 | */ |
113 | function __construct($html, $url=null, $parser='libxml') | 113 | function __construct($html, $url=null, $parser='libxml') |
114 | { | 114 | { |
115 | $this->url = $url; | 115 | $this->url = $url; |
116 | /* Turn all double br's into p's */ | 116 | /* Turn all double br's into p's */ |
117 | $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); | 117 | $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); |
118 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); | 118 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); |
119 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); | 119 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); |
120 | if (trim($html) == '') $html = '<html></html>'; | 120 | if (trim($html) == '') $html = '<html></html>'; |
121 | if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { | 121 | if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { |
122 | // all good | 122 | // all good |
123 | } else { | 123 | } else { |
124 | $this->dom = new DOMDocument(); | 124 | $this->dom = new DOMDocument(); |
125 | $this->dom->preserveWhiteSpace = false; | 125 | $this->dom->preserveWhiteSpace = false; |
126 | @$this->dom->loadHTML($html); | 126 | @$this->dom->loadHTML($html); |
127 | } | 127 | } |
128 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); | 128 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); |
129 | } | 129 | } |
130 | 130 | ||
131 | /** | 131 | /** |
132 | * Get article title element | 132 | * Get article title element |
133 | * @return DOMElement | 133 | * @return DOMElement |
134 | */ | 134 | */ |
135 | public function getTitle() { | 135 | public function getTitle() { |
136 | return $this->articleTitle; | 136 | return $this->articleTitle; |
137 | } | 137 | } |
138 | 138 | ||
139 | /** | 139 | /** |
140 | * Get article content element | 140 | * Get article content element |
141 | * @return DOMElement | 141 | * @return DOMElement |
142 | */ | 142 | */ |
143 | public function getContent() { | 143 | public function getContent() { |
144 | return $this->articleContent; | 144 | return $this->articleContent; |
145 | } | 145 | } |
146 | 146 | ||
147 | /** | 147 | /** |
148 | * Runs readability. | 148 | * Runs readability. |
149 | * | 149 | * |
150 | * Workflow: | 150 | * Workflow: |
151 | * 1. Prep the document by removing script tags, css, etc. | 151 | * 1. Prep the document by removing script tags, css, etc. |
152 | * 2. Build readability's DOM tree. | 152 | * 2. Build readability's DOM tree. |
153 | * 3. Grab the article content from the current dom tree. | 153 | * 3. Grab the article content from the current dom tree. |
154 | * 4. Replace the current DOM tree with the new one. | 154 | * 4. Replace the current DOM tree with the new one. |
155 | * 5. Read peacefully. | 155 | * 5. Read peacefully. |
156 | * | 156 | * |
157 | * @return boolean true if we found content, false otherwise | 157 | * @return boolean true if we found content, false otherwise |
158 | **/ | 158 | **/ |
159 | public function init() | 159 | public function init() |
160 | { | 160 | { |
161 | if (!isset($this->dom->documentElement)) return false; | 161 | if (!isset($this->dom->documentElement)) return false; |
162 | $this->removeScripts($this->dom); | 162 | $this->removeScripts($this->dom); |
163 | //die($this->getInnerHTML($this->dom->documentElement)); | 163 | //die($this->getInnerHTML($this->dom->documentElement)); |
164 | 164 | ||
165 | // Assume successful outcome | 165 | // Assume successful outcome |
166 | $this->success = true; | 166 | $this->success = true; |
167 | 167 | ||
168 | $bodyElems = $this->dom->getElementsByTagName('body'); | 168 | $bodyElems = $this->dom->getElementsByTagName('body'); |
169 | if ($bodyElems->length > 0) { | 169 | if ($bodyElems->length > 0) { |
170 | if ($this->bodyCache == null) { | 170 | if ($this->bodyCache == null) { |
171 | $this->bodyCache = $bodyElems->item(0)->innerHTML; | 171 | $this->bodyCache = $bodyElems->item(0)->innerHTML; |
172 | } | 172 | } |
173 | if ($this->body == null) { | 173 | if ($this->body == null) { |
174 | $this->body = $bodyElems->item(0); | 174 | $this->body = $bodyElems->item(0); |
175 | } | 175 | } |
176 | } | 176 | } |
177 | 177 | ||
178 | $this->prepDocument(); | 178 | $this->prepDocument(); |
179 | 179 | ||
180 | //die($this->dom->documentElement->parentNode->nodeType); | 180 | //die($this->dom->documentElement->parentNode->nodeType); |
181 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); | 181 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); |
182 | //die($this->getInnerHTML($this->dom->documentElement)); | 182 | //die($this->getInnerHTML($this->dom->documentElement)); |
183 | 183 | ||
184 | /* Build readability's DOM tree */ | 184 | /* Build readability's DOM tree */ |
185 | $overlay = $this->dom->createElement('div'); | 185 | $overlay = $this->dom->createElement('div'); |
186 | $innerDiv = $this->dom->createElement('div'); | 186 | $innerDiv = $this->dom->createElement('div'); |
187 | $articleTitle = $this->getArticleTitle(); | 187 | $articleTitle = $this->getArticleTitle(); |
188 | $articleContent = $this->grabArticle(); | 188 | $articleContent = $this->grabArticle(); |
189 | 189 | ||
190 | if (!$articleContent) { | 190 | if (!$articleContent) { |
191 | $this->success = false; | 191 | $this->success = false; |
192 | $articleContent = $this->dom->createElement('div'); | 192 | $articleContent = $this->dom->createElement('div'); |
193 | $articleContent->setAttribute('id', 'readability-content'); | 193 | $articleContent->setAttribute('id', 'readability-content'); |
194 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; | 194 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; |
195 | } | 195 | } |
196 | 196 | ||
197 | $overlay->setAttribute('id', 'readOverlay'); | 197 | $overlay->setAttribute('id', 'readOverlay'); |
198 | $innerDiv->setAttribute('id', 'readInner'); | 198 | $innerDiv->setAttribute('id', 'readInner'); |
199 | 199 | ||
200 | /* Glue the structure of our document together. */ | 200 | /* Glue the structure of our document together. */ |
201 | $innerDiv->appendChild($articleTitle); | 201 | $innerDiv->appendChild($articleTitle); |
202 | $innerDiv->appendChild($articleContent); | 202 | $innerDiv->appendChild($articleContent); |
203 | $overlay->appendChild($innerDiv); | 203 | $overlay->appendChild($innerDiv); |
204 | 204 | ||
205 | /* Clear the old HTML, insert the new content. */ | 205 | /* Clear the old HTML, insert the new content. */ |
206 | $this->body->innerHTML = ''; | 206 | $this->body->innerHTML = ''; |
207 | $this->body->appendChild($overlay); | 207 | $this->body->appendChild($overlay); |
208 | //document.body.insertBefore(overlay, document.body.firstChild); | 208 | //document.body.insertBefore(overlay, document.body.firstChild); |
209 | $this->body->removeAttribute('style'); | 209 | $this->body->removeAttribute('style'); |
210 | 210 | ||
211 | $this->postProcessContent($articleContent); | 211 | $this->postProcessContent($articleContent); |
212 | 212 | ||
213 | // Set title and content instance variables | 213 | // Set title and content instance variables |
214 | $this->articleTitle = $articleTitle; | 214 | $this->articleTitle = $articleTitle; |
215 | $this->articleContent = $articleContent; | 215 | $this->articleContent = $articleContent; |
216 | 216 | ||
217 | return $this->success; | 217 | return $this->success; |
218 | } | 218 | } |
219 | 219 | ||
220 | /** | 220 | /** |
221 | * Debug | 221 | * Debug |
222 | */ | 222 | */ |
223 | protected function dbg($msg) { | 223 | protected function dbg($msg) { |
224 | if ($this->debug) echo '* ',$msg, "\n"; | 224 | if ($this->debug) echo '* ',$msg, "\n"; |
225 | } | 225 | } |
226 | 226 | ||
227 | /** | 227 | /** |
228 | * Run any post-process modifications to article content as necessary. | 228 | * Run any post-process modifications to article content as necessary. |
229 | * | 229 | * |
230 | * @param DOMElement | 230 | * @param DOMElement |
231 | * @return void | 231 | * @return void |
232 | */ | 232 | */ |
233 | public function postProcessContent($articleContent) { | 233 | public function postProcessContent($articleContent) { |
234 | if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { | 234 | if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { |
235 | $this->addFootnotes($articleContent); | 235 | $this->addFootnotes($articleContent); |
236 | } | 236 | } |
237 | } | 237 | } |
238 | 238 | ||
239 | /** | 239 | /** |
240 | * Get the article title as an H1. | 240 | * Get the article title as an H1. |
241 | * | 241 | * |
242 | * @return DOMElement | 242 | * @return DOMElement |
243 | */ | 243 | */ |
244 | protected function getArticleTitle() { | 244 | protected function getArticleTitle() { |
245 | $curTitle = ''; | 245 | $curTitle = ''; |
246 | $origTitle = ''; | 246 | $origTitle = ''; |
247 | 247 | ||
248 | try { | 248 | try { |
249 | $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); | 249 | $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); |
250 | } catch(Exception $e) {} | 250 | } catch(Exception $e) {} |
251 | 251 | ||
252 | if (preg_match('/ [\|\-] /', $curTitle)) | 252 | if (preg_match('/ [\|\-] /', $curTitle)) |
253 | { | 253 | { |
254 | $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); | 254 | $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); |
255 | 255 | ||
256 | if (count(explode(' ', $curTitle)) < 3) { | 256 | if (count(explode(' ', $curTitle)) < 3) { |
257 | $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); | 257 | $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); |
258 | } | 258 | } |
259 | } | 259 | } |
260 | else if (strpos($curTitle, ': ') !== false) | 260 | else if (strpos($curTitle, ': ') !== false) |
261 | { | 261 | { |
262 | $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); | 262 | $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); |
263 | 263 | ||
264 | if (count(explode(' ', $curTitle)) < 3) { | 264 | if (count(explode(' ', $curTitle)) < 3) { |
265 | $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); | 265 | $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); |
266 | } | 266 | } |
267 | } | 267 | } |
268 | else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) | 268 | else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) |
269 | { | 269 | { |
270 | $hOnes = $this->dom->getElementsByTagName('h1'); | 270 | $hOnes = $this->dom->getElementsByTagName('h1'); |
271 | if($hOnes->length == 1) | 271 | if($hOnes->length == 1) |
272 | { | 272 | { |
273 | $curTitle = $this->getInnerText($hOnes->item(0)); | 273 | $curTitle = $this->getInnerText($hOnes->item(0)); |
274 | } | 274 | } |
275 | } | 275 | } |
276 | 276 | ||
277 | $curTitle = trim($curTitle); | 277 | $curTitle = trim($curTitle); |
278 | 278 | ||
279 | if (count(explode(' ', $curTitle)) <= 4) { | 279 | if (count(explode(' ', $curTitle)) <= 4) { |
280 | $curTitle = $origTitle; | 280 | $curTitle = $origTitle; |
281 | } | 281 | } |
282 | 282 | ||
283 | $articleTitle = $this->dom->createElement('h1'); | 283 | $articleTitle = $this->dom->createElement('h1'); |
284 | $articleTitle->innerHTML = $curTitle; | 284 | $articleTitle->innerHTML = $curTitle; |
285 | 285 | ||
286 | return $articleTitle; | 286 | return $articleTitle; |
287 | } | 287 | } |
288 | 288 | ||
289 | /** | 289 | /** |
290 | * Prepare the HTML document for readability to scrape it. | 290 | * Prepare the HTML document for readability to scrape it. |
291 | * This includes things like stripping javascript, CSS, and handling terrible markup. | 291 | * This includes things like stripping javascript, CSS, and handling terrible markup. |
292 | * | 292 | * |
293 | * @return void | 293 | * @return void |
294 | **/ | 294 | **/ |
295 | protected function prepDocument() { | 295 | protected function prepDocument() { |
296 | /** | 296 | /** |
297 | * In some cases a body element can't be found (if the HTML is totally hosed for example) | 297 | * In some cases a body element can't be found (if the HTML is totally hosed for example) |
298 | * so we create a new body node and append it to the document. | 298 | * so we create a new body node and append it to the document. |
299 | */ | 299 | */ |
300 | if ($this->body == null) | 300 | if ($this->body == null) |
301 | { | 301 | { |
302 | $this->body = $this->dom->createElement('body'); | 302 | $this->body = $this->dom->createElement('body'); |
303 | $this->dom->documentElement->appendChild($this->body); | 303 | $this->dom->documentElement->appendChild($this->body); |
304 | } | 304 | } |
305 | $this->body->setAttribute('id', 'readabilityBody'); | 305 | $this->body->setAttribute('id', 'readabilityBody'); |
306 | 306 | ||
307 | /* Remove all style tags in head */ | 307 | /* Remove all style tags in head */ |
308 | $styleTags = $this->dom->getElementsByTagName('style'); | 308 | $styleTags = $this->dom->getElementsByTagName('style'); |
309 | for ($i = $styleTags->length-1; $i >= 0; $i--) | 309 | for ($i = $styleTags->length-1; $i >= 0; $i--) |
310 | { | 310 | { |
311 | $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); | 311 | $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); |
312 | } | 312 | } |
313 | 313 | ||
314 | /* Turn all double br's into p's */ | 314 | /* Turn all double br's into p's */ |
315 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ | 315 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ |
316 | //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); | 316 | //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); |
317 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. | 317 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. |
318 | // Manipulating innerHTML as it's done in JS is not possible in PHP. | 318 | // Manipulating innerHTML as it's done in JS is not possible in PHP. |
319 | } | 319 | } |
320 | 320 | ||
321 | /** | 321 | /** |
322 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. | 322 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. |
323 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php | 323 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php |
324 | * | 324 | * |
325 | * @return void | 325 | * @return void |
326 | **/ | 326 | **/ |
327 | public function addFootnotes($articleContent) { | 327 | public function addFootnotes($articleContent) { |
328 | $footnotesWrapper = $this->dom->createElement('div'); | 328 | $footnotesWrapper = $this->dom->createElement('div'); |
329 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); | 329 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); |
330 | $footnotesWrapper->innerHTML = '<h3>References</h3>'; | 330 | $footnotesWrapper->innerHTML = '<h3>References</h3>'; |
331 | 331 | ||
332 | $articleFootnotes = $this->dom->createElement('ol'); | 332 | $articleFootnotes = $this->dom->createElement('ol'); |
333 | $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); | 333 | $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); |
334 | $footnotesWrapper->appendChild($articleFootnotes); | 334 | $footnotesWrapper->appendChild($articleFootnotes); |
335 | 335 | ||
336 | $articleLinks = $articleContent->getElementsByTagName('a'); | 336 | $articleLinks = $articleContent->getElementsByTagName('a'); |
337 | 337 | ||
338 | $linkCount = 0; | 338 | $linkCount = 0; |
339 | for ($i = 0; $i < $articleLinks->length; $i++) | 339 | for ($i = 0; $i < $articleLinks->length; $i++) |
340 | { | 340 | { |
341 | $articleLink = $articleLinks->item($i); | 341 | $articleLink = $articleLinks->item($i); |
342 | $footnoteLink = $articleLink->cloneNode(true); | 342 | $footnoteLink = $articleLink->cloneNode(true); |
343 | $refLink = $this->dom->createElement('a'); | 343 | $refLink = $this->dom->createElement('a'); |
344 | $footnote = $this->dom->createElement('li'); | 344 | $footnote = $this->dom->createElement('li'); |
345 | $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); | 345 | $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); |
346 | if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); | 346 | if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); |
347 | //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, | 347 | //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, |
348 | $linkText = $this->getInnerText($articleLink); | 348 | $linkText = $this->getInnerText($articleLink); |
349 | 349 | ||
350 | if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { | 350 | if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { |
351 | continue; | 351 | continue; |
352 | } | 352 | } |
353 | 353 | ||
354 | $linkCount++; | 354 | $linkCount++; |
355 | 355 | ||
356 | /** Add a superscript reference after the article link */ | 356 | /** Add a superscript reference after the article link */ |
357 | $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); | 357 | $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); |
358 | $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; | 358 | $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; |
359 | $refLink->setAttribute('class', 'readability-DoNotFootnote'); | 359 | $refLink->setAttribute('class', 'readability-DoNotFootnote'); |
360 | $refLink->setAttribute('style', 'color: inherit;'); | 360 | $refLink->setAttribute('style', 'color: inherit;'); |
361 | 361 | ||
362 | //TODO: does this work or should we use DOMNode.isSameNode()? | 362 | //TODO: does this work or should we use DOMNode.isSameNode()? |
363 | if ($articleLink->parentNode->lastChild == $articleLink) { | 363 | if ($articleLink->parentNode->lastChild == $articleLink) { |
364 | $articleLink->parentNode->appendChild($refLink); | 364 | $articleLink->parentNode->appendChild($refLink); |
365 | } else { | 365 | } else { |
366 | $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); | 366 | $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); |
367 | } | 367 | } |
368 | 368 | ||
369 | $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); | 369 | $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); |
370 | $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); | 370 | $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); |
371 | 371 | ||
372 | $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; | 372 | $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; |
373 | 373 | ||
374 | $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); | 374 | $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); |
375 | $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); | 375 | $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); |
376 | 376 | ||
377 | $footnote->appendChild($footnoteLink); | 377 | $footnote->appendChild($footnoteLink); |
378 | if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; | 378 | if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; |
379 | 379 | ||
380 | $articleFootnotes->appendChild($footnote); | 380 | $articleFootnotes->appendChild($footnote); |
381 | } | 381 | } |
382 | 382 | ||
383 | if ($linkCount > 0) { | 383 | if ($linkCount > 0) { |
384 | $articleContent->appendChild($footnotesWrapper); | 384 | $articleContent->appendChild($footnotesWrapper); |
385 | } | 385 | } |
386 | } | 386 | } |
387 | 387 | ||
388 | /** | 388 | /** |
389 | * Reverts P elements with class 'readability-styled' | 389 | * Reverts P elements with class 'readability-styled' |
390 | * to text nodes - which is what they were before. | 390 | * to text nodes - which is what they were before. |
391 | * | 391 | * |
392 | * @param DOMElement | 392 | * @param DOMElement |
393 | * @return void | 393 | * @return void |
394 | */ | 394 | */ |
395 | function revertReadabilityStyledElements($articleContent) { | 395 | function revertReadabilityStyledElements($articleContent) { |
396 | $xpath = new DOMXPath($articleContent->ownerDocument); | 396 | $xpath = new DOMXPath($articleContent->ownerDocument); |
397 | $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); | 397 | $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); |
398 | //$elems = $articleContent->getElementsByTagName('p'); | 398 | //$elems = $articleContent->getElementsByTagName('p'); |
399 | for ($i = $elems->length-1; $i >= 0; $i--) { | 399 | for ($i = $elems->length-1; $i >= 0; $i--) { |
400 | $e = $elems->item($i); | 400 | $e = $elems->item($i); |
401 | $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); | 401 | $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); |
402 | //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') { | 402 | //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') { |
403 | // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e); | 403 | // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e); |
404 | //} | 404 | //} |
405 | } | 405 | } |
406 | } | 406 | } |
407 | 407 | ||
408 | /** | 408 | /** |
409 | * Prepare the article node for display. Clean out any inline styles, | 409 | * Prepare the article node for display. Clean out any inline styles, |
410 | * iframes, forms, strip extraneous <p> tags, etc. | 410 | * iframes, forms, strip extraneous <p> tags, etc. |
411 | * | 411 | * |
412 | * @param DOMElement | 412 | * @param DOMElement |
413 | * @return void | 413 | * @return void |
414 | */ | 414 | */ |
415 | function prepArticle($articleContent) { | 415 | function prepArticle($articleContent) { |
416 | $this->cleanStyles($articleContent); | 416 | $this->cleanStyles($articleContent); |
417 | $this->killBreaks($articleContent); | 417 | $this->killBreaks($articleContent); |
418 | if ($this->revertForcedParagraphElements) { | 418 | if ($this->revertForcedParagraphElements) { |
419 | $this->revertReadabilityStyledElements($articleContent); | 419 | $this->revertReadabilityStyledElements($articleContent); |
420 | } | 420 | } |
421 | 421 | ||
422 | /* Clean out junk from the article content */ | 422 | /* Clean out junk from the article content */ |
423 | $this->cleanConditionally($articleContent, 'form'); | 423 | $this->cleanConditionally($articleContent, 'form'); |
424 | $this->clean($articleContent, 'object'); | 424 | $this->clean($articleContent, 'object'); |
425 | $this->clean($articleContent, 'h1'); | 425 | $this->clean($articleContent, 'h1'); |
426 | 426 | ||
427 | /** | 427 | /** |
428 | * If there is only one h2, they are probably using it | 428 | * If there is only one h2, they are probably using it |
429 | * as a header and not a subheader, so remove it since we already have a header. | 429 | * as a header and not a subheader, so remove it since we already have a header. |
430 | ***/ | 430 | ***/ |
431 | if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { | 431 | if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { |
432 | $this->clean($articleContent, 'h2'); | 432 | $this->clean($articleContent, 'h2'); |
433 | } | 433 | } |
434 | $this->clean($articleContent, 'iframe'); | 434 | $this->clean($articleContent, 'iframe'); |
435 | 435 | ||
436 | $this->cleanHeaders($articleContent); | 436 | $this->cleanHeaders($articleContent); |
437 | 437 | ||
438 | /* Do these last as the previous stuff may have removed junk that will affect these */ | 438 | /* Do these last as the previous stuff may have removed junk that will affect these */ |
439 | $this->cleanConditionally($articleContent, 'table'); | 439 | $this->cleanConditionally($articleContent, 'table'); |
440 | $this->cleanConditionally($articleContent, 'ul'); | 440 | $this->cleanConditionally($articleContent, 'ul'); |
441 | $this->cleanConditionally($articleContent, 'div'); | 441 | $this->cleanConditionally($articleContent, 'div'); |
442 | 442 | ||
443 | /* Remove extra paragraphs */ | 443 | /* Remove extra paragraphs */ |
444 | $articleParagraphs = $articleContent->getElementsByTagName('p'); | 444 | $articleParagraphs = $articleContent->getElementsByTagName('p'); |
445 | for ($i = $articleParagraphs->length-1; $i >= 0; $i--) | 445 | for ($i = $articleParagraphs->length-1; $i >= 0; $i--) |
446 | { | 446 | { |
447 | $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; | 447 | $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; |
448 | $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; | 448 | $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; |
449 | $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; | 449 | $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; |
450 | $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; | 450 | $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; |
451 | 451 | ||
452 | if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') | 452 | if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') |
453 | { | 453 | { |
454 | $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); | 454 | $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); |
455 | } | 455 | } |
456 | } | 456 | } |
457 | 457 | ||
458 | try { | 458 | try { |
459 | $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML); | 459 | $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML); |
460 | //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p'); | 460 | //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p'); |
461 | } | 461 | } |
462 | catch (Exception $e) { | 462 | catch (Exception $e) { |
463 | $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); | 463 | $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); |
464 | } | 464 | } |
465 | } | 465 | } |
466 | 466 | ||
467 | /** | 467 | /** |
468 | * Initialize a node with the readability object. Also checks the | 468 | * Initialize a node with the readability object. Also checks the |
469 | * className/id for special names to add to its score. | 469 | * className/id for special names to add to its score. |
470 | * | 470 | * |
471 | * @param Element | 471 | * @param Element |
472 | * @return void | 472 | * @return void |
473 | **/ | 473 | **/ |
474 | protected function initializeNode($node) { | 474 | protected function initializeNode($node) { |
475 | $readability = $this->dom->createAttribute('readability'); | 475 | $readability = $this->dom->createAttribute('readability'); |
476 | $readability->value = 0; // this is our contentScore | 476 | $readability->value = 0; // this is our contentScore |
477 | $node->setAttributeNode($readability); | 477 | $node->setAttributeNode($readability); |
478 | 478 | ||
479 | switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case | 479 | switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case |
480 | case 'DIV': | 480 | case 'DIV': |
481 | $readability->value += 5; | 481 | $readability->value += 5; |
482 | break; | 482 | break; |
483 | 483 | ||
484 | case 'PRE': | 484 | case 'PRE': |
485 | case 'TD': | 485 | case 'TD': |
486 | case 'BLOCKQUOTE': | 486 | case 'BLOCKQUOTE': |
487 | $readability->value += 3; | 487 | $readability->value += 3; |
488 | break; | 488 | break; |
489 | 489 | ||
490 | case 'ADDRESS': | 490 | case 'ADDRESS': |
491 | case 'OL': | 491 | case 'OL': |
492 | case 'UL': | 492 | case 'UL': |
493 | case 'DL': | 493 | case 'DL': |
494 | case 'DD': | 494 | case 'DD': |
495 | case 'DT': | 495 | case 'DT': |
496 | case 'LI': | 496 | case 'LI': |
497 | case 'FORM': | 497 | case 'FORM': |
498 | $readability->value -= 3; | 498 | $readability->value -= 3; |
499 | break; | 499 | break; |
500 | 500 | ||
501 | case 'H1': | 501 | case 'H1': |
502 | case 'H2': | 502 | case 'H2': |
503 | case 'H3': | 503 | case 'H3': |
504 | case 'H4': | 504 | case 'H4': |
505 | case 'H5': | 505 | case 'H5': |
506 | case 'H6': | 506 | case 'H6': |
507 | case 'TH': | 507 | case 'TH': |
508 | $readability->value -= 5; | 508 | $readability->value -= 5; |
509 | break; | 509 | break; |
510 | } | 510 | } |
511 | $readability->value += $this->getClassWeight($node); | 511 | $readability->value += $this->getClassWeight($node); |
512 | } | 512 | } |
513 | 513 | ||
514 | /*** | 514 | /*** |
515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | 515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | 516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
517 | * | 517 | * |
518 | * @return DOMElement | 518 | * @return DOMElement |
519 | **/ | 519 | **/ |
520 | protected function grabArticle($page=null) { | 520 | protected function grabArticle($page=null) { |
521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); | 521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); |
522 | if (!$page) $page = $this->dom; | 522 | if (!$page) $page = $this->dom; |
523 | $allElements = $page->getElementsByTagName('*'); | 523 | $allElements = $page->getElementsByTagName('*'); |
524 | /** | 524 | /** |
525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | 525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs |
526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | 526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) |
527 | * | 527 | * |
528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | 528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
529 | * TODO: Shouldn't this be a reverse traversal? | 529 | * TODO: Shouldn't this be a reverse traversal? |
530 | **/ | 530 | **/ |
531 | $node = null; | 531 | $node = null; |
532 | $nodesToScore = array(); | 532 | $nodesToScore = array(); |
533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { | 533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { |
534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { | 534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { |
535 | //$node = $targetList->item($nodeIndex); | 535 | //$node = $targetList->item($nodeIndex); |
536 | $tagName = strtoupper($node->tagName); | 536 | $tagName = strtoupper($node->tagName); |
537 | /* Remove unlikely candidates */ | 537 | /* Remove unlikely candidates */ |
538 | if ($stripUnlikelyCandidates) { | 538 | if ($stripUnlikelyCandidates) { |
539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); | 539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); |
540 | if ( | 540 | if ( |
541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && | 541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && |
542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && | 542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && |
543 | $tagName != 'BODY' | 543 | $tagName != 'BODY' |
544 | ) | 544 | ) |
545 | { | 545 | { |
546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); | 546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); |
547 | //$nodesToRemove[] = $node; | 547 | //$nodesToRemove[] = $node; |
548 | $node->parentNode->removeChild($node); | 548 | $node->parentNode->removeChild($node); |
549 | $nodeIndex--; | 549 | $nodeIndex--; |
550 | continue; | 550 | continue; |
551 | } | 551 | } |
552 | } | 552 | } |
553 | 553 | ||
554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { | 554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { |
555 | $nodesToScore[] = $node; | 555 | $nodesToScore[] = $node; |
556 | } | 556 | } |
557 | 557 | ||
558 | /* Turn all divs that don't have children block level elements into p's */ | 558 | /* Turn all divs that don't have children block level elements into p's */ |
559 | if ($tagName == 'DIV') { | 559 | if ($tagName == 'DIV') { |
560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { | 560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { |
561 | //$this->dbg('Altering div to p'); | 561 | //$this->dbg('Altering div to p'); |
562 | $newNode = $this->dom->createElement('p'); | 562 | $newNode = $this->dom->createElement('p'); |
563 | try { | 563 | try { |
564 | $newNode->innerHTML = $node->innerHTML; | 564 | $newNode->innerHTML = $node->innerHTML; |
565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); | 565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); |
566 | $node->parentNode->replaceChild($newNode, $node); | 566 | $node->parentNode->replaceChild($newNode, $node); |
567 | $nodeIndex--; | 567 | $nodeIndex--; |
568 | $nodesToScore[] = $node; // or $newNode? | 568 | $nodesToScore[] = $node; // or $newNode? |
569 | } | 569 | } |
570 | catch(Exception $e) { | 570 | catch(Exception $e) { |
571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); | 571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); |
572 | } | 572 | } |
573 | } | 573 | } |
574 | else | 574 | else |
575 | { | 575 | { |
576 | /* EXPERIMENTAL */ | 576 | /* EXPERIMENTAL */ |
577 | // TODO: change these p elements back to text nodes after processing | 577 | // TODO: change these p elements back to text nodes after processing |
578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { | 578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { |
579 | $childNode = $node->childNodes->item($i); | 579 | $childNode = $node->childNodes->item($i); |
580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE | 580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE |
581 | //$this->dbg('replacing text node with a p tag with the same content.'); | 581 | //$this->dbg('replacing text node with a p tag with the same content.'); |
582 | $p = $this->dom->createElement('p'); | 582 | $p = $this->dom->createElement('p'); |
583 | $p->innerHTML = $childNode->nodeValue; | 583 | $p->innerHTML = $childNode->nodeValue; |
584 | $p->setAttribute('style', 'display: inline;'); | 584 | $p->setAttribute('style', 'display: inline;'); |
585 | $p->setAttribute('class', 'readability-styled'); | 585 | $p->setAttribute('class', 'readability-styled'); |
586 | $childNode->parentNode->replaceChild($p, $childNode); | 586 | $childNode->parentNode->replaceChild($p, $childNode); |
587 | } | 587 | } |
588 | } | 588 | } |
589 | } | 589 | } |
590 | } | 590 | } |
591 | } | 591 | } |
592 | 592 | ||
593 | /** | 593 | /** |
594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. | 594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
595 | * Then add their score to their parent node. | 595 | * Then add their score to their parent node. |
596 | * | 596 | * |
597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. | 597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. |
598 | **/ | 598 | **/ |
599 | $candidates = array(); | 599 | $candidates = array(); |
600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { | 600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { |
601 | $parentNode = $nodesToScore[$pt]->parentNode; | 601 | $parentNode = $nodesToScore[$pt]->parentNode; |
602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; | 602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; |
603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); | 603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); |
604 | $innerText = $this->getInnerText($nodesToScore[$pt]); | 604 | $innerText = $this->getInnerText($nodesToScore[$pt]); |
605 | 605 | ||
606 | if (!$parentNode || !isset($parentNode->tagName)) { | 606 | if (!$parentNode || !isset($parentNode->tagName)) { |
607 | continue; | 607 | continue; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* If this paragraph is less than 25 characters, don't even count it. */ | 610 | /* If this paragraph is less than 25 characters, don't even count it. */ |
611 | if(strlen($innerText) < 25) { | 611 | if(strlen($innerText) < 25) { |
612 | continue; | 612 | continue; |
613 | } | 613 | } |
614 | 614 | ||
615 | /* Initialize readability data for the parent. */ | 615 | /* Initialize readability data for the parent. */ |
616 | if (!$parentNode->hasAttribute('readability')) | 616 | if (!$parentNode->hasAttribute('readability')) |
617 | { | 617 | { |
618 | $this->initializeNode($parentNode); | 618 | $this->initializeNode($parentNode); |
619 | $candidates[] = $parentNode; | 619 | $candidates[] = $parentNode; |
620 | } | 620 | } |
621 | 621 | ||
622 | /* Initialize readability data for the grandparent. */ | 622 | /* Initialize readability data for the grandparent. */ |
623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) | 623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) |
624 | { | 624 | { |
625 | $this->initializeNode($grandParentNode); | 625 | $this->initializeNode($grandParentNode); |
626 | $candidates[] = $grandParentNode; | 626 | $candidates[] = $grandParentNode; |
627 | } | 627 | } |
628 | 628 | ||
629 | $contentScore = 0; | 629 | $contentScore = 0; |
630 | 630 | ||
631 | /* Add a point for the paragraph itself as a base. */ | 631 | /* Add a point for the paragraph itself as a base. */ |
632 | $contentScore++; | 632 | $contentScore++; |
633 | 633 | ||
634 | /* Add points for any commas within this paragraph */ | 634 | /* Add points for any commas within this paragraph */ |
635 | $contentScore += count(explode(',', $innerText)); | 635 | $contentScore += count(explode(',', $innerText)); |
636 | 636 | ||
637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | 637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
638 | $contentScore += min(floor(strlen($innerText) / 100), 3); | 638 | $contentScore += min(floor(strlen($innerText) / 100), 3); |
639 | 639 | ||
640 | /* Add the score to the parent. The grandparent gets half. */ | 640 | /* Add the score to the parent. The grandparent gets half. */ |
641 | $parentNode->getAttributeNode('readability')->value += $contentScore; | 641 | $parentNode->getAttributeNode('readability')->value += $contentScore; |
642 | 642 | ||
643 | if ($grandParentNode) { | 643 | if ($grandParentNode) { |
644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; | 644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; |
645 | } | 645 | } |
646 | } | 646 | } |
647 | 647 | ||
648 | /** | 648 | /** |
649 | * After we've calculated scores, loop through all of the possible candidate nodes we found | 649 | * After we've calculated scores, loop through all of the possible candidate nodes we found |
650 | * and find the one with the highest score. | 650 | * and find the one with the highest score. |
651 | **/ | 651 | **/ |
652 | $topCandidate = null; | 652 | $topCandidate = null; |
653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) | 653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) |
654 | { | 654 | { |
655 | /** | 655 | /** |
656 | * Scale the final candidates score based on link density. Good content should have a | 656 | * Scale the final candidates score based on link density. Good content should have a |
657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. | 657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. |
658 | **/ | 658 | **/ |
659 | $readability = $candidates[$c]->getAttributeNode('readability'); | 659 | $readability = $candidates[$c]->getAttributeNode('readability'); |
660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); | 660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); |
661 | 661 | ||
662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); | 662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); |
663 | 663 | ||
664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { | 664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { |
665 | $topCandidate = $candidates[$c]; | 665 | $topCandidate = $candidates[$c]; |
666 | } | 666 | } |
667 | } | 667 | } |
668 | 668 | ||
669 | /** | 669 | /** |
670 | * If we still have no top candidate, just use the body as a last resort. | 670 | * If we still have no top candidate, just use the body as a last resort. |
671 | * We also have to copy the body node so it is something we can modify. | 671 | * We also have to copy the body node so it is something we can modify. |
672 | **/ | 672 | **/ |
673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') | 673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') |
674 | { | 674 | { |
675 | $topCandidate = $this->dom->createElement('div'); | 675 | $topCandidate = $this->dom->createElement('div'); |
676 | if ($page instanceof DOMDocument) { | 676 | if ($page instanceof DOMDocument) { |
677 | if (!isset($page->documentElement)) { | 677 | if (!isset($page->documentElement)) { |
678 | // we don't have a body either? what a mess! :) | 678 | // we don't have a body either? what a mess! :) |
679 | } else { | 679 | } else { |
680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; | 680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; |
681 | $page->documentElement->innerHTML = ''; | 681 | $page->documentElement->innerHTML = ''; |
682 | $page->documentElement->appendChild($topCandidate); | 682 | $page->documentElement->appendChild($topCandidate); |
683 | } | 683 | } |
684 | } else { | 684 | } else { |
685 | $topCandidate->innerHTML = $page->innerHTML; | 685 | $topCandidate->innerHTML = $page->innerHTML; |
686 | $page->innerHTML = ''; | 686 | $page->innerHTML = ''; |
687 | $page->appendChild($topCandidate); | 687 | $page->appendChild($topCandidate); |
688 | } | 688 | } |
689 | $this->initializeNode($topCandidate); | 689 | $this->initializeNode($topCandidate); |
690 | } | 690 | } |
691 | 691 | ||
692 | /** | 692 | /** |
693 | * Now that we have the top candidate, look through its siblings for content that might also be related. | 693 | * Now that we have the top candidate, look through its siblings for content that might also be related. |
694 | * Things like preambles, content split by ads that we removed, etc. | 694 | * Things like preambles, content split by ads that we removed, etc. |
695 | **/ | 695 | **/ |
696 | $articleContent = $this->dom->createElement('div'); | 696 | $articleContent = $this->dom->createElement('div'); |
697 | $articleContent->setAttribute('id', 'readability-content'); | 697 | $articleContent->setAttribute('id', 'readability-content'); |
698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); | 698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); |
699 | $siblingNodes = $topCandidate->parentNode->childNodes; | 699 | $siblingNodes = $topCandidate->parentNode->childNodes; |
700 | if (!isset($siblingNodes)) { | 700 | if (!isset($siblingNodes)) { |
701 | $siblingNodes = new stdClass; | 701 | $siblingNodes = new stdClass; |
702 | $siblingNodes->length = 0; | 702 | $siblingNodes->length = 0; |
703 | } | 703 | } |
704 | 704 | ||
705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) | 705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) |
706 | { | 706 | { |
707 | $siblingNode = $siblingNodes->item($s); | 707 | $siblingNode = $siblingNodes->item($s); |
708 | $append = false; | 708 | $append = false; |
709 | 709 | ||
710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); | 710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); |
711 | 711 | ||
712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); | 712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); |
713 | 713 | ||
714 | if ($siblingNode === $topCandidate) | 714 | if ($siblingNode === $topCandidate) |
715 | // or if ($siblingNode->isSameNode($topCandidate)) | 715 | // or if ($siblingNode->isSameNode($topCandidate)) |
716 | { | 716 | { |
717 | $append = true; | 717 | $append = true; |
718 | } | 718 | } |
719 | 719 | ||
720 | $contentBonus = 0; | 720 | $contentBonus = 0; |
721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ | 721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ |
722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { | 722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { |
723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; | 723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; |
724 | } | 724 | } |
725 | 725 | ||
726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) | 726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) |
727 | { | 727 | { |
728 | $append = true; | 728 | $append = true; |
729 | } | 729 | } |
730 | 730 | ||
731 | if (strtoupper($siblingNode->nodeName) == 'P') { | 731 | if (strtoupper($siblingNode->nodeName) == 'P') { |
732 | $linkDensity = $this->getLinkDensity($siblingNode); | 732 | $linkDensity = $this->getLinkDensity($siblingNode); |
733 | $nodeContent = $this->getInnerText($siblingNode); | 733 | $nodeContent = $this->getInnerText($siblingNode); |
734 | $nodeLength = strlen($nodeContent); | 734 | $nodeLength = strlen($nodeContent); |
735 | 735 | ||
736 | if ($nodeLength > 80 && $linkDensity < 0.25) | 736 | if ($nodeLength > 80 && $linkDensity < 0.25) |
737 | { | 737 | { |
738 | $append = true; | 738 | $append = true; |
739 | } | 739 | } |
740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) | 740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) |
741 | { | 741 | { |
742 | $append = true; | 742 | $append = true; |
743 | } | 743 | } |
744 | } | 744 | } |
745 | 745 | ||
746 | if ($append) | 746 | if ($append) |
747 | { | 747 | { |
748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); | 748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); |
749 | 749 | ||
750 | $nodeToAppend = null; | 750 | $nodeToAppend = null; |
751 | $sibNodeName = strtoupper($siblingNode->nodeName); | 751 | $sibNodeName = strtoupper($siblingNode->nodeName); |
752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { | 752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { |
753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | 753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
754 | 754 | ||
755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); | 755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); |
756 | $nodeToAppend = $this->dom->createElement('div'); | 756 | $nodeToAppend = $this->dom->createElement('div'); |
757 | try { | 757 | try { |
758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); | 758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); |
759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; | 759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; |
760 | } | 760 | } |
761 | catch(Exception $e) | 761 | catch(Exception $e) |
762 | { | 762 | { |
763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); | 763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); |
764 | $nodeToAppend = $siblingNode; | 764 | $nodeToAppend = $siblingNode; |
765 | $s--; | 765 | $s--; |
766 | $sl--; | 766 | $sl--; |
767 | } | 767 | } |
768 | } else { | 768 | } else { |
769 | $nodeToAppend = $siblingNode; | 769 | $nodeToAppend = $siblingNode; |
770 | $s--; | 770 | $s--; |
771 | $sl--; | 771 | $sl--; |
772 | } | 772 | } |
773 | 773 | ||
774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ | 774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ |
775 | $nodeToAppend->removeAttribute('class'); | 775 | $nodeToAppend->removeAttribute('class'); |
776 | 776 | ||
777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ | 777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ |
778 | $articleContent->appendChild($nodeToAppend); | 778 | $articleContent->appendChild($nodeToAppend); |
779 | } | 779 | } |
780 | } | 780 | } |
781 | 781 | ||
782 | /** | 782 | /** |
783 | * So we have all of the content that we need. Now we clean it up for presentation. | 783 | * So we have all of the content that we need. Now we clean it up for presentation. |
784 | **/ | 784 | **/ |
785 | $this->prepArticle($articleContent); | 785 | $this->prepArticle($articleContent); |
786 | 786 | ||
787 | /** | 787 | /** |
788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. | 788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. |
789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | 789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | 790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of |
791 | * finding the -right- content. | 791 | * finding the -right- content. |
792 | **/ | 792 | **/ |
793 | if (strlen($this->getInnerText($articleContent, false)) < 250) | 793 | if (strlen($this->getInnerText($articleContent, false)) < 250) |
794 | { | 794 | { |
795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 | 795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 |
796 | // in the meantime, we check and create an empty element if it's not there. | 796 | // in the meantime, we check and create an empty element if it's not there. |
797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); | 797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); |
798 | $this->body->innerHTML = $this->bodyCache; | 798 | $this->body->innerHTML = $this->bodyCache; |
799 | 799 | ||
800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { | 800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { |
801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); | 801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); |
802 | return $this->grabArticle($this->body); | 802 | return $this->grabArticle($this->body); |
803 | } | 803 | } |
804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | 804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { |
805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); | 805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); |
806 | return $this->grabArticle($this->body); | 806 | return $this->grabArticle($this->body); |
807 | } | 807 | } |
808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | 808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { |
809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); | 809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); |
810 | return $this->grabArticle($this->body); | 810 | return $this->grabArticle($this->body); |
811 | } | 811 | } |
812 | else { | 812 | else { |
813 | return false; | 813 | return false; |
814 | } | 814 | } |
815 | } | 815 | } |
816 | return $articleContent; | 816 | return $articleContent; |
817 | } | 817 | } |
818 | 818 | ||
819 | /** | 819 | /** |
820 | * Remove script tags from document | 820 | * Remove script tags from document |
821 | * | 821 | * |
822 | * @param DOMElement | 822 | * @param DOMElement |
823 | * @return void | 823 | * @return void |
824 | */ | 824 | */ |
825 | public function removeScripts($doc) { | 825 | public function removeScripts($doc) { |
826 | $scripts = $doc->getElementsByTagName('script'); | 826 | $scripts = $doc->getElementsByTagName('script'); |
827 | for($i = $scripts->length-1; $i >= 0; $i--) | 827 | for($i = $scripts->length-1; $i >= 0; $i--) |
828 | { | 828 | { |
829 | $scripts->item($i)->parentNode->removeChild($scripts->item($i)); | 829 | $scripts->item($i)->parentNode->removeChild($scripts->item($i)); |
830 | } | 830 | } |
831 | } | 831 | } |
832 | 832 | ||
833 | /** | 833 | /** |
834 | * Get the inner text of a node. | 834 | * Get the inner text of a node. |
835 | * This also strips out any excess whitespace to be found. | 835 | * This also strips out any excess whitespace to be found. |
836 | * | 836 | * |
837 | * @param DOMElement $ | 837 | * @param DOMElement $ |
838 | * @param boolean $normalizeSpaces (default: true) | 838 | * @param boolean $normalizeSpaces (default: true) |
839 | * @return string | 839 | * @return string |
840 | **/ | 840 | **/ |
841 | public function getInnerText($e, $normalizeSpaces=true) { | 841 | public function getInnerText($e, $normalizeSpaces=true) { |
842 | $textContent = ''; | 842 | $textContent = ''; |
843 | 843 | ||
844 | if (!isset($e->textContent) || $e->textContent == '') { | 844 | if (!isset($e->textContent) || $e->textContent == '') { |
845 | return ''; | 845 | return ''; |
846 | } | 846 | } |
847 | 847 | ||
848 | $textContent = trim($e->textContent); | 848 | $textContent = trim($e->textContent); |
849 | 849 | ||
850 | if ($normalizeSpaces) { | 850 | if ($normalizeSpaces) { |
851 | return preg_replace($this->regexps['normalize'], ' ', $textContent); | 851 | return preg_replace($this->regexps['normalize'], ' ', $textContent); |
852 | } else { | 852 | } else { |
853 | return $textContent; | 853 | return $textContent; |
854 | } | 854 | } |
855 | } | 855 | } |
856 | 856 | ||
857 | /** | 857 | /** |
858 | * Get the number of times a string $s appears in the node $e. | 858 | * Get the number of times a string $s appears in the node $e. |
859 | * | 859 | * |
860 | * @param DOMElement $e | 860 | * @param DOMElement $e |
861 | * @param string - what to count. Default is "," | 861 | * @param string - what to count. Default is "," |
862 | * @return number (integer) | 862 | * @return number (integer) |
863 | **/ | 863 | **/ |
864 | public function getCharCount($e, $s=',') { | 864 | public function getCharCount($e, $s=',') { |
865 | return substr_count($this->getInnerText($e), $s); | 865 | return substr_count($this->getInnerText($e), $s); |
866 | } | 866 | } |
867 | 867 | ||
868 | /** | 868 | /** |
869 | * Remove the style attribute on every $e and under. | 869 | * Remove the style attribute on every $e and under. |
870 | * | 870 | * |
871 | * @param DOMElement $e | 871 | * @param DOMElement $e |
872 | * @return void | 872 | * @return void |
873 | */ | 873 | */ |
874 | public function cleanStyles($e) { | 874 | public function cleanStyles($e) { |
875 | if (!is_object($e)) return; | 875 | if (!is_object($e)) return; |
876 | $elems = $e->getElementsByTagName('*'); | 876 | $elems = $e->getElementsByTagName('*'); |
877 | foreach ($elems as $elem) { | 877 | foreach ($elems as $elem) { |
878 | $elem->removeAttribute('style'); | 878 | $elem->removeAttribute('style'); |
879 | } | 879 | } |
880 | } | 880 | } |
881 | 881 | ||
882 | /** | 882 | /** |
883 | * Get the density of links as a percentage of the content | 883 | * Get the density of links as a percentage of the content |
884 | * This is the amount of text that is inside a link divided by the total text in the node. | 884 | * This is the amount of text that is inside a link divided by the total text in the node. |
885 | * | 885 | * |
886 | * @param DOMElement $e | 886 | * @param DOMElement $e |
887 | * @return number (float) | 887 | * @return number (float) |
888 | */ | 888 | */ |
889 | public function getLinkDensity($e) { | 889 | public function getLinkDensity($e) { |
890 | $links = $e->getElementsByTagName('a'); | 890 | $links = $e->getElementsByTagName('a'); |
891 | $textLength = strlen($this->getInnerText($e)); | 891 | $textLength = strlen($this->getInnerText($e)); |
892 | $linkLength = 0; | 892 | $linkLength = 0; |
893 | for ($i=0, $il=$links->length; $i < $il; $i++) | 893 | for ($i=0, $il=$links->length; $i < $il; $i++) |
894 | { | 894 | { |
895 | $linkLength += strlen($this->getInnerText($links->item($i))); | 895 | $linkLength += strlen($this->getInnerText($links->item($i))); |
896 | } | 896 | } |
897 | if ($textLength > 0) { | 897 | if ($textLength > 0) { |
898 | return $linkLength / $textLength; | 898 | return $linkLength / $textLength; |
899 | } else { | 899 | } else { |
900 | return 0; | 900 | return 0; |
901 | } | 901 | } |
902 | } | 902 | } |
903 | 903 | ||
904 | /** | 904 | /** |
905 | * Get an elements class/id weight. Uses regular expressions to tell if this | 905 | * Get an elements class/id weight. Uses regular expressions to tell if this |
906 | * element looks good or bad. | 906 | * element looks good or bad. |
907 | * | 907 | * |
908 | * @param DOMElement $e | 908 | * @param DOMElement $e |
909 | * @return number (Integer) | 909 | * @return number (Integer) |
910 | */ | 910 | */ |
911 | public function getClassWeight($e) { | 911 | public function getClassWeight($e) { |
912 | if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | 912 | if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { |
913 | return 0; | 913 | return 0; |
914 | } | 914 | } |
915 | 915 | ||
916 | $weight = 0; | 916 | $weight = 0; |
917 | 917 | ||
918 | /* Look for a special classname */ | 918 | /* Look for a special classname */ |
919 | if ($e->hasAttribute('class') && $e->getAttribute('class') != '') | 919 | if ($e->hasAttribute('class') && $e->getAttribute('class') != '') |
920 | { | 920 | { |
921 | if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { | 921 | if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { |
922 | $weight -= 25; | 922 | $weight -= 25; |
923 | } | 923 | } |
924 | if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { | 924 | if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { |
925 | $weight += 25; | 925 | $weight += 25; |
926 | } | 926 | } |
927 | } | 927 | } |
928 | 928 | ||
929 | /* Look for a special ID */ | 929 | /* Look for a special ID */ |
930 | if ($e->hasAttribute('id') && $e->getAttribute('id') != '') | 930 | if ($e->hasAttribute('id') && $e->getAttribute('id') != '') |
931 | { | 931 | { |
932 | if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { | 932 | if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { |
933 | $weight -= 25; | 933 | $weight -= 25; |
934 | } | 934 | } |
935 | if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { | 935 | if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { |
936 | $weight += 25; | 936 | $weight += 25; |
937 | } | 937 | } |
938 | } | 938 | } |
939 | return $weight; | 939 | return $weight; |
940 | } | 940 | } |
941 | 941 | ||
942 | /** | 942 | /** |
943 | * Remove extraneous break tags from a node. | 943 | * Remove extraneous break tags from a node. |
944 | * | 944 | * |
945 | * @param DOMElement $node | 945 | * @param DOMElement $node |
946 | * @return void | 946 | * @return void |
947 | */ | 947 | */ |
948 | public function killBreaks($node) { | 948 | public function killBreaks($node) { |
949 | $html = $node->innerHTML; | 949 | $html = $node->innerHTML; |
950 | $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); | 950 | $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); |
951 | $node->innerHTML = $html; | 951 | $node->innerHTML = $html; |
952 | } | 952 | } |
953 | 953 | ||
954 | /** | 954 | /** |
955 | * Clean a node of all elements of type "tag". | 955 | * Clean a node of all elements of type "tag". |
956 | * (Unless it's a youtube/vimeo video. People love movies.) | 956 | * (Unless it's a youtube/vimeo video. People love movies.) |
957 | * | 957 | * |
958 | * Updated 2012-09-18 to preserve youtube/vimeo iframes | 958 | * Updated 2012-09-18 to preserve youtube/vimeo iframes |
959 | * | 959 | * |
960 | * @param DOMElement $e | 960 | * @param DOMElement $e |
961 | * @param string $tag | 961 | * @param string $tag |
962 | * @return void | 962 | * @return void |
963 | */ | 963 | */ |
964 | public function clean($e, $tag) { | 964 | public function clean($e, $tag) { |
965 | $targetList = $e->getElementsByTagName($tag); | 965 | $targetList = $e->getElementsByTagName($tag); |
966 | $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); | 966 | $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); |
967 | 967 | ||
968 | for ($y=$targetList->length-1; $y >= 0; $y--) { | 968 | for ($y=$targetList->length-1; $y >= 0; $y--) { |
969 | /* Allow youtube and vimeo videos through as people usually want to see those. */ | 969 | /* Allow youtube and vimeo videos through as people usually want to see those. */ |
970 | if ($isEmbed) { | 970 | if ($isEmbed) { |
971 | $attributeValues = ''; | 971 | $attributeValues = ''; |
972 | for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { | 972 | for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { |
973 | $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) | 973 | $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) |
974 | } | 974 | } |
975 | 975 | ||
976 | /* First, check the elements attributes to see if any of them contain youtube or vimeo */ | 976 | /* First, check the elements attributes to see if any of them contain youtube or vimeo */ |
977 | if (preg_match($this->regexps['video'], $attributeValues)) { | 977 | if (preg_match($this->regexps['video'], $attributeValues)) { |
978 | continue; | 978 | continue; |
979 | } | 979 | } |
980 | 980 | ||
981 | /* Then check the elements inside this element for the same. */ | 981 | /* Then check the elements inside this element for the same. */ |
982 | if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { | 982 | if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { |
983 | continue; | 983 | continue; |
984 | } | 984 | } |
985 | } | 985 | } |
986 | $targetList->item($y)->parentNode->removeChild($targetList->item($y)); | 986 | $targetList->item($y)->parentNode->removeChild($targetList->item($y)); |
987 | } | 987 | } |
988 | } | 988 | } |
989 | 989 | ||
990 | /** | 990 | /** |
991 | * Clean an element of all tags of type "tag" if they look fishy. | 991 | * Clean an element of all tags of type "tag" if they look fishy. |
992 | * "Fishy" is an algorithm based on content length, classnames, | 992 | * "Fishy" is an algorithm based on content length, classnames, |
993 | * link density, number of images & embeds, etc. | 993 | * link density, number of images & embeds, etc. |
994 | * | 994 | * |
995 | * @param DOMElement $e | 995 | * @param DOMElement $e |
996 | * @param string $tag | 996 | * @param string $tag |
997 | * @return void | 997 | * @return void |
998 | */ | 998 | */ |
999 | public function cleanConditionally($e, $tag) { | 999 | public function cleanConditionally($e, $tag) { |
1000 | if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | 1000 | if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { |
1001 | return; | 1001 | return; |
1002 | } | 1002 | } |
1003 | 1003 | ||
1004 | $tagsList = $e->getElementsByTagName($tag); | 1004 | $tagsList = $e->getElementsByTagName($tag); |
1005 | $curTagsLength = $tagsList->length; | 1005 | $curTagsLength = $tagsList->length; |
1006 | 1006 | ||
1007 | /** | 1007 | /** |
1008 | * Gather counts for other typical elements embedded within. | 1008 | * Gather counts for other typical elements embedded within. |
1009 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal. | 1009 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal. |
1010 | * | 1010 | * |
1011 | * TODO: Consider taking into account original contentScore here. | 1011 | * TODO: Consider taking into account original contentScore here. |
1012 | */ | 1012 | */ |
1013 | for ($i=$curTagsLength-1; $i >= 0; $i--) { | 1013 | for ($i=$curTagsLength-1; $i >= 0; $i--) { |
1014 | $weight = $this->getClassWeight($tagsList->item($i)); | 1014 | $weight = $this->getClassWeight($tagsList->item($i)); |
1015 | $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; | 1015 | $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; |
1016 | 1016 | ||
1017 | $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); | 1017 | $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); |
1018 | 1018 | ||
1019 | if ($weight + $contentScore < 0) { | 1019 | if ($weight + $contentScore < 0) { |
1020 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); | 1020 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); |
1021 | } | 1021 | } |
1022 | else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { | 1022 | else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { |
1023 | /** | 1023 | /** |
1024 | * If there are not very many commas, and the number of | 1024 | * If there are not very many commas, and the number of |
1025 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. | 1025 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. |
1026 | **/ | 1026 | **/ |
1027 | $p = $tagsList->item($i)->getElementsByTagName('p')->length; | 1027 | $p = $tagsList->item($i)->getElementsByTagName('p')->length; |
1028 | $img = $tagsList->item($i)->getElementsByTagName('img')->length; | 1028 | $img = $tagsList->item($i)->getElementsByTagName('img')->length; |
1029 | $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; | 1029 | $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; |
1030 | $input = $tagsList->item($i)->getElementsByTagName('input')->length; | 1030 | $input = $tagsList->item($i)->getElementsByTagName('input')->length; |
1031 | $a = $tagsList->item($i)->getElementsByTagName('a')->length; | 1031 | $a = $tagsList->item($i)->getElementsByTagName('a')->length; |
1032 | 1032 | ||
1033 | $embedCount = 0; | 1033 | $embedCount = 0; |
1034 | $embeds = $tagsList->item($i)->getElementsByTagName('embed'); | 1034 | $embeds = $tagsList->item($i)->getElementsByTagName('embed'); |
1035 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { | 1035 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { |
1036 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { | 1036 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { |
1037 | $embedCount++; | 1037 | $embedCount++; |
1038 | } | 1038 | } |
1039 | } | 1039 | } |
1040 | $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); | 1040 | $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); |
1041 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { | 1041 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { |
1042 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { | 1042 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { |
1043 | $embedCount++; | 1043 | $embedCount++; |
1044 | } | 1044 | } |
1045 | } | 1045 | } |
1046 | 1046 | ||
1047 | $linkDensity = $this->getLinkDensity($tagsList->item($i)); | 1047 | $linkDensity = $this->getLinkDensity($tagsList->item($i)); |
1048 | $contentLength = strlen($this->getInnerText($tagsList->item($i))); | 1048 | $contentLength = strlen($this->getInnerText($tagsList->item($i))); |
1049 | $toRemove = false; | 1049 | $toRemove = false; |
1050 | 1050 | ||
1051 | if ($this->lightClean) { | 1051 | if ($this->lightClean) { |
1052 | $this->dbg('Light clean...'); | 1052 | $this->dbg('Light clean...'); |
1053 | if ( ($img > $p) && ($img > 4) ) { | 1053 | if ( ($img > $p) && ($img > 4) ) { |
1054 | $this->dbg(' more than 4 images and more image elements than paragraph elements'); | 1054 | $this->dbg(' more than 4 images and more image elements than paragraph elements'); |
1055 | $toRemove = true; | 1055 | $toRemove = true; |
1056 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { | 1056 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { |
1057 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); | 1057 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); |
1058 | $toRemove = true; | 1058 | $toRemove = true; |
1059 | } else if ( $input > floor($p/3) ) { | 1059 | } else if ( $input > floor($p/3) ) { |
1060 | $this->dbg(' too many <input> elements'); | 1060 | $this->dbg(' too many <input> elements'); |
1061 | $toRemove = true; | 1061 | $toRemove = true; |
1062 | } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { | 1062 | } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) { |
1063 | $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); | 1063 | $this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images'); |
1064 | $toRemove = true; | 1064 | $toRemove = true; |
1065 | } else if($weight < 25 && $linkDensity > 0.2) { | 1065 | } else if($weight < 25 && $linkDensity > 0.2) { |
1066 | $this->dbg(' weight smaller than 25 and link density above 0.2'); | 1066 | $this->dbg(' weight smaller than 25 and link density above 0.2'); |
1067 | $toRemove = true; | 1067 | $toRemove = true; |
1068 | } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { | 1068 | } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { |
1069 | $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); | 1069 | $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); |
1070 | $toRemove = true; | 1070 | $toRemove = true; |
1071 | } else if($embedCount > 3) { | 1071 | } else if($embedCount > 3) { |
1072 | $this->dbg(' more than 3 embeds'); | 1072 | $this->dbg(' more than 3 embeds'); |
1073 | $toRemove = true; | 1073 | $toRemove = true; |
1074 | } | 1074 | } |
1075 | } else { | 1075 | } else { |
1076 | $this->dbg('Standard clean...'); | 1076 | $this->dbg('Standard clean...'); |
1077 | if ( $img > $p ) { | 1077 | if ( $img > $p ) { |
1078 | $this->dbg(' more image elements than paragraph elements'); | 1078 | $this->dbg(' more image elements than paragraph elements'); |
1079 | $toRemove = true; | 1079 | $toRemove = true; |
1080 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { | 1080 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { |
1081 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); | 1081 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); |
1082 | $toRemove = true; | 1082 | $toRemove = true; |
1083 | } else if ( $input > floor($p/3) ) { | 1083 | } else if ( $input > floor($p/3) ) { |
1084 | $this->dbg(' too many <input> elements'); | 1084 | $this->dbg(' too many <input> elements'); |
1085 | $toRemove = true; | 1085 | $toRemove = true; |
1086 | } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { | 1086 | } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { |
1087 | $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); | 1087 | $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); |
1088 | $toRemove = true; | 1088 | $toRemove = true; |
1089 | } else if($weight < 25 && $linkDensity > 0.2) { | 1089 | } else if($weight < 25 && $linkDensity > 0.2) { |
1090 | $this->dbg(' weight smaller than 25 and link density above 0.2'); | 1090 | $this->dbg(' weight smaller than 25 and link density above 0.2'); |
1091 | $toRemove = true; | 1091 | $toRemove = true; |
1092 | } else if($weight >= 25 && $linkDensity > 0.5) { | 1092 | } else if($weight >= 25 && $linkDensity > 0.5) { |
1093 | $this->dbg(' weight above 25 but link density greater than 0.5'); | 1093 | $this->dbg(' weight above 25 but link density greater than 0.5'); |
1094 | $toRemove = true; | 1094 | $toRemove = true; |
1095 | } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { | 1095 | } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { |
1096 | $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); | 1096 | $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); |
1097 | $toRemove = true; | 1097 | $toRemove = true; |
1098 | } | 1098 | } |
1099 | } | 1099 | } |
1100 | 1100 | ||
1101 | if ($toRemove) { | 1101 | if ($toRemove) { |
1102 | //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); | 1102 | //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); |
1103 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); | 1103 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); |
1104 | } | 1104 | } |
1105 | } | 1105 | } |
1106 | } | 1106 | } |
1107 | } | 1107 | } |
1108 | 1108 | ||
1109 | /** | 1109 | /** |
1110 | * Clean out spurious headers from an Element. Checks things like classnames and link density. | 1110 | * Clean out spurious headers from an Element. Checks things like classnames and link density. |
1111 | * | 1111 | * |
1112 | * @param DOMElement $e | 1112 | * @param DOMElement $e |
1113 | * @return void | 1113 | * @return void |
1114 | */ | 1114 | */ |
1115 | public function cleanHeaders($e) { | 1115 | public function cleanHeaders($e) { |
1116 | for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { | 1116 | for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { |
1117 | $headers = $e->getElementsByTagName('h' . $headerIndex); | 1117 | $headers = $e->getElementsByTagName('h' . $headerIndex); |
1118 | for ($i=$headers->length-1; $i >=0; $i--) { | 1118 | for ($i=$headers->length-1; $i >=0; $i--) { |
1119 | if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { | 1119 | if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { |
1120 | $headers->item($i)->parentNode->removeChild($headers->item($i)); | 1120 | $headers->item($i)->parentNode->removeChild($headers->item($i)); |
1121 | } | 1121 | } |
1122 | } | 1122 | } |
1123 | } | 1123 | } |
1124 | } | 1124 | } |
1125 | 1125 | ||
1126 | public function flagIsActive($flag) { | 1126 | public function flagIsActive($flag) { |
1127 | return ($this->flags & $flag) > 0; | 1127 | return ($this->flags & $flag) > 0; |
1128 | } | 1128 | } |
1129 | 1129 | ||
1130 | public function addFlag($flag) { | 1130 | public function addFlag($flag) { |
1131 | $this->flags = $this->flags | $flag; | 1131 | $this->flags = $this->flags | $flag; |
1132 | } | 1132 | } |
1133 | 1133 | ||
1134 | public function removeFlag($flag) { | 1134 | public function removeFlag($flag) { |
1135 | $this->flags = $this->flags & ~$flag; | 1135 | $this->flags = $this->flags & ~$flag; |
1136 | } | 1136 | } |
1137 | } | 1137 | } |
1138 | ?> \ No newline at end of file | 1138 | ?> \ No newline at end of file |