aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc
diff options
context:
space:
mode:
author Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 12:50:28 +0200
committer Nicolas Lœuillet <nicolas@loeuillet.org> 2014-05-29 12:50:28 +0200
commit 87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b (patch)
tree 558818975ac41403e7d55ad07c5b0ac29806e907 /inc
parent ab157bbb75ba226917145c9bf906cbf764a85cd0 (diff)
parent 0b9bb8cb7868f24137c5d8b85c39cc88ea877411 (diff)
download wallabag-87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b.tar.gz
wallabag-87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b.tar.zst
wallabag-87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b.zip
Merge pull request #707 from mariroz/dev
update to 3.2 version of full-text-rss, issue #694
Diffstat (limited to 'inc')
-rwxr-xr-xinc/3rdparty/config.php104
-rw-r--r--inc/3rdparty/libraries/content-extractor/ContentExtractor.php1455
-rw-r--r--inc/3rdparty/libraries/content-extractor/SiteConfig.php681
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/libraries/feedwriter/FeedItem.php100
-rwxr-xr-xinc/3rdparty/libraries/feedwriter/FeedWriter.php17
-rw-r--r--inc/3rdparty/libraries/html5/TreeBuilder.php13
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/CookieJar.php807
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php1589
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php157
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect.php992
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php57
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php339
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php (renamed from inc/3rdparty/libraries/language-detect/Parser.php)19
-rw-r--r--inc/3rdparty/libraries/readability/Readability.php2274
-rwxr-xr-xinc/3rdparty/makefulltextfeed.php349
-rwxr-xr-xinc/3rdparty/makefulltextfeedHelpers.php42
-rwxr-xr-xinc/3rdparty/site_config/custom/dailymotion.com.txt12
-rw-r--r--inc/3rdparty/site_config/custom/index.php3
-rwxr-xr-xinc/3rdparty/site_config/custom/ted.com.txt11
-rw-r--r--inc/3rdparty/site_config/index.php5
-rw-r--r--inc/3rdparty/site_config/standard/version.txt2
-rwxr-xr-xinc/poche/Poche.class.php11
22 files changed, 4849 insertions, 4190 deletions
diff --git a/inc/3rdparty/config.php b/inc/3rdparty/config.php
index e618117b..ec680d86 100755
--- a/inc/3rdparty/config.php
+++ b/inc/3rdparty/config.php
@@ -19,7 +19,7 @@ if (!isset($options)) $options = new stdClass();
19// Enable service 19// Enable service
20// ---------------------- 20// ----------------------
21// Set this to false if you want to disable the service. 21// Set this to false if you want to disable the service.
22// If set to false, no feed is produced and users will 22// If set to false, no feed is produced and users will
23// be told that the service is disabled. 23// be told that the service is disabled.
24$options->enabled = true; 24$options->enabled = true;
25 25
@@ -43,10 +43,64 @@ $options->default_entries = 5;
43// ---------------------- 43// ----------------------
44// The maximum number of feed items to process when no access key is supplied. 44// The maximum number of feed items to process when no access key is supplied.
45// This limits the user-supplied &max=x value. For example, if the user 45// This limits the user-supplied &max=x value. For example, if the user
46// asks for 20 items to be processed (&max=20), if max_entries is set to 46// asks for 20 items to be processed (&max=20), if max_entries is set to
47// 10, only 10 will be processed. 47// 10, only 10 will be processed.
48$options->max_entries = 10; 48$options->max_entries = 10;
49 49
50// Full content
51// ----------------------
52// By default Full-Text RSS includes the extracted content in the output.
53// You can exclude this from the output by passing '&content=0' in the querystring.
54//
55// Possible values...
56// Always include: true
57// Never include: false
58// Include unless user overrides (&content=0): 'user' (default)
59//
60// Note: currently this does not disable full content extraction. It simply omits it
61// from the output.
62$options->content = 'user';
63
64// Excerpts
65// ----------------------
66// By default Full-Text RSS does not include excerpts in the output.
67// You can enable this by passing '&summary=1' in the querystring.
68// This will include a plain text excerpt from the extracted content.
69//
70// Possible values...
71// Always include: true (recommended for new users)
72// Never include: false
73// Don't include unless user overrides (&summary=1): 'user' (default)
74//
75// Important: if both content and excerpts are requested, the excerpt will be
76// placed in the description element and the full content inside content:encoded.
77// If excerpts are not requested, the full content will go inside the description element.
78//
79// Why are we not returning both excerpts and content by default?
80// Mainly for backward compatibility.
81// Excerpts should appear in the feed item's description element. Previous versions
82// of Full-Text RSS did not return excerpts, so the description element was always
83// used for the full content (as recommended by the RSS advisory). When returning both,
84// we need somewhere else to place the content (content:encoded).
85// Having both enabled should not create any problems for news readers, but it may create
86// problems for developers upgrading from one of our earlier versions who may now find
87// their applications are returning excerpts instead of the full content they were
88// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,
89// excerpts must be explicitly requested in the querystring by default.
90//
91// Why not use a different element name for excerpts?
92// According to the RSS advisory:
93// "Publishers who employ summaries should store the summary in description and
94// the full content in content:encoded, ordering description first within the item.
95// On items with no summary, the full content should be stored in description."
96// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded
97//
98// For more consistent element naming, we recommend new users set this option to true.
99// The full content can still be excluded via the querystring, but the element names
100// will not change: when $options->summary = true, the description element will always
101// be reserved for the excerpt and content:encoded always for full content.
102$options->summary = 'user';
103
50// Rewrite relative URLs 104// Rewrite relative URLs
51// ---------------------- 105// ----------------------
52// With this enabled relative URLs found in the extracted content 106// With this enabled relative URLs found in the extracted content
@@ -67,7 +121,7 @@ $options->exclude_items_on_fail = 'user';
67// Enable multi-page support 121// Enable multi-page support
68// ------------------------- 122// -------------------------
69// If enabled, we will try to follow next page links on multi-page articles. 123// If enabled, we will try to follow next page links on multi-page articles.
70// Currently this only happens for sites where next_page_link has been defined 124// Currently this only happens for sites where next_page_link has been defined
71// in a site config file. 125// in a site config file.
72$options->multipage = true; 126$options->multipage = true;
73 127
@@ -125,10 +179,10 @@ $options->detect_language = 1;
125 179
126// Registration key 180// Registration key
127// --------------- 181// ---------------
128// The registration key is optional. It is not required to use Full-Text RSS, 182// The registration key is optional. It is not required to use Full-Text RSS,
129// and does not affect the normal operation of Full-Text RSS. It is currently 183// and does not affect the normal operation of Full-Text RSS. It is currently
130// only used on admin pages which help you update site patterns with the 184// only used on admin pages which help you update site patterns with the
131// latest version offered by FiveFilters.org. For these admin-related 185// latest version offered by FiveFilters.org. For these admin-related
132// tasks to complete, we will require a valid registration key. 186// tasks to complete, we will require a valid registration key.
133// If you would like one, you can purchase the latest version of Full-Text RSS 187// If you would like one, you can purchase the latest version of Full-Text RSS
134// at http://fivefilters.org/content-only/ 188// at http://fivefilters.org/content-only/
@@ -144,12 +198,12 @@ $options->registration_key = '';
144// ---------------------- 198// ----------------------
145// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials. 199// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials.
146// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages. 200// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages.
147// If no password or username is set, pages requiring admin privileges will be inaccessible. 201// If no password or username is set, pages requiring admin privileges will be inaccessible.
148// The default username is 'admin'. 202// The default username is 'admin'.
149// If overriding with an environment variable, separate username and password with a colon, e.g.: 203// If overriding with an environment variable, separate username and password with a colon, e.g.:
150// ftr_admin_credentials: admin:my-secret-password 204// ftr_admin_credentials: admin:my-secret-password
151// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password'); 205// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');
152$options->admin_credentials = array('username'=>'admin', 'password'=>'admin'); 206$options->admin_credentials = array('username'=>'admin', 'password'=>'');
153 207
154// URLs to allow 208// URLs to allow
155// ---------------------- 209// ----------------------
@@ -178,12 +232,12 @@ $options->key_required = false;
178// ---------------------- 232// ----------------------
179// By default, when processing feeds, we assume item titles in the feed 233// By default, when processing feeds, we assume item titles in the feed
180// have not been truncated. So after processing web pages, the extracted titles 234// have not been truncated. So after processing web pages, the extracted titles
181// are not used in the generated feed. If you prefer to have extracted titles in 235// are not used in the generated feed. If you prefer to have extracted titles in
182// the feed you can either set this to false, in which case we will always favour 236// the feed you can either set this to false, in which case we will always favour
183// extracted titles. Alternatively, if set to 'user' (default) we'll use the 237// extracted titles. Alternatively, if set to 'user' (default) we'll use the
184// extracted title if you pass '&use_extracted_title' in the querystring. 238// extracted title if you pass '&use_extracted_title' in the querystring.
185// Possible values: 239// Possible values:
186// * Favour feed titles: true 240// * Favour feed titles: true
187// * Favour extracted titles: false 241// * Favour extracted titles: false
188// * Favour feed titles with user override: 'user' (default) 242// * Favour feed titles with user override: 'user' (default)
189// Note: this has no effect when the input URL is to a web page - in these cases 243// Note: this has no effect when the input URL is to a web page - in these cases
@@ -192,17 +246,17 @@ $options->favour_feed_titles = 'user';
192 246
193// Access keys (password protected access) 247// Access keys (password protected access)
194// ------------------------------------ 248// ------------------------------------
195// NOTE: You do not need an API key from fivefilters.org to run your own 249// NOTE: You do not need an API key from fivefilters.org to run your own
196// copy of the code. This is here if you'd like to restrict access to 250// copy of the code. This is here if you'd like to restrict access to
197// _your_ copy. 251// _your_ copy.
198// Keys let you group users - those with a key and those without - and 252// Keys let you group users - those with a key and those without - and
199// restrict access to the service to those without a key. 253// restrict access to the service to those without a key.
200// If you want everyone to access the service in the same way, you can 254// If you want everyone to access the service in the same way, you can
201// leave the array below empty and ignore the access key options further down. 255// leave the array below empty and ignore the access key options further down.
202// The options further down let you control how the service should behave 256// The options further down let you control how the service should behave
203// in each mode. 257// in each mode.
204// Note: Explicitly including the index number (1 and 2 in the examples below) 258// Note: Explicitly including the index number (1 and 2 in the examples below)
205// is highly recommended (when generating feeds, we encode the key and 259// is highly recommended (when generating feeds, we encode the key and
206// refer to it by index number and hash). 260// refer to it by index number and hash).
207$options->api_keys = array(); 261$options->api_keys = array();
208// Example: 262// Example:
@@ -232,13 +286,13 @@ $options->max_entries_with_key = 10;
232// filter the resulting HTML for XSS attacks, making it redundant for 286// filter the resulting HTML for XSS attacks, making it redundant for
233// Full-Text RSS to do the same. Similarly with frameworks/CMS which display 287// Full-Text RSS to do the same. Similarly with frameworks/CMS which display
234// feed content - the content should be treated like any other user-submitted content. 288// feed content - the content should be treated like any other user-submitted content.
235// 289//
236// If you are writing an application yourself which is processing feeds generated by 290// If you are writing an application yourself which is processing feeds generated by
237// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks 291// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks
238// or enable this option. This might be useful if you are processing our generated 292// or enable this option. This might be useful if you are processing our generated
239// feeds with JavaScript on the client side - although there's client side xss 293// feeds with JavaScript on the client side - although there's client side xss
240// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer 294// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer
241// 295//
242// If enabled, we'll pass retrieved HTML content through htmLawed with 296// If enabled, we'll pass retrieved HTML content through htmLawed with
243// safe flag on and style attributes denied, see 297// safe flag on and style attributes denied, see
244// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6 298// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6
@@ -253,8 +307,8 @@ $options->xss_filter = 'user';
253// Allowed parsers 307// Allowed parsers
254// ---------------------- 308// ----------------------
255// Full-Text RSS attempts to use PHP's libxml extension to process HTML. 309// Full-Text RSS attempts to use PHP's libxml extension to process HTML.
256// While fast, on some sites it may not always produce good results. 310// While fast, on some sites it may not always produce good results.
257// For these sites, you can specify an alternative HTML parser: 311// For these sites, you can specify an alternative HTML parser:
258// parser: html5lib 312// parser: html5lib
259// The html5lib parser is bundled with Full-Text RSS. 313// The html5lib parser is bundled with Full-Text RSS.
260// see http://code.google.com/p/html5lib/ 314// see http://code.google.com/p/html5lib/
@@ -273,7 +327,7 @@ $options->cors = false;
273 327
274// Use APC user cache? 328// Use APC user cache?
275// ---------------------- 329// ----------------------
276// If enabled we will store site config files (when requested 330// If enabled we will store site config files (when requested
277// for the first time) in APC's user cache. Keys prefixed with 'sc.' 331// for the first time) in APC's user cache. Keys prefixed with 'sc.'
278// This improves performance by reducing disk access. 332// This improves performance by reducing disk access.
279// Note: this has no effect if APC is unavailable on your server. 333// Note: this has no effect if APC is unavailable on your server.
@@ -346,7 +400,7 @@ $options->rewrite_url = array(
346// Valid actions: 400// Valid actions:
347// * 'exclude' - exclude this item from the result 401// * 'exclude' - exclude this item from the result
348// * 'link' - create HTML link to the item 402// * 'link' - create HTML link to the item
349$options->content_type_exc = array( 403$options->content_type_exc = array(
350 'application/pdf' => array('action'=>'link', 'name'=>'PDF'), 404 'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
351 'image' => array('action'=>'link', 'name'=>'Image'), 405 'image' => array('action'=>'link', 'name'=>'Image'),
352 'audio' => array('action'=>'link', 'name'=>'Audio'), 406 'audio' => array('action'=>'link', 'name'=>'Audio'),
@@ -375,13 +429,13 @@ $options->cache_cleanup = 100;
375/// DO NOT CHANGE ANYTHING BELOW THIS /////////// 429/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
376///////////////////////////////////////////////// 430/////////////////////////////////////////////////
377 431
378if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1'); 432if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
379 433
380if (basename(__FILE__) == 'config.php') { 434if (basename(__FILE__) == 'config.php') {
381 if (file_exists(dirname(__FILE__).'/custom_config.php')) { 435 if (file_exists(dirname(__FILE__).'/custom_config.php')) {
382 require_once dirname(__FILE__).'/custom_config.php'; 436 require_once dirname(__FILE__).'/custom_config.php';
383 } 437 }
384 438
385 // check for environment variables - often used on cloud platforms 439 // check for environment variables - often used on cloud platforms
386 // environment variables should be prefixed with 'ftr_', e.g. 440 // environment variables should be prefixed with 'ftr_', e.g.
387 // ftr_max_entries: 1 441 // ftr_max_entries: 1
diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
index ddd33bb5..21e693e7 100644
--- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
+++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
@@ -1,728 +1,727 @@
1<?php 1<?php
2/** 2/**
3 * Content Extractor 3 * Content Extractor
4 * 4 *
5 * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 5 * Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
6 * to extract content from HTML files. 6 * to extract content from HTML files.
7 * 7 *
8 * @version 1.0 8 * @version 1.0
9 * @date 2013-02-05 9 * @date 2013-02-05
10 * @author Keyvan Minoukadeh 10 * @author Keyvan Minoukadeh
11 * @copyright 2013 Keyvan Minoukadeh 11 * @copyright 2013 Keyvan Minoukadeh
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
13 */ 13 */
14 14
15class ContentExtractor 15class ContentExtractor
16{ 16{
17 protected static $tidy_config = array( 17 protected static $tidy_config = array(
18 'clean' => true, 18 'clean' => true,
19 'output-xhtml' => true, 19 'output-xhtml' => true,
20 'logical-emphasis' => true, 20 'logical-emphasis' => true,
21 'show-body-only' => false, 21 'show-body-only' => false,
22 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 22 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
23 'new-inline-tags' => 'mark, time, meter, progress, data', 23 'new-inline-tags' => 'mark, time, meter, progress, data',
24 'wrap' => 0, 24 'wrap' => 0,
25 'drop-empty-paras' => true, 25 'drop-empty-paras' => true,
26 'drop-proprietary-attributes' => false, 26 'drop-proprietary-attributes' => false,
27 'enclose-text' => true, 27 'enclose-text' => true,
28 'enclose-block-text' => true, 28 'enclose-block-text' => true,
29 'merge-divs' => true, 29 'merge-divs' => true,
30 'merge-spans' => true, 30 'merge-spans' => true,
31 'char-encoding' => 'utf8', 31 'char-encoding' => 'utf8',
32 'hide-comments' => true 32 'hide-comments' => true
33 ); 33 );
34 protected $html; 34 protected $html;
35 protected $config; 35 protected $config;
36 protected $title; 36 protected $title;
37 protected $author = array(); 37 protected $author = array();
38 protected $language; 38 protected $language;
39 protected $date; 39 protected $date;
40 protected $body; 40 protected $body;
41 protected $success = false; 41 protected $success = false;
42 protected $nextPageUrl; 42 protected $nextPageUrl;
43 public $allowedParsers = array('libxml', 'html5lib'); 43 public $allowedParsers = array('libxml', 'html5lib');
44 public $fingerprints = array(); 44 public $fingerprints = array();
45 public $readability; 45 public $readability;
46 public $debug = false; 46 public $debug = false;
47 public $debugVerbose = false; 47 public $debugVerbose = false;
48 48
49 function __construct($path, $fallback=null) { 49 function __construct($path, $fallback=null) {
50 SiteConfig::set_config_path($path, $fallback); 50 SiteConfig::set_config_path($path, $fallback);
51 } 51 }
52 52
53 protected function debug($msg) { 53 protected function debug($msg) {
54 if ($this->debug) { 54 if ($this->debug) {
55 $mem = round(memory_get_usage()/1024, 2); 55 $mem = round(memory_get_usage()/1024, 2);
56 $memPeak = round(memory_get_peak_usage()/1024, 2); 56 $memPeak = round(memory_get_peak_usage()/1024, 2);
57 echo '* ',$msg; 57 echo '* ',$msg;
58 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; 58 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
59 echo "\n"; 59 echo "\n";
60 ob_flush(); 60 ob_flush();
61 flush(); 61 flush();
62 } 62 }
63 } 63 }
64 64
65 public function reset() { 65 public function reset() {
66 $this->html = null; 66 $this->html = null;
67 $this->readability = null; 67 $this->readability = null;
68 $this->config = null; 68 $this->config = null;
69 $this->title = null; 69 $this->title = null;
70 $this->body = null; 70 $this->body = null;
71 $this->author = array(); 71 $this->author = array();
72 $this->language = null; 72 $this->language = null;
73 $this->date = null; 73 $this->date = null;
74 $this->nextPageUrl = null; 74 $this->nextPageUrl = null;
75 $this->success = false; 75 $this->success = false;
76 } 76 }
77 77
78 public function findHostUsingFingerprints($html) { 78 public function findHostUsingFingerprints($html) {
79 $this->debug('Checking fingerprints...'); 79 $this->debug('Checking fingerprints...');
80 $head = substr($html, 0, 8000); 80 $head = substr($html, 0, 8000);
81 foreach ($this->fingerprints as $_fp => $_fphost) { 81 foreach ($this->fingerprints as $_fp => $_fphost) {
82 $lookin = 'html'; 82 $lookin = 'html';
83 if (is_array($_fphost)) { 83 if (is_array($_fphost)) {
84 if (isset($_fphost['head']) && $_fphost['head']) { 84 if (isset($_fphost['head']) && $_fphost['head']) {
85 $lookin = 'head'; 85 $lookin = 'head';
86 } 86 }
87 $_fphost = $_fphost['hostname']; 87 $_fphost = $_fphost['hostname'];
88 } 88 }
89 if (strpos($$lookin, $_fp) !== false) { 89 if (strpos($$lookin, $_fp) !== false) {
90 $this->debug("Found match: $_fphost"); 90 $this->debug("Found match: $_fphost");
91 return $_fphost; 91 return $_fphost;
92 } 92 }
93 } 93 }
94 $this->debug('No fingerprint matches'); 94 $this->debug('No fingerprint matches');
95 return false; 95 return false;
96 } 96 }
97 97
98 // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) 98 // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
99 public function buildSiteConfig($url, $html='', $add_to_cache=true) { 99 public function buildSiteConfig($url, $html='', $add_to_cache=true) {
100 // extract host name 100 // extract host name
101 $host = @parse_url($url, PHP_URL_HOST); 101 $host = @parse_url($url, PHP_URL_HOST);
102 $host = strtolower($host); 102 $host = strtolower($host);
103 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); 103 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
104 // is merged version already cached? 104 // is merged version already cached?
105 if (SiteConfig::is_cached("$host.merged")) { 105 if (SiteConfig::is_cached("$host.merged")) {
106 $this->debug("Returning cached and merged site config for $host"); 106 $this->debug("Returning cached and merged site config for $host");
107 return SiteConfig::build("$host.merged"); 107 return SiteConfig::build("$host.merged");
108 } 108 }
109 // let's build from site_config/custom/ and standard/ 109 // let's build from site_config/custom/ and standard/
110 $config = SiteConfig::build($host); 110 $config = SiteConfig::build($host);
111 if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) { 111 if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
112 SiteConfig::add_to_cache($host, $config); 112 SiteConfig::add_to_cache($host, $config);
113 } 113 }
114 // if no match, use defaults 114 // if no match, use defaults
115 if (!$config) $config = new SiteConfig(); 115 if (!$config) $config = new SiteConfig();
116 // load fingerprint config? 116 // load fingerprint config?
117 if ($config->autodetect_on_failure()) { 117 if ($config->autodetect_on_failure()) {
118 // check HTML for fingerprints 118 // check HTML for fingerprints
119 if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) { 119 if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
120 if ($config_fingerprint = SiteConfig::build($_fphost)) { 120 if ($config_fingerprint = SiteConfig::build($_fphost)) {
121 $this->debug("Appending site config settings from $_fphost (fingerprint match)"); 121 $this->debug("Appending site config settings from $_fphost (fingerprint match)");
122 $config->append($config_fingerprint); 122 $config->append($config_fingerprint);
123 if ($add_to_cache && !SiteConfig::is_cached($_fphost)) { 123 if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
124 //$config_fingerprint->cache_in_apc = true; 124 //$config_fingerprint->cache_in_apc = true;
125 SiteConfig::add_to_cache($_fphost, $config_fingerprint); 125 SiteConfig::add_to_cache($_fphost, $config_fingerprint);
126 } 126 }
127 } 127 }
128 } 128 }
129 } 129 }
130 // load global config? 130 // load global config?
131 if ($config->autodetect_on_failure()) { 131 if ($config->autodetect_on_failure()) {
132 if ($config_global = SiteConfig::build('global', true)) { 132 if ($config_global = SiteConfig::build('global', true)) {
133 $this->debug('Appending site config settings from global.txt'); 133 $this->debug('Appending site config settings from global.txt');
134 $config->append($config_global); 134 $config->append($config_global);
135 if ($add_to_cache && !SiteConfig::is_cached('global')) { 135 if ($add_to_cache && !SiteConfig::is_cached('global')) {
136 //$config_global->cache_in_apc = true; 136 //$config_global->cache_in_apc = true;
137 SiteConfig::add_to_cache('global', $config_global); 137 SiteConfig::add_to_cache('global', $config_global);
138 } 138 }
139 } 139 }
140 } 140 }
141 // store copy of merged config 141 // store copy of merged config
142 if ($add_to_cache) { 142 if ($add_to_cache) {
143 // do not store in APC if wildcard match 143 // do not store in APC if wildcard match
144 $use_apc = ($host == $config->cache_key); 144 $use_apc = ($host == $config->cache_key);
145 $config->cache_key = null; 145 $config->cache_key = null;
146 SiteConfig::add_to_cache("$host.merged", $config, $use_apc); 146 SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
147 } 147 }
148 return $config; 148 return $config;
149 } 149 }
150 150
151 // returns true on success, false on failure 151 // returns true on success, false on failure
152 // $smart_tidy indicates that if tidy is used and no results are produced, we will 152 // $smart_tidy indicates that if tidy is used and no results are produced, we will
153 // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time 153 // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
154 // but it has problems of its own which we try to avoid with this option. 154 // but it has problems of its own which we try to avoid with this option.
155 public function process($html, $url, $smart_tidy=true) { 155 public function process($html, $url, $smart_tidy=true) {
156 $this->reset(); 156 $this->reset();
157 $this->config = $this->buildSiteConfig($url, $html); 157 $this->config = $this->buildSiteConfig($url, $html);
158 158
159 // do string replacements 159 // do string replacements
160 if (!empty($this->config->find_string)) { 160 if (!empty($this->config->find_string)) {
161 if (count($this->config->find_string) == count($this->config->replace_string)) { 161 if (count($this->config->find_string) == count($this->config->replace_string)) {
162 $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); 162 $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
163 $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); 163 $this->debug("Strings replaced: $_count (find_string and/or replace_string)");
164 } else { 164 } else {
165 $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); 165 $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
166 } 166 }
167 unset($_count); 167 unset($_count);
168 } 168 }
169 169
170 // use tidy (if it exists)? 170 // use tidy (if it exists)?
171 // This fixes problems with some sites which would otherwise 171 // This fixes problems with some sites which would otherwise
172 // trouble DOMDocument's HTML parsing. (Although sometimes it 172 // trouble DOMDocument's HTML parsing. (Although sometimes it
173 // makes matters worse, which is why you can override it in site config files.) 173 // makes matters worse, which is why you can override it in site config files.)
174 $tidied = false; 174 $tidied = false;
175 if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { 175 if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
176 $this->debug('Using Tidy'); 176 $this->debug('Using Tidy');
177 $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); 177 $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
178 if (tidy_clean_repair($tidy)) { 178 if (tidy_clean_repair($tidy)) {
179 $original_html = $html; 179 $original_html = $html;
180 $tidied = true; 180 $tidied = true;
181 $html = $tidy->value; 181 $html = $tidy->value;
182 } 182 }
183 unset($tidy); 183 unset($tidy);
184 } 184 }
185 185
186 // load and parse html 186 // load and parse html
187 $_parser = $this->config->parser(); 187 $_parser = $this->config->parser();
188 if (!in_array($_parser, $this->allowedParsers)) { 188 if (!in_array($_parser, $this->allowedParsers)) {
189 $this->debug("HTML parser $_parser not listed, using libxml instead"); 189 $this->debug("HTML parser $_parser not listed, using libxml instead");
190 $_parser = 'libxml'; 190 $_parser = 'libxml';
191 } 191 }
192 $this->debug("Attempting to parse HTML with $_parser"); 192 $this->debug("Attempting to parse HTML with $_parser");
193 $this->readability = new Readability($html, $url, $_parser); 193 $this->readability = new Readability($html, $url, $_parser);
194 194
195 // we use xpath to find elements in the given HTML document 195 // we use xpath to find elements in the given HTML document
196 // see http://en.wikipedia.org/wiki/XPath_1.0 196 // see http://en.wikipedia.org/wiki/XPath_1.0
197 $xpath = new DOMXPath($this->readability->dom); 197 $xpath = new DOMXPath($this->readability->dom);
198 198
199 // try to get next page link 199 // try to get next page link
200 foreach ($this->config->next_page_link as $pattern) { 200 foreach ($this->config->next_page_link as $pattern) {
201 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 201 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
202 if (is_string($elems)) { 202 if (is_string($elems)) {
203 $this->nextPageUrl = trim($elems); 203 $this->nextPageUrl = trim($elems);
204 break; 204 break;
205 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 205 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
206 foreach ($elems as $item) { 206 foreach ($elems as $item) {
207 if ($item instanceof DOMElement && $item->hasAttribute('href')) { 207 if ($item instanceof DOMElement && $item->hasAttribute('href')) {
208 $this->nextPageUrl = $item->getAttribute('href'); 208 $this->nextPageUrl = $item->getAttribute('href');
209 break 2; 209 break 2;
210 } elseif ($item instanceof DOMAttr && $item->value) { 210 } elseif ($item instanceof DOMAttr && $item->value) {
211 $this->nextPageUrl = $item->value; 211 $this->nextPageUrl = $item->value;
212 break 2; 212 break 2;
213 } 213 }
214 } 214 }
215 } 215 }
216 } 216 }
217 217
218 // try to get title 218 // try to get title
219 foreach ($this->config->title as $pattern) { 219 foreach ($this->config->title as $pattern) {
220 // $this->debug("Trying $pattern"); 220 // $this->debug("Trying $pattern");
221 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 221 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
222 if (is_string($elems)) { 222 if (is_string($elems)) {
223 $this->title = trim($elems); 223 $this->title = trim($elems);
224 $this->debug('Title expression evaluated as string: '.$this->title); 224 $this->debug('Title expression evaluated as string: '.$this->title);
225 $this->debug("...XPath match: $pattern"); 225 $this->debug("...XPath match: $pattern");
226 break; 226 break;
227 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 227 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
228 $this->title = $elems->item(0)->textContent; 228 $this->title = $elems->item(0)->textContent;
229 $this->debug('Title matched: '.$this->title); 229 $this->debug('Title matched: '.$this->title);
230 $this->debug("...XPath match: $pattern"); 230 $this->debug("...XPath match: $pattern");
231 // remove title from document 231 // remove title from document
232 try { 232 try {
233 $elems->item(0)->parentNode->removeChild($elems->item(0)); 233 @$elems->item(0)->parentNode->removeChild($elems->item(0));
234 } catch (DOMException $e) { 234 } catch (DOMException $e) {
235 // do nothing 235 // do nothing
236 } 236 }
237 break; 237 break;
238 } 238 }
239 } 239 }
240 240
241 // try to get author (if it hasn't already been set) 241 // try to get author (if it hasn't already been set)
242 if (empty($this->author)) { 242 if (empty($this->author)) {
243 foreach ($this->config->author as $pattern) { 243 foreach ($this->config->author as $pattern) {
244 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 244 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
245 if (is_string($elems)) { 245 if (is_string($elems)) {
246 if (trim($elems) != '') { 246 if (trim($elems) != '') {
247 $this->author[] = trim($elems); 247 $this->author[] = trim($elems);
248 $this->debug('Author expression evaluated as string: '.trim($elems)); 248 $this->debug('Author expression evaluated as string: '.trim($elems));
249 $this->debug("...XPath match: $pattern"); 249 $this->debug("...XPath match: $pattern");
250 break; 250 break;
251 } 251 }
252 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 252 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
253 foreach ($elems as $elem) { 253 foreach ($elems as $elem) {
254 if (!isset($elem->parentNode)) continue; 254 if (!isset($elem->parentNode)) continue;
255 $this->author[] = trim($elem->textContent); 255 $this->author[] = trim($elem->textContent);
256 $this->debug('Author matched: '.trim($elem->textContent)); 256 $this->debug('Author matched: '.trim($elem->textContent));
257 } 257 }
258 if (!empty($this->author)) { 258 if (!empty($this->author)) {
259 $this->debug("...XPath match: $pattern"); 259 $this->debug("...XPath match: $pattern");
260 break; 260 break;
261 } 261 }
262 } 262 }
263 } 263 }
264 } 264 }
265 265
266 // try to get language 266 // try to get language
267 $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); 267 $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
268 foreach ($_lang_xpath as $pattern) { 268 foreach ($_lang_xpath as $pattern) {
269 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 269 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
270 if (is_string($elems)) { 270 if (is_string($elems)) {
271 if (trim($elems) != '') { 271 if (trim($elems) != '') {
272 $this->language = trim($elems); 272 $this->language = trim($elems);
273 $this->debug('Language matched: '.$this->language); 273 $this->debug('Language matched: '.$this->language);
274 break; 274 break;
275 } 275 }
276 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 276 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
277 foreach ($elems as $elem) { 277 foreach ($elems as $elem) {
278 if (!isset($elem->parentNode)) continue; 278 if (!isset($elem->parentNode)) continue;
279 $this->language = trim($elem->textContent); 279 $this->language = trim($elem->textContent);
280 $this->debug('Language matched: '.$this->language); 280 $this->debug('Language matched: '.$this->language);
281 } 281 }
282 if ($this->language) break; 282 if ($this->language) break;
283 } 283 }
284 } 284 }
285 285
286 // try to get date 286 // try to get date
287 foreach ($this->config->date as $pattern) { 287 foreach ($this->config->date as $pattern) {
288 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 288 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
289 if (is_string($elems)) { 289 if (is_string($elems)) {
290 $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); 290 $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));
291 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 291 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
292 $this->date = $elems->item(0)->textContent; 292 $this->date = $elems->item(0)->textContent;
293 $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); 293 $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));
294 // remove date from document 294 // remove date from document
295 // $elems->item(0)->parentNode->removeChild($elems->item(0)); 295 // $elems->item(0)->parentNode->removeChild($elems->item(0));
296 } 296 }
297 if (!$this->date) { 297 if (!$this->date) {
298 $this->date = null; 298 $this->date = null;
299 } else { 299 } else {
300 $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); 300 $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));
301 $this->debug("...XPath match: $pattern"); 301 $this->debug("...XPath match: $pattern");
302 break; 302 break;
303 } 303 }
304 } 304 }
305 305
306 // strip elements (using xpath expressions) 306 // strip elements (using xpath expressions)
307 foreach ($this->config->strip as $pattern) { 307 foreach ($this->config->strip as $pattern) {
308 $elems = @$xpath->query($pattern, $this->readability->dom); 308 $elems = @$xpath->query($pattern, $this->readability->dom);
309 // check for matches 309 // check for matches
310 if ($elems && $elems->length > 0) { 310 if ($elems && $elems->length > 0) {
311 $this->debug('Stripping '.$elems->length.' elements (strip)'); 311 $this->debug('Stripping '.$elems->length.' elements (strip)');
312 for ($i=$elems->length-1; $i >= 0; $i--) { 312 for ($i=$elems->length-1; $i >= 0; $i--) {
313 $elems->item($i)->parentNode->removeChild($elems->item($i)); 313 $elems->item($i)->parentNode->removeChild($elems->item($i));
314 } 314 }
315 } 315 }
316 } 316 }
317 317
318 // strip elements (using id and class attribute values) 318 // strip elements (using id and class attribute values)
319 foreach ($this->config->strip_id_or_class as $string) { 319 foreach ($this->config->strip_id_or_class as $string) {
320 $string = strtr($string, array("'"=>'', '"'=>'')); 320 $string = strtr($string, array("'"=>'', '"'=>''));
321 $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); 321 $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
322 // check for matches 322 // check for matches
323 if ($elems && $elems->length > 0) { 323 if ($elems && $elems->length > 0) {
324 $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); 324 $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
325 for ($i=$elems->length-1; $i >= 0; $i--) { 325 for ($i=$elems->length-1; $i >= 0; $i--) {
326 $elems->item($i)->parentNode->removeChild($elems->item($i)); 326 $elems->item($i)->parentNode->removeChild($elems->item($i));
327 } 327 }
328 } 328 }
329 } 329 }
330 330
331 // strip images (using src attribute values) 331 // strip images (using src attribute values)
332 foreach ($this->config->strip_image_src as $string) { 332 foreach ($this->config->strip_image_src as $string) {
333 $string = strtr($string, array("'"=>'', '"'=>'')); 333 $string = strtr($string, array("'"=>'', '"'=>''));
334 $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); 334 $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
335 // check for matches 335 // check for matches
336 if ($elems && $elems->length > 0) { 336 if ($elems && $elems->length > 0) {
337 $this->debug('Stripping '.$elems->length.' image elements'); 337 $this->debug('Stripping '.$elems->length.' image elements');
338 for ($i=$elems->length-1; $i >= 0; $i--) { 338 for ($i=$elems->length-1; $i >= 0; $i--) {
339 $elems->item($i)->parentNode->removeChild($elems->item($i)); 339 $elems->item($i)->parentNode->removeChild($elems->item($i));
340 } 340 }
341 } 341 }
342 } 342 }
343 // strip elements using Readability.com and Instapaper.com ignore class names 343 // strip elements using Readability.com and Instapaper.com ignore class names
344 // .entry-unrelated and .instapaper_ignore 344 // .entry-unrelated and .instapaper_ignore
345 // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines 345 // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
346 // and http://blog.instapaper.com/post/730281947 346 // and http://blog.instapaper.com/post/730281947
347 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); 347 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
348 // check for matches 348 // check for matches
349 if ($elems && $elems->length > 0) { 349 if ($elems && $elems->length > 0) {
350 $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); 350 $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');
351 for ($i=$elems->length-1; $i >= 0; $i--) { 351 for ($i=$elems->length-1; $i >= 0; $i--) {
352 $elems->item($i)->parentNode->removeChild($elems->item($i)); 352 $elems->item($i)->parentNode->removeChild($elems->item($i));
353 } 353 }
354 } 354 }
355 355
356 // strip elements that contain style="display: none;" 356 // strip elements that contain style="display: none;"
357 $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); 357 $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
358 // check for matches 358 // check for matches
359 if ($elems && $elems->length > 0) { 359 if ($elems && $elems->length > 0) {
360 $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); 360 $this->debug('Stripping '.$elems->length.' elements with inline display:none style');
361 for ($i=$elems->length-1; $i >= 0; $i--) { 361 for ($i=$elems->length-1; $i >= 0; $i--) {
362 $elems->item($i)->parentNode->removeChild($elems->item($i)); 362 $elems->item($i)->parentNode->removeChild($elems->item($i));
363 } 363 }
364 } 364 }
365 365
366 // try to get body 366 // try to get body
367 foreach ($this->config->body as $pattern) { 367 foreach ($this->config->body as $pattern) {
368 $elems = @$xpath->query($pattern, $this->readability->dom); 368 $elems = @$xpath->query($pattern, $this->readability->dom);
369 // check for matches 369 // check for matches
370 if ($elems && $elems->length > 0) { 370 if ($elems && $elems->length > 0) {
371 $this->debug('Body matched'); 371 $this->debug('Body matched');
372 $this->debug("...XPath match: $pattern"); 372 $this->debug("...XPath match: $pattern");
373 if ($elems->length == 1) { 373 if ($elems->length == 1) {
374 $this->body = $elems->item(0); 374 $this->body = $elems->item(0);
375 // prune (clean up elements that may not be content) 375 // prune (clean up elements that may not be content)
376 if ($this->config->prune()) { 376 if ($this->config->prune()) {
377 $this->debug('...pruning content'); 377 $this->debug('...pruning content');
378 $this->readability->prepArticle($this->body); 378 $this->readability->prepArticle($this->body);
379 } 379 }
380 break; 380 break;
381 } else { 381 } else {
382 $this->body = $this->readability->dom->createElement('div'); 382 $this->body = $this->readability->dom->createElement('div');
383 $this->debug($elems->length.' body elems found'); 383 $this->debug($elems->length.' body elems found');
384 foreach ($elems as $elem) { 384 foreach ($elems as $elem) {
385 if (!isset($elem->parentNode)) continue; 385 if (!isset($elem->parentNode)) continue;
386 $isDescendant = false; 386 $isDescendant = false;
387 foreach ($this->body->childNodes as $parent) { 387 foreach ($this->body->childNodes as $parent) {
388 if ($this->isDescendant($parent, $elem)) { 388 if ($this->isDescendant($parent, $elem)) {
389 $isDescendant = true; 389 $isDescendant = true;
390 break; 390 break;
391 } 391 }
392 } 392 }
393 if ($isDescendant) { 393 if ($isDescendant) {
394 $this->debug('...element is child of another body element, skipping.'); 394 $this->debug('...element is child of another body element, skipping.');
395 } else { 395 } else {
396 // prune (clean up elements that may not be content) 396 // prune (clean up elements that may not be content)
397 if ($this->config->prune()) { 397 if ($this->config->prune()) {
398 $this->debug('Pruning content'); 398 $this->debug('Pruning content');
399 $this->readability->prepArticle($elem); 399 $this->readability->prepArticle($elem);
400 } 400 }
401 $this->debug('...element added to body'); 401 $this->debug('...element added to body');
402 $this->body->appendChild($elem); 402 $this->body->appendChild($elem);
403 } 403 }
404 } 404 }
405 if ($this->body->hasChildNodes()) break; 405 if ($this->body->hasChildNodes()) break;
406 } 406 }
407 } 407 }
408 } 408 }
409 409
410 // auto detect? 410 // auto detect?
411 $detect_title = $detect_body = $detect_author = $detect_date = false; 411 $detect_title = $detect_body = $detect_author = $detect_date = false;
412 // detect title? 412 // detect title?
413 if (!isset($this->title)) { 413 if (!isset($this->title)) {
414 if (empty($this->config->title) || $this->config->autodetect_on_failure()) { 414 if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
415 $detect_title = true; 415 $detect_title = true;
416 } 416 }
417 } 417 }
418 // detect body? 418 // detect body?
419 if (!isset($this->body)) { 419 if (!isset($this->body)) {
420 if (empty($this->config->body) || $this->config->autodetect_on_failure()) { 420 if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
421 $detect_body = true; 421 $detect_body = true;
422 } 422 }
423 } 423 }
424 // detect author? 424 // detect author?
425 if (empty($this->author)) { 425 if (empty($this->author)) {
426 if (empty($this->config->author) || $this->config->autodetect_on_failure()) { 426 if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
427 $detect_author = true; 427 $detect_author = true;
428 } 428 }
429 } 429 }
430 // detect date? 430 // detect date?
431 if (!isset($this->date)) { 431 if (!isset($this->date)) {
432 if (empty($this->config->date) || $this->config->autodetect_on_failure()) { 432 if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
433 $detect_date = true; 433 $detect_date = true;
434 } 434 }
435 } 435 }
436 436
437 // check for hNews 437 // check for hNews
438 if ($detect_title || $detect_body) { 438 if ($detect_title || $detect_body) {
439 // check for hentry 439 // check for hentry
440 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); 440 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
441 if ($elems && $elems->length > 0) { 441 if ($elems && $elems->length > 0) {
442 $this->debug('hNews: found hentry'); 442 $this->debug('hNews: found hentry');
443 $hentry = $elems->item(0); 443 $hentry = $elems->item(0);
444 444
445 if ($detect_title) { 445 if ($detect_title) {
446 // check for entry-title 446 // check for entry-title
447 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); 447 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
448 if ($elems && $elems->length > 0) { 448 if ($elems && $elems->length > 0) {
449 $this->title = $elems->item(0)->textContent; 449 $this->title = $elems->item(0)->textContent;
450 $this->debug('hNews: found entry-title: '.$this->title); 450 $this->debug('hNews: found entry-title: '.$this->title);
451 // remove title from document 451 // remove title from document
452 $elems->item(0)->parentNode->removeChild($elems->item(0)); 452 $elems->item(0)->parentNode->removeChild($elems->item(0));
453 $detect_title = false; 453 $detect_title = false;
454 } 454 }
455 } 455 }
456 456
457 if ($detect_date) { 457 if ($detect_date) {
458 // check for time element with pubdate attribute 458 // check for time element with pubdate attribute
459 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); 459 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
460 if ($elems && $elems->length > 0) { 460 if ($elems && $elems->length > 0) {
461 $this->date = strtotime(trim($elems->item(0)->textContent)); 461 $this->date = strtotime(trim($elems->item(0)->textContent));
462 // remove date from document 462 // remove date from document
463 //$elems->item(0)->parentNode->removeChild($elems->item(0)); 463 //$elems->item(0)->parentNode->removeChild($elems->item(0));
464 if ($this->date) { 464 if ($this->date) {
465 $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); 465 $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
466 $detect_date = false; 466 $detect_date = false;
467 } else { 467 } else {
468 $this->date = null; 468 $this->date = null;
469 } 469 }
470 } 470 }
471 } 471 }
472 472
473 if ($detect_author) { 473 if ($detect_author) {
474 // check for time element with pubdate attribute 474 // check for time element with pubdate attribute
475 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); 475 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
476 if ($elems && $elems->length > 0) { 476 if ($elems && $elems->length > 0) {
477 $author = $elems->item(0); 477 $author = $elems->item(0);
478 $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); 478 $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
479 if ($fn && $fn->length > 0) { 479 if ($fn && $fn->length > 0) {
480 foreach ($fn as $_fn) { 480 foreach ($fn as $_fn) {
481 if (trim($_fn->textContent) != '') { 481 if (trim($_fn->textContent) != '') {
482 $this->author[] = trim($_fn->textContent); 482 $this->author[] = trim($_fn->textContent);
483 $this->debug('hNews: found author: '.trim($_fn->textContent)); 483 $this->debug('hNews: found author: '.trim($_fn->textContent));
484 } 484 }
485 } 485 }
486 } else { 486 } else {
487 if (trim($author->textContent) != '') { 487 if (trim($author->textContent) != '') {
488 $this->author[] = trim($author->textContent); 488 $this->author[] = trim($author->textContent);
489 $this->debug('hNews: found author: '.trim($author->textContent)); 489 $this->debug('hNews: found author: '.trim($author->textContent));
490 } 490 }
491 } 491 }
492 $detect_author = empty($this->author); 492 $detect_author = empty($this->author);
493 } 493 }
494 } 494 }
495 495
496 // check for entry-content. 496 // check for entry-content.
497 // according to hAtom spec, if there are multiple elements marked entry-content, 497 // according to hAtom spec, if there are multiple elements marked entry-content,
498 // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content 498 // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
499 if ($detect_body) { 499 if ($detect_body) {
500 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); 500 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
501 if ($elems && $elems->length > 0) { 501 if ($elems && $elems->length > 0) {
502 $this->debug('hNews: found entry-content'); 502 $this->debug('hNews: found entry-content');
503 if ($elems->length == 1) { 503 if ($elems->length == 1) {
504 // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) 504 // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
505 $e = $elems->item(0); 505 $e = $elems->item(0);
506 if (($e->tagName == 'img') || (trim($e->textContent) != '')) { 506 if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
507 $this->body = $elems->item(0); 507 $this->body = $elems->item(0);
508 // prune (clean up elements that may not be content) 508 // prune (clean up elements that may not be content)
509 if ($this->config->prune()) { 509 if ($this->config->prune()) {
510 $this->debug('Pruning content'); 510 $this->debug('Pruning content');
511 $this->readability->prepArticle($this->body); 511 $this->readability->prepArticle($this->body);
512 } 512 }
513 $detect_body = false; 513 $detect_body = false;
514 } else { 514 } else {
515 $this->debug('hNews: skipping entry-content - appears not to contain content'); 515 $this->debug('hNews: skipping entry-content - appears not to contain content');
516 } 516 }
517 unset($e); 517 unset($e);
518 } else { 518 } else {
519 $this->body = $this->readability->dom->createElement('div'); 519 $this->body = $this->readability->dom->createElement('div');
520 $this->debug($elems->length.' entry-content elems found'); 520 $this->debug($elems->length.' entry-content elems found');
521 foreach ($elems as $elem) { 521 foreach ($elems as $elem) {
522 if (!isset($elem->parentNode)) continue; 522 if (!isset($elem->parentNode)) continue;
523 $isDescendant = false; 523 $isDescendant = false;
524 foreach ($this->body->childNodes as $parent) { 524 foreach ($this->body->childNodes as $parent) {
525 if ($this->isDescendant($parent, $elem)) { 525 if ($this->isDescendant($parent, $elem)) {
526 $isDescendant = true; 526 $isDescendant = true;
527 break; 527 break;
528 } 528 }
529 } 529 }
530 if ($isDescendant) { 530 if ($isDescendant) {
531 $this->debug('Element is child of another body element, skipping.'); 531 $this->debug('Element is child of another body element, skipping.');
532 } else { 532 } else {
533 // prune (clean up elements that may not be content) 533 // prune (clean up elements that may not be content)
534 if ($this->config->prune()) { 534 if ($this->config->prune()) {
535 $this->debug('Pruning content'); 535 $this->debug('Pruning content');
536 $this->readability->prepArticle($elem); 536 $this->readability->prepArticle($elem);
537 } 537 }
538 $this->debug('Element added to body'); 538 $this->debug('Element added to body');
539 $this->body->appendChild($elem); 539 $this->body->appendChild($elem);
540 } 540 }
541 } 541 }
542 $detect_body = false; 542 $detect_body = false;
543 } 543 }
544 } 544 }
545 } 545 }
546 } 546 }
547 } 547 }
548 548
549 // check for elements marked with instapaper_title 549 // check for elements marked with instapaper_title
550 if ($detect_title) { 550 if ($detect_title) {
551 // check for instapaper_title 551 // check for instapaper_title
552 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); 552 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
553 if ($elems && $elems->length > 0) { 553 if ($elems && $elems->length > 0) {
554 $this->title = $elems->item(0)->textContent; 554 $this->title = $elems->item(0)->textContent;
555 $this->debug('Title found (.instapaper_title): '.$this->title); 555 $this->debug('Title found (.instapaper_title): '.$this->title);
556 // remove title from document 556 // remove title from document
557 $elems->item(0)->parentNode->removeChild($elems->item(0)); 557 $elems->item(0)->parentNode->removeChild($elems->item(0));
558 $detect_title = false; 558 $detect_title = false;
559 } 559 }
560 } 560 }
561 // check for elements marked with instapaper_body 561 // check for elements marked with instapaper_body
562 if ($detect_body) { 562 if ($detect_body) {
563 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); 563 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
564 if ($elems && $elems->length > 0) { 564 if ($elems && $elems->length > 0) {
565 $this->debug('body found (.instapaper_body)'); 565 $this->debug('body found (.instapaper_body)');
566 $this->body = $elems->item(0); 566 $this->body = $elems->item(0);
567 // prune (clean up elements that may not be content) 567 // prune (clean up elements that may not be content)
568 if ($this->config->prune()) { 568 if ($this->config->prune()) {
569 $this->debug('Pruning content'); 569 $this->debug('Pruning content');
570 $this->readability->prepArticle($this->body); 570 $this->readability->prepArticle($this->body);
571 } 571 }
572 $detect_body = false; 572 $detect_body = false;
573 } 573 }
574 } 574 }
575 575
576 // Find author in rel="author" marked element 576 // Find author in rel="author" marked element
577 // We only use this if there's exactly one. 577 // We only use this if there's exactly one.
578 // If there's more than one, it could indicate more than 578 // If there's more than one, it could indicate more than
579 // one author, but it could also indicate that we're processing 579 // one author, but it could also indicate that we're processing
580 // a page listing different articles with different authors. 580 // a page listing different articles with different authors.
581 if ($detect_author) { 581 if ($detect_author) {
582 $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); 582 $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
583 if ($elems && $elems->length == 1) { 583 if ($elems && $elems->length == 1) {
584 $author = trim($elems->item(0)->textContent); 584 $author = trim($elems->item(0)->textContent);
585 if ($author != '') { 585 if ($author != '') {
586 $this->debug("Author found (rel=\"author\"): $author"); 586 $this->debug("Author found (rel=\"author\"): $author");
587 $this->author[] = $author; 587 $this->author[] = $author;
588 $detect_author = false; 588 $detect_author = false;
589 } 589 }
590 } 590 }
591 } 591 }
592 592
593 // Find date in pubdate marked time element 593 // Find date in pubdate marked time element
594 // For the same reason given above, we only use this 594 // For the same reason given above, we only use this
595 // if there's exactly one element. 595 // if there's exactly one element.
596 if ($detect_date) { 596 if ($detect_date) {
597 $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); 597 $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
598 if ($elems && $elems->length == 1) { 598 if ($elems && $elems->length == 1) {
599 $this->date = strtotime(trim($elems->item(0)->textContent)); 599 $this->date = strtotime(trim($elems->item(0)->textContent));
600 // remove date from document 600 // remove date from document
601 //$elems->item(0)->parentNode->removeChild($elems->item(0)); 601 //$elems->item(0)->parentNode->removeChild($elems->item(0));
602 if ($this->date) { 602 if ($this->date) {
603 $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); 603 $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
604 $detect_date = false; 604 $detect_date = false;
605 } else { 605 } else {
606 $this->date = null; 606 $this->date = null;
607 } 607 }
608 } 608 }
609 } 609 }
610 610
611 // still missing title or body, so we detect using Readability 611 // still missing title or body, so we detect using Readability
612 if ($detect_title || $detect_body) { 612 if ($detect_title || $detect_body) {
613 $this->debug('Using Readability'); 613 $this->debug('Using Readability');
614 // clone body if we're only using Readability for title (otherwise it may interfere with body element) 614 // clone body if we're only using Readability for title (otherwise it may interfere with body element)
615 if (isset($this->body)) $this->body = $this->body->cloneNode(true); 615 if (isset($this->body)) $this->body = $this->body->cloneNode(true);
616 $success = $this->readability->init(); 616 $success = $this->readability->init();
617 } 617 }
618 if ($detect_title) { 618 if ($detect_title) {
619 $this->debug('Detecting title'); 619 $this->debug('Detecting title');
620 $this->title = $this->readability->getTitle()->textContent; 620 $this->title = $this->readability->getTitle()->textContent;
621 } 621 }
622 if ($detect_body && $success) { 622 if ($detect_body && $success) {
623 $this->debug('Detecting body'); 623 $this->debug('Detecting body');
624 $this->body = $this->readability->getContent(); 624 $this->body = $this->readability->getContent();
625 if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { 625 if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
626 $this->body = $this->body->firstChild; 626 $this->body = $this->body->firstChild;
627 } 627 }
628 // prune (clean up elements that may not be content) 628 // prune (clean up elements that may not be content)
629 if ($this->config->prune()) { 629 if ($this->config->prune()) {
630 $this->debug('Pruning content'); 630 $this->debug('Pruning content');
631 $this->readability->prepArticle($this->body); 631 $this->readability->prepArticle($this->body);
632 } 632 }
633 } 633 }
634 if (isset($this->body)) { 634 if (isset($this->body)) {
635 // remove scripts 635 // remove scripts
636 $this->readability->removeScripts($this->body); 636 $this->readability->removeScripts($this->body);
637 // remove any h1-h6 elements that appear as first thing in the body 637 // remove any h1-h6 elements that appear as first thing in the body
638 // and which match our title 638 // and which match our title
639 if (isset($this->title) && ($this->title != '')) { 639 if (isset($this->title) && ($this->title != '')) {
640 $firstChild = $this->body->firstChild; 640 $firstChild = $this->body->firstChild;
641 while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { 641 while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
642 $firstChild = $firstChild->nextSibling; 642 $firstChild = $firstChild->nextSibling;
643 } 643 }
644 if (($firstChild->nodeType === XML_ELEMENT_NODE) 644 if (($firstChild->nodeType === XML_ELEMENT_NODE)
645 && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) 645 && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
646 && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { 646 && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {
647 $this->body->removeChild($firstChild); 647 $this->body->removeChild($firstChild);
648 } 648 }
649 } 649 }
650 // prevent self-closing iframes 650 // prevent self-closing iframes
651 $elems = $this->body->getElementsByTagName('iframe'); 651 $elems = $this->body->getElementsByTagName('iframe');
652 for ($i = $elems->length-1; $i >= 0; $i--) { 652 for ($i = $elems->length-1; $i >= 0; $i--) {
653 $e = $elems->item($i); 653 $e = $elems->item($i);
654 if (!$e->hasChildNodes()) { 654 if (!$e->hasChildNodes()) {
655 $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); 655 $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
656 } 656 }
657 } 657 }
658 // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ 658 // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
659 // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src 659 // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
660 // inside the data-lazy-src attribute. It also places the original image inside a noscript element 660 // inside the data-lazy-src attribute. It also places the original image inside a noscript element
661 // next to the amended one. 661 // next to the amended one.
662 $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); 662 $elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
663 for ($i = $elems->length-1; $i >= 0; $i--) { 663 for ($i = $elems->length-1; $i >= 0; $i--) {
664 $e = $elems->item($i); 664 $e = $elems->item($i);
665 // let's see if we can grab image from noscript 665 // let's see if we can grab image from noscript
666 if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { 666 if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
667 $_new_elem = $e->ownerDocument->createDocumentFragment(); 667 $_new_elem = $e->ownerDocument->createDocumentFragment();
668 @$_new_elem->appendXML($e->nextSibling->innerHTML); 668 @$_new_elem->appendXML($e->nextSibling->innerHTML);
669 $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); 669 $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
670 $e->parentNode->removeChild($e); 670 $e->parentNode->removeChild($e);
671 } else { 671 } else {
672 // Use data-lazy-src as src value 672 // Use data-lazy-src as src value
673 $e->setAttribute('src', $e->getAttribute('data-lazy-src')); 673 $e->setAttribute('src', $e->getAttribute('data-lazy-src'));
674 $e->removeAttribute('data-lazy-src'); 674 $e->removeAttribute('data-lazy-src');
675 } 675 }
676 } 676 }
677 677
678 $this->success = true; 678 $this->success = true;
679 } 679 }
680 680
681 // if we've had no success and we've used tidy, there's a chance 681 // if we've had no success and we've used tidy, there's a chance
682 // that tidy has messed up. So let's try again without tidy... 682 // that tidy has messed up. So let's try again without tidy...
683 if (!$this->success && $tidied && $smart_tidy) { 683 if (!$this->success && $tidied && $smart_tidy) {
684 $this->debug('Trying again without tidy'); 684 $this->debug('Trying again without tidy');
685 $this->process($original_html, $url, false); 685 $this->process($original_html, $url, false);
686 } 686 }
687 687
688 return $this->success; 688 return $this->success;
689 } 689 }
690 690
691 private function isDescendant(DOMElement $parent, DOMElement $child) { 691 private function isDescendant(DOMElement $parent, DOMElement $child) {
692 $node = $child->parentNode; 692 $node = $child->parentNode;
693 while ($node != null) { 693 while ($node != null) {
694 if ($node->isSameNode($parent)) return true; 694 if ($node->isSameNode($parent)) return true;
695 $node = $node->parentNode; 695 $node = $node->parentNode;
696 } 696 }
697 return false; 697 return false;
698 } 698 }
699 699
700 public function getContent() { 700 public function getContent() {
701 return $this->body; 701 return $this->body;
702 } 702 }
703 703
704 public function getTitle() { 704 public function getTitle() {
705 return $this->title; 705 return $this->title;
706 } 706 }
707 707
708 public function getAuthors() { 708 public function getAuthors() {
709 return $this->author; 709 return $this->author;
710 } 710 }
711 711
712 public function getLanguage() { 712 public function getLanguage() {
713 return $this->language; 713 return $this->language;
714 } 714 }
715 715
716 public function getDate() { 716 public function getDate() {
717 return $this->date; 717 return $this->date;
718 } 718 }
719 719
720 public function getSiteConfig() { 720 public function getSiteConfig() {
721 return $this->config; 721 return $this->config;
722 } 722 }
723 723
724 public function getNextPageUrl() { 724 public function getNextPageUrl() {
725 return $this->nextPageUrl; 725 return $this->nextPageUrl;
726 } 726 }
727} 727} \ No newline at end of file
728?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
index c5e300d7..1f6a7603 100644
--- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php
+++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
@@ -1,338 +1,343 @@
1<?php 1<?php
2/** 2/**
3 * Site Config 3 * Site Config
4 * 4 *
5 * Each instance of this class should hold extraction patterns and other directives 5 * Each instance of this class should hold extraction patterns and other directives
6 * for a website. See ContentExtractor class to see how it's used. 6 * for a website. See ContentExtractor class to see how it's used.
7 * 7 *
8 * @version 0.7 8 * @version 0.8
9 * @date 2012-08-27 9 * @date 2013-04-16
10 * @author Keyvan Minoukadeh 10 * @author Keyvan Minoukadeh
11 * @copyright 2012 Keyvan Minoukadeh 11 * @copyright 2013 Keyvan Minoukadeh
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
13 */ 13 */
14 14
15class SiteConfig 15class SiteConfig
16{ 16{
17 // Use first matching element as title (0 or more xpath expressions) 17 // Use first matching element as title (0 or more xpath expressions)
18 public $title = array(); 18 public $title = array();
19 19
20 // Use first matching element as body (0 or more xpath expressions) 20 // Use first matching element as body (0 or more xpath expressions)
21 public $body = array(); 21 public $body = array();
22 22
23 // Use first matching element as author (0 or more xpath expressions) 23 // Use first matching element as author (0 or more xpath expressions)
24 public $author = array(); 24 public $author = array();
25 25
26 // Use first matching element as date (0 or more xpath expressions) 26 // Use first matching element as date (0 or more xpath expressions)
27 public $date = array(); 27 public $date = array();
28 28
29 // Strip elements matching these xpath expressions (0 or more) 29 // Strip elements matching these xpath expressions (0 or more)
30 public $strip = array(); 30 public $strip = array();
31 31
32 // Strip elements which contain these strings (0 or more) in the id or class attribute 32 // Strip elements which contain these strings (0 or more) in the id or class attribute
33 public $strip_id_or_class = array(); 33 public $strip_id_or_class = array();
34 34
35 // Strip images which contain these strings (0 or more) in the src attribute 35 // Strip images which contain these strings (0 or more) in the src attribute
36 public $strip_image_src = array(); 36 public $strip_image_src = array();
37 37
38 // Additional HTTP headers to send 38 // Additional HTTP headers to send
39 // NOT YET USED 39 // NOT YET USED
40 public $http_header = array(); 40 public $http_header = array();
41 41
42 // Process HTML with tidy before creating DOM (bool or null if undeclared) 42 // Process HTML with tidy before creating DOM (bool or null if undeclared)
43 public $tidy = null; 43 public $tidy = null;
44 44
45 protected $default_tidy = true; // used if undeclared 45 protected $default_tidy = true; // used if undeclared
46 46
47 // Autodetect title/body if xpath expressions fail to produce results. 47 // Autodetect title/body if xpath expressions fail to produce results.
48 // Note that this applies to title and body separately, ie. 48 // Note that this applies to title and body separately, ie.
49 // * if we get a body match but no title match, this option will determine whether we autodetect title 49 // * if we get a body match but no title match, this option will determine whether we autodetect title
50 // * if neither match, this determines whether we autodetect title and body. 50 // * if neither match, this determines whether we autodetect title and body.
51 // Also note that this only applies when there is at least one xpath expression in title or body, ie. 51 // Also note that this only applies when there is at least one xpath expression in title or body, ie.
52 // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) 52 // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
53 // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. 53 // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
54 // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). 54 // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
55 // bool or null if undeclared 55 // bool or null if undeclared
56 public $autodetect_on_failure = null; 56 public $autodetect_on_failure = null;
57 protected $default_autodetect_on_failure = true; // used if undeclared 57 protected $default_autodetect_on_failure = true; // used if undeclared
58 58
59 // Clean up content block - attempt to remove elements that appear to be superfluous 59 // Clean up content block - attempt to remove elements that appear to be superfluous
60 // bool or null if undeclared 60 // bool or null if undeclared
61 public $prune = null; 61 public $prune = null;
62 protected $default_prune = true; // used if undeclared 62 protected $default_prune = true; // used if undeclared
63 63
64 // Test URL - if present, can be used to test the config above 64 // Test URL - if present, can be used to test the config above
65 public $test_url = array(); 65 public $test_url = array();
66 66
67 // Single-page link - should identify a link element or URL pointing to the page holding the entire article 67 // Single-page link - should identify a link element or URL pointing to the page holding the entire article
68 // This is useful for sites which split their articles across multiple pages. Links to such pages tend to 68 // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
69 // display the first page with links to the other pages at the bottom. Often there is also a link to a page 69 // display the first page with links to the other pages at the bottom. Often there is also a link to a page
70 // which displays the entire article on one page (e.g. 'print view'). 70 // which displays the entire article on one page (e.g. 'print view').
71 // This should be an XPath expression identifying the link to that page. If present and we find a match, 71 // This should be an XPath expression identifying the link to that page. If present and we find a match,
72 // we will retrieve that page and the rest of the options in this config will be applied to the new page. 72 // we will retrieve that page and the rest of the options in this config will be applied to the new page.
73 public $single_page_link = array(); 73 public $single_page_link = array();
74 74
75 public $next_page_link = array(); 75 public $next_page_link = array();
76 76
77 // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed 77 // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
78 public $single_page_link_in_feed = array(); 78 public $single_page_link_in_feed = array();
79 79
80 // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') 80 // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
81 // string or null if undeclared 81 // string or null if undeclared
82 public $parser = null; 82 public $parser = null;
83 protected $default_parser = 'libxml'; // used if undeclared 83 protected $default_parser = 'libxml'; // used if undeclared
84 84
85 // Strings to search for in HTML before processing begins (used with $replace_string) 85 // Strings to search for in HTML before processing begins (used with $replace_string)
86 public $find_string = array(); 86 public $find_string = array();
87 // Strings to replace those found in $find_string before HTML processing begins 87 // Strings to replace those found in $find_string before HTML processing begins
88 public $replace_string = array(); 88 public $replace_string = array();
89 89
90 // the options below cannot be set in the config files which this class represents 90 // the options below cannot be set in the config files which this class represents
91 91
92 //public $cache_in_apc = false; // used to decide if we should cache in apc or not 92 //public $cache_in_apc = false; // used to decide if we should cache in apc or not
93 public $cache_key = null; 93 public $cache_key = null;
94 public static $debug = false; 94 public static $debug = false;
95 protected static $apc = false; 95 protected static $apc = false;
96 protected static $config_path; 96 protected static $config_path;
97 protected static $config_path_fallback; 97 protected static $config_path_fallback;
98 protected static $config_cache = array(); 98 protected static $config_cache = array();
99 const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; 99 const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
100 100
101 protected static function debug($msg) { 101 protected static function debug($msg) {
102 if (self::$debug) { 102 if (self::$debug) {
103 //$mem = round(memory_get_usage()/1024, 2); 103 //$mem = round(memory_get_usage()/1024, 2);
104 //$memPeak = round(memory_get_peak_usage()/1024, 2); 104 //$memPeak = round(memory_get_peak_usage()/1024, 2);
105 echo '* ',$msg; 105 echo '* ',$msg;
106 //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; 106 //echo ' - mem used: ',$mem," (peak: $memPeak)\n";
107 echo "\n"; 107 echo "\n";
108 ob_flush(); 108 ob_flush();
109 flush(); 109 flush();
110 } 110 }
111 } 111 }
112 112
113 // enable APC caching of certain site config files? 113 // enable APC caching of certain site config files?
114 // If enabled the following site config files will be 114 // If enabled the following site config files will be
115 // cached in APC cache (when requested for first time): 115 // cached in APC cache (when requested for first time):
116 // * anything in site_config/custom/ and its corresponding file in site_config/standard/ 116 // * anything in site_config/custom/ and its corresponding file in site_config/standard/
117 // * the site config files associated with HTML fingerprints 117 // * the site config files associated with HTML fingerprints
118 // * the global site config file 118 // * the global site config file
119 // returns true if enabled, false otherwise 119 // returns true if enabled, false otherwise
120 public static function use_apc($apc=true) { 120 public static function use_apc($apc=true) {
121 if (!function_exists('apc_add')) { 121 if (!function_exists('apc_add')) {
122 if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); 122 if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
123 return false; 123 return false;
124 } 124 }
125 self::$apc = $apc; 125 self::$apc = $apc;
126 return $apc; 126 return $apc;
127 } 127 }
128 128
129 // return bool or null 129 // return bool or null
130 public function tidy($use_default=true) { 130 public function tidy($use_default=true) {
131 if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; 131 if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
132 return $this->tidy; 132 return $this->tidy;
133 } 133 }
134 134
135 // return bool or null 135 // return bool or null
136 public function prune($use_default=true) { 136 public function prune($use_default=true) {
137 if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; 137 if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;
138 return $this->prune; 138 return $this->prune;
139 } 139 }
140 140
141 // return string or null 141 // return string or null
142 public function parser($use_default=true) { 142 public function parser($use_default=true) {
143 if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; 143 if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;
144 return $this->parser; 144 return $this->parser;
145 } 145 }
146 146
147 // return bool or null 147 // return bool or null
148 public function autodetect_on_failure($use_default=true) { 148 public function autodetect_on_failure($use_default=true) {
149 if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; 149 if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;
150 return $this->autodetect_on_failure; 150 return $this->autodetect_on_failure;
151 } 151 }
152 152
153 public static function set_config_path($path, $fallback=null) { 153 public static function set_config_path($path, $fallback=null) {
154 self::$config_path = $path; 154 self::$config_path = $path;
155 self::$config_path_fallback = $fallback; 155 self::$config_path_fallback = $fallback;
156 } 156 }
157 157
158 public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { 158 public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
159 $key = strtolower($key); 159 $key = strtolower($key);
160 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); 160 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
161 if ($config->cache_key) $key = $config->cache_key; 161 if ($config->cache_key) $key = $config->cache_key;
162 self::$config_cache[$key] = $config; 162 self::$config_cache[$key] = $config;
163 if (self::$apc && $use_apc) { 163 if (self::$apc && $use_apc) {
164 self::debug("Adding site config to APC cache with key sc.$key"); 164 self::debug("Adding site config to APC cache with key sc.$key");
165 apc_add("sc.$key", $config); 165 apc_add("sc.$key", $config);
166 } 166 }
167 self::debug("Cached site config with key $key"); 167 self::debug("Cached site config with key $key");
168 } 168 }
169 169
170 public static function is_cached($key) { 170 public static function is_cached($key) {
171 $key = strtolower($key); 171 $key = strtolower($key);
172 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); 172 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
173 if (array_key_exists($key, self::$config_cache)) { 173 if (array_key_exists($key, self::$config_cache)) {
174 return true; 174 return true;
175 } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { 175 } elseif (self::$apc && (bool)apc_fetch("sc.$key")) {
176 return true; 176 return true;
177 } 177 }
178 return false; 178 return false;
179 } 179 }
180 180
181 public function append(SiteConfig $newconfig) { 181 public function append(SiteConfig $newconfig) {
182 // check for commands where we accept multiple statements (no test_url) 182 // check for commands where we accept multiple statements (no test_url)
183 foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { 183 foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
184 // append array elements for this config variable from $newconfig to this config 184 // append array elements for this config variable from $newconfig to this config
185 //$this->$var = $this->$var + $newconfig->$var; 185 //$this->$var = $this->$var + $newconfig->$var;
186 $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); 186 $this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
187 } 187 }
188 // check for single statement commands 188 // check for single statement commands
189 // we do not overwrite existing non null values 189 // we do not overwrite existing non null values
190 foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { 190 foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
191 if ($this->$var === null) $this->$var = $newconfig->$var; 191 if ($this->$var === null) $this->$var = $newconfig->$var;
192 } 192 }
193 } 193 // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
194 194 foreach (array('find_string', 'replace_string') as $var) {
195 // returns SiteConfig instance if an appropriate one is found, false otherwise 195 // append array elements for this config variable from $newconfig to this config
196 // if $exact_host_match is true, we will not look for wildcard config matches 196 //$this->$var = $this->$var + $newconfig->$var;
197 // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists 197 $this->$var = array_merge($this->$var, $newconfig->$var);
198 public static function build($host, $exact_host_match=false) { 198 }
199 $host = strtolower($host); 199 }
200 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); 200
201 if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; 201 // returns SiteConfig instance if an appropriate one is found, false otherwise
202 // check for site configuration 202 // if $exact_host_match is true, we will not look for wildcard config matches
203 $try = array($host); 203 // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
204 // should we look for wildcard matches 204 public static function build($host, $exact_host_match=false) {
205 if (!$exact_host_match) { 205 $host = strtolower($host);
206 $split = explode('.', $host); 206 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
207 if (count($split) > 1) { 207 if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
208 array_shift($split); 208 // check for site configuration
209 $try[] = '.'.implode('.', $split); 209 $try = array($host);
210 } 210 // should we look for wildcard matches
211 } 211 if (!$exact_host_match) {
212 212 $split = explode('.', $host);
213 // look for site config file in primary folder 213 if (count($split) > 1) {
214 self::debug(". looking for site config for $host in primary folder"); 214 array_shift($split);
215 foreach ($try as $h) { 215 $try[] = '.'.implode('.', $split);
216 if (array_key_exists($h, self::$config_cache)) { 216 }
217 self::debug("... site config for $h already loaded in this request"); 217 }
218 return self::$config_cache[$h]; 218
219 } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { 219 // look for site config file in primary folder
220 self::debug("... site config for $h in APC cache"); 220 self::debug(". looking for site config for $host in primary folder");
221 return $sconfig; 221 foreach ($try as $h) {
222 } elseif (file_exists(self::$config_path."/$h.txt")) { 222 if (array_key_exists($h, self::$config_cache)) {
223 self::debug("... found site config ($h.txt)"); 223 self::debug("... site config for $h already loaded in this request");
224 $file_primary = self::$config_path."/$h.txt"; 224 return self::$config_cache[$h];
225 $matched_name = $h; 225 } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
226 break; 226 self::debug("... site config for $h in APC cache");
227 } 227 return $sconfig;
228 } 228 } elseif (file_exists(self::$config_path."/$h.txt")) {
229 229 self::debug("... found site config ($h.txt)");
230 // if we found site config, process it 230 $file_primary = self::$config_path."/$h.txt";
231 if (isset($file_primary)) { 231 $matched_name = $h;
232 $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 232 break;
233 if (!$config_lines || !is_array($config_lines)) return false; 233 }
234 $config = self::build_from_array($config_lines); 234 }
235 // if APC caching is available and enabled, mark this for cache 235
236 //$config->cache_in_apc = true; 236 // if we found site config, process it
237 $config->cache_key = $matched_name; 237 if (isset($file_primary)) {
238 238 $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
239 // if autodetec on failure is off (on by default) we do not need to look 239 if (!$config_lines || !is_array($config_lines)) return false;
240 // in secondary folder 240 $config = self::build_from_array($config_lines);
241 if (!$config->autodetect_on_failure()) { 241 // if APC caching is available and enabled, mark this for cache
242 self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); 242 //$config->cache_in_apc = true;
243 return $config; 243 $config->cache_key = $matched_name;
244 } 244
245 } 245 // if autodetec on failure is off (on by default) we do not need to look
246 246 // in secondary folder
247 // look for site config file in secondary folder 247 if (!$config->autodetect_on_failure()) {
248 if (isset(self::$config_path_fallback)) { 248 self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
249 self::debug(". looking for site config for $host in secondary folder"); 249 return $config;
250 foreach ($try as $h) { 250 }
251 if (file_exists(self::$config_path_fallback."/$h.txt")) { 251 }
252 self::debug("... found site config in secondary folder ($h.txt)"); 252
253 $file_secondary = self::$config_path_fallback."/$h.txt"; 253 // look for site config file in secondary folder
254 $matched_name = $h; 254 if (isset(self::$config_path_fallback)) {
255 break; 255 self::debug(". looking for site config for $host in secondary folder");
256 } 256 foreach ($try as $h) {
257 } 257 if (file_exists(self::$config_path_fallback."/$h.txt")) {
258 if (!isset($file_secondary)) { 258 self::debug("... found site config in secondary folder ($h.txt)");
259 self::debug("... no site config match in secondary folder"); 259 $file_secondary = self::$config_path_fallback."/$h.txt";
260 } 260 $matched_name = $h;
261 } 261 break;
262 262 }
263 // return false if no config file found 263 }
264 if (!isset($file_primary) && !isset($file_secondary)) { 264 if (!isset($file_secondary)) {
265 self::debug("... no site config match for $host"); 265 self::debug("... no site config match in secondary folder");
266 return false; 266 }
267 } 267 }
268 268
269 // return primary config if secondary not found 269 // return false if no config file found
270 if (!isset($file_secondary) && isset($config)) { 270 if (!isset($file_primary) && !isset($file_secondary)) {
271 return $config; 271 self::debug("... no site config match for $host");
272 } 272 return false;
273 273 }
274 // process secondary config file 274
275 $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 275 // return primary config if secondary not found
276 if (!$config_lines || !is_array($config_lines)) { 276 if (!isset($file_secondary) && isset($config)) {
277 // failed to process secondary 277 return $config;
278 if (isset($config)) { 278 }
279 // return primary config 279
280 return $config; 280 // process secondary config file
281 } else { 281 $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
282 return false; 282 if (!$config_lines || !is_array($config_lines)) {
283 } 283 // failed to process secondary
284 } 284 if (isset($config)) {
285 285 // return primary config
286 // merge with primary and return 286 return $config;
287 if (isset($config)) { 287 } else {
288 self::debug('. merging config files'); 288 return false;
289 $config->append(self::build_from_array($config_lines)); 289 }
290 return $config; 290 }
291 } else { 291
292 // return just secondary 292 // merge with primary and return
293 $config = self::build_from_array($config_lines); 293 if (isset($config)) {
294 // if APC caching is available and enabled, mark this for cache 294 self::debug('. merging config files');
295 //$config->cache_in_apc = true; 295 $config->append(self::build_from_array($config_lines));
296 $config->cache_key = $matched_name; 296 return $config;
297 return $config; 297 } else {
298 } 298 // return just secondary
299 } 299 $config = self::build_from_array($config_lines);
300 300 // if APC caching is available and enabled, mark this for cache
301 public static function build_from_array(array $lines) { 301 //$config->cache_in_apc = true;
302 $config = new SiteConfig(); 302 $config->cache_key = $matched_name;
303 foreach ($lines as $line) { 303 return $config;
304 $line = trim($line); 304 }
305 305 }
306 // skip comments, empty lines 306
307 if ($line == '' || $line[0] == '#') continue; 307 public static function build_from_array(array $lines) {
308 308 $config = new SiteConfig();
309 // get command 309 foreach ($lines as $line) {
310 $command = explode(':', $line, 2); 310 $line = trim($line);
311 // if there's no colon ':', skip this line 311
312 if (count($command) != 2) continue; 312 // skip comments, empty lines
313 $val = trim($command[1]); 313 if ($line == '' || $line[0] == '#') continue;
314 $command = trim($command[0]); 314
315 if ($command == '' || $val == '') continue; 315 // get command
316 316 $command = explode(':', $line, 2);
317 // check for commands where we accept multiple statements 317 // if there's no colon ':', skip this line
318 if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { 318 if (count($command) != 2) continue;
319 array_push($config->$command, $val); 319 $val = trim($command[1]);
320 // check for single statement commands that evaluate to true or false 320 $command = trim($command[0]);
321 } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { 321 if ($command == '' || $val == '') continue;
322 $config->$command = ($val == 'yes'); 322
323 // check for single statement commands stored as strings 323 // check for commands where we accept multiple statements
324 } elseif (in_array($command, array('parser'))) { 324 if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
325 $config->$command = $val; 325 array_push($config->$command, $val);
326 // check for replace_string(find): replace 326 // check for single statement commands that evaluate to true or false
327 } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { 327 } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
328 if (in_array($match[1], array('replace_string'))) { 328 $config->$command = ($val == 'yes');
329 $command = $match[1]; 329 // check for single statement commands stored as strings
330 array_push($config->find_string, $match[2]); 330 } elseif (in_array($command, array('parser'))) {
331 array_push($config->$command, $val); 331 $config->$command = $val;
332 } 332 // check for replace_string(find): replace
333 } 333 } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
334 } 334 if (in_array($match[1], array('replace_string'))) {
335 return $config; 335 $command = $match[1];
336 } 336 array_push($config->find_string, $match[2]);
337} 337 array_push($config->$command, $val);
338?> \ No newline at end of file 338 }
339 }
340 }
341 return $config;
342 }
343} \ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php
index 54a56f22..40786598 100644..100755
--- a/inc/3rdparty/libraries/feedwriter/FeedItem.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php
@@ -1,7 +1,7 @@
1<?php 1<?php
2 /** 2 /**
3 * Univarsel Feed Writer 3 * Univarsel Feed Writer
4 * 4 *
5 * FeedItem class - Used as feed element in FeedWriter class 5 * FeedItem class - Used as feed element in FeedWriter class
6 * 6 *
7 * @package UnivarselFeedWriter 7 * @package UnivarselFeedWriter
@@ -12,20 +12,20 @@
12 { 12 {
13 private $elements = array(); //Collection of feed elements 13 private $elements = array(); //Collection of feed elements
14 private $version; 14 private $version;
15 15
16 /** 16 /**
17 * Constructor 17 * Constructor
18 * 18 *
19 * @param contant (RSS1/RSS2/ATOM) RSS2 is default. 19 * @param contant (RSS1/RSS2/ATOM) RSS2 is default.
20 */ 20 */
function __construct($version = RSS2)
{
    // Remember which feed format this item belongs to; the wrapper
    // setters of this class branch on this value to choose tag names.
    $this->version = $version;
}
25 25
26 /** 26 /**
27 * Set element (overwrites existing elements with $elementName) 27 * Set element (overwrites existing elements with $elementName)
28 * 28 *
29 * @access public 29 * @access public
30 * @param srting The tag name of an element 30 * @param srting The tag name of an element
31 * @param srting The content of tag 31 * @param srting The content of tag
@@ -38,11 +38,11 @@
38 unset($this->elements[$elementName]); 38 unset($this->elements[$elementName]);
39 } 39 }
40 $this->addElement($elementName, $content, $attributes); 40 $this->addElement($elementName, $content, $attributes);
41 } 41 }
42 42
43 /** 43 /**
44 * Add an element to elements array 44 * Add an element to elements array
45 * 45 *
46 * @access public 46 * @access public
47 * @param srting The tag name of an element 47 * @param srting The tag name of an element
48 * @param srting The content of tag 48 * @param srting The content of tag
@@ -61,11 +61,11 @@
61 $this->elements[$elementName][$i]['content'] = $content; 61 $this->elements[$elementName][$i]['content'] = $content;
62 $this->elements[$elementName][$i]['attributes'] = $attributes; 62 $this->elements[$elementName][$i]['attributes'] = $attributes;
63 } 63 }
64 64
65 /** 65 /**
66 * Set multiple feed elements from an array. 66 * Set multiple feed elements from an array.
67 * Elements which have attributes cannot be added by this method 67 * Elements which have attributes cannot be added by this method
68 * 68 *
69 * @access public 69 * @access public
70 * @param array array of elements in 'tagName' => 'tagContent' format. 70 * @param array array of elements in 'tagName' => 'tagContent' format.
71 * @return void 71 * @return void
@@ -73,15 +73,15 @@
public function addElementArray($elementArray)
{
    // Anything that is not an array is silently ignored.
    if (is_array($elementArray)) {
        // Each key is a tag name, each value that tag's content;
        // attributes cannot be supplied through this method.
        foreach ($elementArray as $tagName => $tagContent) {
            $this->addElement($tagName, $tagContent);
        }
    }
}
81 81
82 /** 82 /**
83 * Return the collection of elements in this feed item 83 * Return the collection of elements in this feed item
84 * 84 *
85 * @access public 85 * @access public
86 * @return array 86 * @return array
87 */ 87 */
@@ -89,68 +89,74 @@
89 { 89 {
90 return $this->elements; 90 return $this->elements;
91 } 91 }
92 92
93 // Wrapper functions ------------------------------------------------------ 93 // Wrapper functions ------------------------------------------------------
94 94
95 /** 95 /**
96 * Set the 'dscription' element of feed item 96 * Set the 'dscription' element of feed item
97 * 97 *
98 * @access public 98 * @access public
99 * @param string The content of 'description' element 99 * @param string The content of 'description' element
100 * @return void 100 * @return void
101 */ 101 */
public function setDescription($description)
{
    // Atom items carry their description in <summary>; every other
    // format uses <description>.
    if ($this->version == ATOM) {
        $this->setElement('summary', $description);
    } else {
        $this->setElement('description', $description);
    }
}
106 107
107 /** 108 /**
108 * @desc Set the 'title' element of feed item 109 * @desc Set the 'title' element of feed item
109 * @access public 110 * @access public
110 * @param string The content of 'title' element 111 * @param string The content of 'title' element
111 * @return void 112 * @return void
112 */ 113 */
public function setTitle($title)
{
    // Goes through setElement(), so a previously set title is
    // overwritten rather than duplicated.
    $tagName = 'title';
    $this->setElement($tagName, $title);
}
117 118
118 /** 119 /**
119 * Set the 'date' element of feed item 120 * Set the 'date' element of feed item
120 * 121 *
121 * @access public 122 * @access public
122 * @param string The content of 'date' element 123 * @param string The content of 'date' element
123 * @return void 124 * @return void
124 */ 125 */
public function setDate($date)
{
    // Numeric input is taken as a Unix timestamp as-is; anything else
    // is parsed as a date string.
    $timestamp = is_numeric($date) ? $date : strtotime($date);

    // Tag name and date format both depend on the feed format.
    switch ($this->version) {
        case ATOM:
            $tag = 'updated';
            $value = date(DATE_ATOM, $timestamp);
            break;
        case RSS2:
            $tag = 'pubDate';
            $value = date(DATE_RSS, $timestamp);
            break;
        default:
            $tag = 'dc:date';
            $value = date("Y-m-d", $timestamp);
            break;
    }

    $this->setElement($tag, $value);
}
145 151
146 /** 152 /**
147 * Set the 'link' element of feed item 153 * Set the 'link' element of feed item
148 * 154 *
149 * @access public 155 * @access public
150 * @param string The content of 'link' element 156 * @param string The content of 'link' element
151 * @return void 157 * @return void
152 */ 158 */
153 public function setLink($link) 159 public function setLink($link)
154 { 160 {
155 if($this->version == RSS2 || $this->version == RSS1) 161 if($this->version == RSS2 || $this->version == RSS1)
156 { 162 {
@@ -161,27 +167,27 @@
161 { 167 {
162 $this->setElement('link','',array('href'=>$link)); 168 $this->setElement('link','',array('href'=>$link));
163 $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); 169 $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:'));
164 } 170 }
165 171
166 } 172 }
167 173
168 /** 174 /**
169 * Set the 'source' element of feed item 175 * Set the 'source' element of feed item
170 * 176 *
171 * @access public 177 * @access public
172 * @param string The content of 'source' element 178 * @param string The content of 'source' element
173 * @return void 179 * @return void
174 */ 180 */
public function setSource($link)
{
    // The element text is always "wallabag"; the given link is
    // attached as the element's url attribute.
    $this->setElement('source', "wallabag", array('url' => $link));
}
180 186
181 /** 187 /**
182 * Set the 'encloser' element of feed item 188 * Set the 'encloser' element of feed item
183 * For RSS 2.0 only 189 * For RSS 2.0 only
184 * 190 *
185 * @access public 191 * @access public
186 * @param string The url attribute of encloser tag 192 * @param string The url attribute of encloser tag
187 * @param string The length attribute of encloser tag 193 * @param string The length attribute of encloser tag
@@ -193,6 +199,6 @@
193 $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); 199 $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type);
194 $this->setElement('enclosure','',$attributes); 200 $this->setElement('enclosure','',$attributes);
195 } 201 }
196 202
197 } // end of class FeedItem 203 } // end of class FeedItem
198?> \ No newline at end of file 204?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
index d708e99b..77755690 100755
--- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
@@ -97,15 +97,12 @@ define('JSONP', 3, true);
97 header('X-content-type-options: nosniff'); 97 header('X-content-type-options: nosniff');
98 } elseif ($this->version == JSON) { 98 } elseif ($this->version == JSON) {
99 header('Content-type: application/json; charset=UTF-8'); 99 header('Content-type: application/json; charset=UTF-8');
100 $this->json = new stdClass();
100 } elseif ($this->version == JSONP) { 101 } elseif ($this->version == JSONP) {
101 header('Content-type: application/javascript; charset=UTF-8'); 102 header('Content-type: application/javascript; charset=UTF-8');
103 $this->json = new stdClass();
102 } 104 }
103 } 105 }
104
105 if ($this->version == JSON || $this->version == JSONP) {
106 $this->json = new stdClass();
107 }
108
109 106
110 $this->printHead(); 107 $this->printHead();
111 $this->printChannels(); 108 $this->printChannels();
@@ -116,6 +113,11 @@ define('JSONP', 3, true);
116 } 113 }
117 } 114 }
118 115
public function &getItems()
{
    // Returned by reference: a caller that binds the result with =&
    // works on the writer's own item list, not on a copy.
    $items =& $this->items;
    return $items;
}
120
119 /** 121 /**
120 * Create a new FeedItem. 122 * Create a new FeedItem.
121 * 123 *
@@ -199,7 +201,8 @@ define('JSONP', 3, true);
199 */ 201 */
public function setDescription($description)
{
    // Atom channels carry their description in <subtitle>; every other
    // format uses <description>.
    $tag = ($this->version == ATOM) ? 'subtitle' : 'description';
    // Pass the parameter itself. The previous code referenced the
    // misspelled, undefined variable $desciption, so the channel
    // description was always empty.
    $this->setChannelElement($tag, $description);
}
204 207
205 /** 208 /**
@@ -244,7 +247,7 @@ define('JSONP', 3, true);
244 { 247 {
245 $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; 248 $out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
246 if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; 249 if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
247 $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; 250 $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
248 echo $out; 251 echo $out;
249 } 252 }
250 elseif ($this->version == JSON || $this->version == JSONP) 253 elseif ($this->version == JSON || $this->version == JSONP)
diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php
index 2f5244f9..c4a48b21 100644
--- a/inc/3rdparty/libraries/html5/TreeBuilder.php
+++ b/inc/3rdparty/libraries/html5/TreeBuilder.php
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
134 134
135 // Namespaces for foreign content 135 // Namespaces for foreign content
136 const NS_HTML = null; // to prevent DOM from requiring NS on everything 136 const NS_HTML = null; // to prevent DOM from requiring NS on everything
137 const NS_XHTML = 'http://www.w3.org/1999/xhtml';
137 const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; 138 const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
138 const NS_SVG = 'http://www.w3.org/2000/svg'; 139 const NS_SVG = 'http://www.w3.org/2000/svg';
139 const NS_XLINK = 'http://www.w3.org/1999/xlink'; 140 const NS_XLINK = 'http://www.w3.org/1999/xlink';
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
3157 } 3158 }
3158 3159
3159 private function insertElement($token, $append = true) { 3160 private function insertElement($token, $append = true) {
3160 $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); 3161 //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
3162 $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
3163 $el = $this->dom->createElementNS($namespaceURI, $token['name']);
3161 3164
3162 if (!empty($token['attr'])) { 3165 if (!empty($token['attr'])) {
3163 foreach($token['attr'] as $attr) { 3166 foreach($token['attr'] as $attr) {
3164 if(!$el->hasAttribute($attr['name'])) { 3167
3168 // mike@macgirvin.com 2011-11-17, check attribute name for
3169 // validity (ignoring extenders and combiners) as illegal chars in names
3170 // causes everything to abort
3171
3172 $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']);
3173 if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
3165 $el->setAttribute($attr['name'], $attr['value']); 3174 $el->setAttribute($attr['name'], $attr['value']);
3166 } 3175 }
3167 } 3176 }
diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
index 83e94f14..e4d5f495 100644
--- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
+++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
@@ -1,404 +1,403 @@
1<?php 1<?php
2/** 2/**
3 * Cookie Jar 3 * Cookie Jar
4 * 4 *
5 * PHP class for handling cookies, as defined by the Netscape spec: 5 * PHP class for handling cookies, as defined by the Netscape spec:
6 * <http://curl.haxx.se/rfc/cookie_spec.html> 6 * <http://curl.haxx.se/rfc/cookie_spec.html>
7 * 7 *
8 * This class should be used to handle cookies (storing cookies from HTTP response messages, and 8 * This class should be used to handle cookies (storing cookies from HTTP response messages, and
9 * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org 9 * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org
10 * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ 10 * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
11 * 11 *
12 * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ 12 * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
13 * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. 13 * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
14 * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. 14 * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
15 * 15 *
16 * @version 0.5 16 * @version 0.5
17 * @date 2011-03-15 17 * @date 2011-03-15
18 * @see http://php.net/HttpRequestPool 18 * @see http://php.net/HttpRequestPool
19 * @author Keyvan Minoukadeh 19 * @author Keyvan Minoukadeh
20 * @copyright 2011 Keyvan Minoukadeh 20 * @copyright 2011 Keyvan Minoukadeh
21 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 21 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
22 */ 22 */
23 23
24class CookieJar 24class CookieJar
25{ 25{
26 /** 26 /**
27 * Cookies - array containing all cookies. 27 * Cookies - array containing all cookies.
28 * 28 *
29 * <pre> 29 * <pre>
30 * Cookies are stored like this: 30 * Cookies are stored like this:
31 * [domain][path][name] = array 31 * [domain][path][name] = array
32 * where array is: 32 * where array is:
33 * 0 => value, 1 => secure, 2 => expires 33 * 0 => value, 1 => secure, 2 => expires
34 * </pre> 34 * </pre>
35 * @var array 35 * @var array
36 * @access private 36 * @access private
37 */ 37 */
38 public $cookies = array(); 38 public $cookies = array();
39 public $debug = false; 39 public $debug = false;
40 40
41 /** 41 /**
42 * Constructor 42 * Constructor
43 */ 43 */
function __construct()
{
    // Nothing to initialise here: $cookies and $debug receive their
    // defaults from the property declarations.
}
46 46
/**
 * Print a debug message with current and peak memory usage.
 *
 * Does nothing unless $this->debug is truthy.
 *
 * @param string $msg message to print
 * @param string $file optional file name, shown only together with $line
 * @param int $line optional line number, shown only together with $file
 * @return void
 */
protected function debug($msg, $file=null, $line=null) {
    if (!$this->debug) {
        return;
    }
    // memory figures are reported in kilobytes
    $mem = round(memory_get_usage()/1024, 2);
    $memPeak = round(memory_get_peak_usage()/1024, 2);
    echo '* ',$msg;
    if (isset($file, $line)) echo " ($file line $line)";
    echo ' - mem used: ',$mem," (peak: $memPeak)\n";
    // ob_flush() raises a notice when no output buffer is active, so
    // only call it when there is one to flush
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}
58 58
59 /** 59 /**
60 * Get matching cookies 60 * Get matching cookies
61 * 61 *
62 * Only use this method if you cannot use add_cookie_header(), for example, if you want to use 62 * Only use this method if you cannot use add_cookie_header(), for example, if you want to use
63 * this cookie jar class without using the request class. 63 * this cookie jar class without using the request class.
64 * 64 *
65 * @param array $param associative array containing 'domain', 'path', 'secure' keys 65 * @param array $param associative array containing 'domain', 'path', 'secure' keys
66 * @return string 66 * @return string
67 * @see add_cookie_header() 67 * @see add_cookie_header()
68 */ 68 */
public function getMatchingCookies($url)
{
    // A scheme and host are required. The path is optional: parse_url()
    // omits it for URLs such as "http://example.com", and such requests
    // are treated as a request for "/" (the same default storeCookies()
    // applies), instead of being rejected outright.
    $parts = @parse_url($url);
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return false;
    }
    // RFC 2965 notes:
    // If multiple cookies satisfy the criteria above, they are ordered in
    // the Cookie header such that those with more specific Path attributes
    // precede those with less specific. Ordering with respect to other
    // attributes (e.g., Domain) is unspecified.
    $domain = $parts['host'];
    // dotless hosts are looked up under a ".local" suffix
    if (strpos($domain, '.') === false) $domain .= '.local';
    $request_path = isset($parts['path']) ? $parts['path'] : '';
    if ($request_path == '') $request_path = '/';
    $request_secure = (strtolower($parts['scheme']) == 'https');
    unset($parts);
    $now = time();
    $matched_cookies = array();
    // domain - find matching domains, reducing the domain on every pass
    // until no dot is left
    $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
    while (strpos($domain, '.') !== false) {
        if (isset($this->cookies[$domain])) {
            $this->debug(' domain match found: '.$domain);
            $cookies =& $this->cookies[$domain];
        } else {
            $domain = $this->_reduce_domain($domain);
            continue;
        }
        // paths - find matching paths starting from most specific
        $this->debug(' - Finding matching paths for '.$request_path);
        $paths = array_keys($cookies);
        usort($paths, array($this, '_cmp_length'));
        foreach ($paths as $path) {
            // continue to next cookie if request path does not path-match cookie path
            if (!$this->_path_match($request_path, $path)) continue;
            // loop through cookie names
            $this->debug(' path match found: '.$path);
            foreach ($cookies[$path] as $name => $values) {
                // $values is: 0 => value, 1 => secure, 2 => expires
                // if this cookie is secure but request isn't, continue to next cookie
                if ($values[1] && !$request_secure) continue;
                // if cookie is not a session cookie and has expired, continue to next cookie
                if (is_int($values[2]) && ($values[2] < $now)) continue;
                // cookie matches request
                $this->debug(' cookie match: '.$name.'='.$values[0]);
                $matched_cookies[] = $name.'='.$values[0];
            }
        }
        $domain = $this->_reduce_domain($domain);
    }
    // return cookies as a single "name=value; name=value" string
    return implode('; ', $matched_cookies);
}
125 125
126 /** 126 /**
127 * Parse Set-Cookie values. 127 * Parse Set-Cookie values.
128 * 128 *
129 * Only use this method if you cannot use extract_cookies(), for example, if you want to use 129 * Only use this method if you cannot use extract_cookies(), for example, if you want to use
130 * this cookie jar class without using the response class. 130 * this cookie jar class without using the response class.
131 * 131 *
132 * @param array $set_cookies array holding 1 or more "Set-Cookie" header values 132 * @param array $set_cookies array holding 1 or more "Set-Cookie" header values
133 * @param array $param associative array containing 'host', 'path' keys 133 * @param array $param associative array containing 'host', 'path' keys
134 * @return void 134 * @return void
135 * @see extract_cookies() 135 * @see extract_cookies()
136 */ 136 */
public function storeCookies($url, $set_cookies)
{
    if (count($set_cookies) == 0) return;
    $param = @parse_url($url);
    if (!is_array($param) || !isset($param['host'])) return;
    $request_host = $param['host'];
    // dotless hosts are stored under a ".local" suffix
    if (strpos($request_host, '.') === false) $request_host .= '.local';
    $request_path = isset($param['path']) ? $param['path'] : '';
    if ($request_path == '') $request_path = '/';
    //
    // loop through set-cookie headers
    //
    foreach ($set_cookies as $set_cookie) {
        $this->debug('Parsing: '.$set_cookie);
        // temporary cookie store (before adding to jar)
        $tmp_cookie = array();
        $param = explode(';', $set_cookie);
        // loop through params
        for ($x=0; $x<count($param); $x++) {
            $key_val = explode('=', $param[$x], 2);
            if (count($key_val) != 2) {
                // if the first param isn't a name=value pair, continue to the next set-cookie
                // header
                if ($x == 0) continue 2;
                // check for secure flag
                if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
                // continue to next param
                continue;
            }
            list($key, $val) = array_map('trim', $key_val);
            // first name=value pair is the cookie name and value
            // the name and value are stored under 'name' and 'value' to avoid conflicts
            // with later parameters.
            if ($x == 0) {
                $tmp_cookie = array('name'=>$key, 'value'=>$val);
                continue;
            }
            $key = strtolower($key);
            if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
                $tmp_cookie[$key] = $val;
            }
        }
        //
        // set cookie
        //
        // check domain
        if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
                ($tmp_cookie['domain'] != ".$request_host")) {
            $domain = $tmp_cookie['domain'];
            if ((strpos($domain, '.') === false) && ($domain != 'local')) {
                $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
                continue;
            }
            if (preg_match('/\.[0-9]+$/', $domain)) {
                $this->debug(' - domain "'.$domain.'" appears to be an ip address');
                continue;
            }
            if (substr($domain, 0, 1) != '.') $domain = ".$domain";
            if (!$this->_domain_match($request_host, $domain)) {
                $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
                continue;
            }
        } else {
            // if domain is not specified in the set-cookie header, domain will default to
            // the request host
            $domain = $request_host;
        }
        // check path
        if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
            $path = urldecode($tmp_cookie['path']);
            if (!$this->_path_match($request_path, $path)) {
                $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
                continue;
            }
        } else {
            // no path given: default to the request path up to (not including) its last slash
            $path = $request_path;
            $path = substr($path, 0, strrpos($path, '/'));
            if ($path == '') $path = '/';
        }
        // check if secure
        $secure = isset($tmp_cookie['secure']);
        // check expiry - null means no usable expiry date was supplied
        $expires = null;
        if (isset($tmp_cookie['expires'])) {
            $timestamp = strtotime($tmp_cookie['expires']);
            // strtotime() reports an unparseable date with false (it no longer
            // returns -1), so test for false explicitly; negative timestamps
            // are still discarded, as before
            if ($timestamp !== false && $timestamp >= 0) {
                $expires = $timestamp;
            }
        }
        // set cookie
        $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
    }
}
230 230
/**
 * Extract Set-Cookie header values from a raw HTTP response header block.
 *
 * Scanning stops at the first empty line after the first line (end of headers).
 * Folded continuation lines (RFC 2616 section 2.2: a line starting with
 * whitespace) that directly follow a Set-Cookie header are appended to that
 * header's value, separated by a single space.
 *
 * @param string $h raw HTTP response headers
 * @return array list of Set-Cookie values, in order of appearance
 */
public function extractCookies($h) {
    $cookies = array();
    $prevWasCookie = false;
    foreach (explode("\n", $h) as $index => $raw) {
        $line = rtrim($raw);
        // the very first line is never treated as a terminator or a continuation
        if ($index > 0) {
            // some servers do not send CRLF, so an empty trimmed line marks the end of headers
            if (trim($line) == '') {
                break;
            }
            if ($prevWasCookie && preg_match('/^\s+(.*)/', $line, $folded)) {
                // continuation of the previous Set-Cookie header
                $cookies[count($cookies) - 1] .= ' '.rtrim($folded[1]);
                continue;
            }
        }
        $prevWasCookie = (bool) preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $header);
        if ($prevWasCookie) {
            $cookies[] = rtrim($header[1]);
        }
    }
    return $cookies;
}
279 279
/**
 * Set Cookie
 *
 * Stores (or deletes) a cookie in the jar, keyed by domain, path and name.
 * Nothing is stored when domain, path or name is empty, or when the value is
 * empty.
 *
 * @param string $domain
 * @param string $path
 * @param string $name cookie name
 * @param string $value cookie value
 * @param bool $secure
 * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie);
 *                     false (an unparseable date from strtotime()) is treated as null
 * @return void
 */
function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
{
    if ($domain == '' || $path == '' || $name == '') return;
    // strtotime() returns false when the Expires value cannot be parsed. Since
    // false <= 0 is true, that used to delete the cookie; an unparseable expiry
    // is ignored instead and the cookie is kept as a session cookie.
    if ($expires === false) $expires = null;
    // check if cookie needs to go
    if (isset($expires) && ($expires <= 0)) {
        if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
        return;
    }
    if ($value == '') return;
    $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
}
304 304
/**
 * Clear cookies at the requested depth.
 *
 * No arguments: every cookie is removed. Domain only: that domain's cookies.
 * Domain and path: the cookies stored under that path. Domain, path and name:
 * that single cookie.
 *
 * @param string $domain
 * @param string $path
 * @param string $name
 * @return void
 */
function clear($domain=null, $path=null, $name=null)
{
    if ($domain === null) {
        $this->cookies = array();
        return;
    }
    if ($path === null) {
        if (isset($this->cookies[$domain])) {
            unset($this->cookies[$domain]);
        }
        return;
    }
    if ($name === null) {
        if (isset($this->cookies[$domain][$path])) {
            unset($this->cookies[$domain][$path]);
        }
        return;
    }
    if (isset($this->cookies[$domain][$path][$name])) {
        unset($this->cookies[$domain][$path][$name]);
    }
}
324 324
/**
 * Sort callback comparing two strings by byte length, longest first.
 *
 * @access private
 * @param string $a
 * @param string $b
 * @return int -1 if $a is longer, 1 if $b is longer, 0 if equal length
 */
function _cmp_length($a, $b)
{
    $diff = strlen($b) - strlen($a);
    if ($diff == 0) {
        return 0;
    }
    return ($diff < 0) ? -1 : 1;
}
336 336
/**
 * Reduce domain by one step.
 *
 * A leading dot is stripped ('.foo.com' becomes 'foo.com'); otherwise the
 * first label is dropped and the result starts at the first dot
 * ('x.foo.com' becomes '.foo.com'). A string without any dot is returned
 * unchanged and an empty string stays empty.
 *
 * @param string $domain
 * @return string
 * @access private
 */
function _reduce_domain($domain)
{
    if ($domain == '') {
        return '';
    }
    if (substr($domain, 0, 1) == '.') {
        return substr($domain, 1);
    }
    $firstDot = strpos($domain, '.');
    return ($firstDot === false) ? $domain : substr($domain, $firstDot);
}
349 349
350 /** 350 /**
351 * Path match - check if path1 path-matches path2 351 * Path match - check if path1 path-matches path2
352 * 352 *
353 * From RFC 2965: 353 * From RFC 2965:
354 * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2 354 * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
355 * if P2 is a prefix of P1 (including the case where P1 and P2 string- 355 * if P2 is a prefix of P1 (including the case where P1 and P2 string-
356 * compare equal). Thus, the string /tec/waldo path-matches /tec.</i> 356 * compare equal). Thus, the string /tec/waldo path-matches /tec.</i>
357 * @param string $path1 357 * @param string $path1
358 * @param string $path2 358 * @param string $path2
359 * @return bool 359 * @return bool
360 * @access private 360 * @access private
361 */ 361 */
362 function _path_match($path1, $path2) 362 function _path_match($path1, $path2)
363 { 363 {
364 return (substr($path1, 0, strlen($path2)) == $path2); 364 return (substr($path1, 0, strlen($path2)) == $path2);
365 } 365 }
366 366
/**
 * Domain match - check if domain1 domain-matches domain2
 *
 * Both names are lower-cased, then domain1 is repeatedly shortened with
 * _reduce_domain() (strip a leading dot, else drop the first label) for as
 * long as it still contains a dot; the match succeeds as soon as the
 * shortened name equals domain2.
 *
 * A few extracts from RFC 2965:
 * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
 *   would be rejected, because H is y.x and contains a dot.
 *
 * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
 *   would be accepted.
 *
 * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be
 *   rejected, because there is no embedded dot.
 *
 * - A Set-Cookie2 from request-host example for Domain=.local will
 *   be accepted, because the effective host name for the request-
 *   host is example.local, and example.local domain-matches .local.
 *
 * The first point is deliberately not enforced here (pending a check of how
 * browsers apply that rule to plain Set-Cookie headers).
 * NOTE(review): the loop itself does not enforce the "embedded dot" rule
 * either - confirm callers reject such domains before relying on this.
 *
 * @param string $domain1
 * @param string $domain2
 * @return bool
 * @access private
 */
function _domain_match($domain1, $domain2)
{
    $target = strtolower($domain2);
    for ($candidate = strtolower($domain1);
         strpos($candidate, '.') !== false;
         $candidate = $this->_reduce_domain($candidate)) {
        if ($candidate == $target) {
            return true;
        }
    }
    return false;
}
403} 403} \ No newline at end of file
404?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
index e4f1b3b3..963f0c05 100644
--- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -1,779 +1,810 @@
1<?php 1<?php
2/** 2/**
3 * Humble HTTP Agent 3 * Humble HTTP Agent
4 * 4 *
5 * This class is designed to take advantage of parallel HTTP requests 5 * This class is designed to take advantage of parallel HTTP requests
6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions. 6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
7 * For environments which do not have these options, it reverts to standard sequential 7 * For environments which do not have these options, it reverts to standard sequential
8 * requests (using file_get_contents()) 8 * requests (using file_get_contents())
9 * 9 *
10 * @version 1.1 10 * @version 1.4
11 * @date 2012-08-20 11 * @date 2013-05-10
12 * @see http://php.net/HttpRequestPool 12 * @see http://php.net/HttpRequestPool
13 * @author Keyvan Minoukadeh 13 * @author Keyvan Minoukadeh
14 * @copyright 2011-2012 Keyvan Minoukadeh 14 * @copyright 2011-2013 Keyvan Minoukadeh
15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
16 */ 16 */
17 17
18class HumbleHttpAgent 18class HumbleHttpAgent
19{ 19{
20 const METHOD_REQUEST_POOL = 1; 20 const METHOD_REQUEST_POOL = 1;
21 const METHOD_CURL_MULTI = 2; 21 const METHOD_CURL_MULTI = 2;
22 const METHOD_FILE_GET_CONTENTS = 4; 22 const METHOD_FILE_GET_CONTENTS = 4;
23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; 23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; 24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
25 const UA_PHP = 'PHP/5.2'; 25 const UA_PHP = 'PHP/5.4';
26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; 26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
27 27
28 protected $requests = array(); 28 protected $requests = array();
29 protected $redirectQueue = array(); 29 protected $redirectQueue = array();
30 protected $requestOptions; 30 protected $requestOptions;
31 protected $maxParallelRequests = 5; 31 protected $maxParallelRequests = 5;
32 protected $cache = null; //TODO 32 protected $cache = null; //TODO
33 protected $httpContext; 33 protected $httpContext;
34 protected $minimiseMemoryUse = false; //TODO 34 protected $minimiseMemoryUse = false; //TODO
35 protected $method; 35 protected $method;
36 protected $cookieJar; 36 protected $cookieJar;
37 public $debug = false; 37 public $debug = false;
38 public $debugVerbose = false; 38 public $debugVerbose = false;
39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html 39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
40 public $maxRedirects = 5; 40 public $maxRedirects = 5;
41 public $userAgentMap = array(); 41 public $userAgentMap = array();
42 public $rewriteUrls = array(); 42 public $rewriteUrls = array();
43 public $userAgentDefault; 43 public $userAgentDefault;
44 public $referer; 44 public $referer;
45 //public $userAgent = 'Mozilla/5.0'; 45 //public $userAgent = 'Mozilla/5.0';
46 46
47 // Prevent certain file/mime types 47 // Prevent certain file/mime types
48 // HTTP responses which match these content types will 48 // HTTP responses which match these content types will
49 // be returned without body. 49 // be returned without body.
50 public $headerOnlyTypes = array(); 50 public $headerOnlyTypes = array();
51 // URLs ending with one of these extensions will 51 // URLs ending with one of these extensions will
52 // prompt Humble HTTP Agent to send a HEAD request first 52 // prompt Humble HTTP Agent to send a HEAD request first
53 // to see if returned content type matches $headerOnlyTypes. 53 // to see if returned content type matches $headerOnlyTypes.
54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); 54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
55 // AJAX triggers to search for. 55 // AJAX triggers to search for.
56 // for AJAX sites, e.g. Blogger with its dynamic views templates. 56 // for AJAX sites, e.g. Blogger with its dynamic views templates.
57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); 57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
58 58
59 //TODO: set max file size 59 //TODO: set max file size
60 //TODO: normalise headers 60 //TODO: normalise headers
61 61
/**
 * Create the agent: pick the transport, create the cookie jar and build the
 * default request options and stream context.
 *
 * @param array|null $requestOptions options merged over the defaults
 * @param int|null $method one of the METHOD_* constants; any other value
 *                         selects the best transport available
 */
function __construct($requestOptions=null, $method=null) {
    $this->userAgentDefault = self::UA_BROWSER;
    $this->referer = self::REF_GOOGLE;

    // set the request method
    $knownMethods = array(
        self::METHOD_REQUEST_POOL,
        self::METHOD_CURL_MULTI,
        self::METHOD_FILE_GET_CONTENTS
    );
    if (in_array($method, $knownMethods)) {
        $this->method = $method;
    } elseif (class_exists('HttpRequestPool')) {
        $this->method = self::METHOD_REQUEST_POOL;
    } elseif (function_exists('curl_multi_init')) {
        $this->method = self::METHOD_CURL_MULTI;
    } else {
        $this->method = self::METHOD_FILE_GET_CONTENTS;
    }
    if ($this->method == self::METHOD_CURL_MULTI) {
        require_once(dirname(__FILE__).'/RollingCurl.php');
    }

    // create cookie jar
    $this->cookieJar = new CookieJar();

    // set request options (redirect must be 0): redirects are handled manually
    // so that hashbang URLs can be rewritten along the way
    // TODO: test onprogress?
    $defaults = array(
        'timeout' => 15,
        'connecttimeout' => 15,
        'dns_cache_timeout' => 300,
        'redirect' => 0
    );
    $this->requestOptions = is_array($requestOptions)
        ? array_merge($defaults, $requestOptions)
        : $defaults;

    $this->httpContext = array(
        'http' => array(
            'ignore_errors' => true,
            'timeout' => $this->requestOptions['timeout'],
            'max_redirects' => $this->requestOptions['redirect'],
            'header' => "Accept: */*\r\n"
        )
    );
}
102 if ($this->debug) { 102
/**
 * Print a debug message when debugging is enabled.
 *
 * With $debugVerbose set, current and peak memory use (in KiB) are appended.
 * Output is flushed straight away so progress is visible while fetching.
 *
 * @param string $msg
 * @return void
 */
protected function debug($msg) {
    if (!$this->debug) return;
    echo '* ',$msg;
    if ($this->debugVerbose) {
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo ' - mem used: ',$mem," (peak: $memPeak)";
    }
    echo "\n";
    // ob_flush() raises a notice when no output buffer is active
    if (ob_get_level() > 0) ob_flush();
    flush();
}
113 protected function getUserAgent($url, $asArray=false) { 113 }
114 $host = @parse_url($url, PHP_URL_HOST); 114
/**
 * Pick the User-Agent for a URL.
 *
 * The host (minus a leading "www.") is looked up in $userAgentMap first as
 * is, then as ".parent.domain" with its first label removed; when neither
 * key exists $userAgentDefault is used.
 *
 * @param string $url
 * @param bool $asArray return array('User-Agent' => ...) instead of a header line
 * @return array|string
 */
protected function getUserAgent($url, $asArray=false) {
    $host = @parse_url($url, PHP_URL_HOST);
    if (strtolower(substr($host, 0, 4)) == 'www.') {
        $host = substr($host, 4);
    }
    $ua = $this->userAgentDefault;
    if ($host) {
        $candidates = array($host);
        $labels = explode('.', $host);
        if (count($labels) > 1) {
            $candidates[] = '.'.implode('.', array_slice($labels, 1));
        }
        foreach ($candidates as $candidate) {
            if (isset($this->userAgentMap[$candidate])) {
                $ua = $this->userAgentMap[$candidate];
                break;
            }
        }
    }
    return $asArray ? array('User-Agent' => $ua) : 'User-Agent: '.$ua;
}
141 // return $url if there's no '#!' 141
/**
 * Rewrite a hashbang ("#!") URL: the fragment, minus its leading "!", is
 * moved into an _escaped_fragment_ query parameter. URLs without "#!" are
 * returned untouched.
 *
 * @param string $url
 * @return string
 */
public function rewriteHashbangFragment($url) {
    if (strpos($url, '#!') === false) {
        return $url;
    }
    // TODO: is SimplePie_IRI included?
    $iri = new SimplePie_IRI($url);
    $escaped = (string)substr($iri->fragment, 1); // strip '!'
    $iri->fragment = null;
    $params = array();
    if (isset($iri->query)) {
        parse_str($iri->query, $params);
    }
    $params['_escaped_fragment_'] = $escaped;
    // keep slashes readable - needed for some sites
    $iri->query = str_replace('%2F', '/', http_build_query($params));
    return $iri->get_iri();
}
159 if ($html == '') return false; 159
/**
 * Look for a redirect target inside an HTML document: a meta refresh URL
 * first, then the AJAX "ugly URL" as fallback.
 *
 * @param string $url URL the HTML was fetched from
 * @param string $html
 * @return mixed result of getMetaRefreshURL() when truthy, otherwise the result of getUglyURL()
 */
public function getRedirectURLfromHTML($url, $html) {
    $metaRefresh = $this->getMetaRefreshURL($url, $html);
    if ($metaRefresh) {
        return $metaRefresh;
    }
    return $this->getUglyURL($url, $html);
}
167 if (!$found) return false; 167
/**
 * Find a <meta http-equiv="refresh"> redirect with a single-digit delay,
 * e.g. <meta HTTP-EQUIV="REFRESH" content="0; url=http://example.org/page">
 *
 * An http(s) target is returned as found; anything else is resolved against
 * $url via SimplePie_IRI.
 *
 * @param string $url URL the HTML was fetched from
 * @param string $html
 * @return mixed the redirect target, or false when none is found or it cannot be resolved
 */
public function getMetaRefreshURL($url, $html) {
    if ($html == '') {
        return false;
    }
    $pattern = '!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i';
    if (!preg_match($pattern, $html, $match)) {
        return false;
    }
    $redirect_url = $match[1];
    if (!preg_match('!^https?://!i', $redirect_url)) {
        // absolutize redirect URL
        $base = new SimplePie_IRI($url);
        // remove '//' in URL path (causes URLs not to resolve properly)
        if (isset($base->path)) {
            $base->path = preg_replace('!//+!', '/', $base->path);
        }
        $redirect_url = SimplePie_IRI::absolutize($base, $redirect_url);
        if (!$redirect_url) {
            return false;
        }
    }
    $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
    return $redirect_url;
}
190 if (strpos($url, $find) !== false) { 190
/**
 * Build the "ugly" URL for a page that contains one of $ajaxTriggers: the
 * page URL with an empty _escaped_fragment_ query parameter added.
 *
 * @param string $url URL the HTML was fetched from
 * @param string $html
 * @return string|false the ugly URL, or false when $html is empty or holds no trigger
 */
public function getUglyURL($url, $html) {
    if ($html == '') return false;
    $found = false;
    foreach ($this->ajaxTriggers as $string) {
        // stripos() returns 0 for a match at the very start of $html, so the
        // result has to be compared against false rather than tested for truth
        if (stripos($html, $string) !== false) {
            $found = true;
            break;
        }
    }
    if (!$found) return false;
    $iri = new SimplePie_IRI($url);
    if (isset($iri->query)) {
        parse_str($iri->query, $query);
    } else {
        $query = array();
    }
    $query['_escaped_fragment_'] = '';
    $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
    $ugly_url = $iri->get_iri();
    $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
    return $ugly_url;
}
213 $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); 213
/**
 * Strip the fragment ('#' and everything after it) from a URL.
 *
 * @param string $url
 * @return string
 */
public function removeFragment($url) {
    $hashAt = strpos($url, '#');
    return ($hashAt === false) ? $url : substr($url, 0, $hashAt);
}
222 } 222
/**
 * Apply the first matching rule from $rewriteUrls to a URL.
 *
 * A rule matches when its key occurs in the URL and its value is an array;
 * that array is used as the strtr() replacement map. Without a matching rule
 * the URL is returned unchanged.
 *
 * @param string $url
 * @return string
 */
public function rewriteUrls($url) {
    foreach ($this->rewriteUrls as $needle => $replacements) {
        if (strpos($url, $needle) !== false && is_array($replacements)) {
            return strtr($url, $replacements);
        }
    }
    return $url;
}
233 233
/**
 * Switch debug output on or off.
 *
 * @param bool $bool
 * @return void
 */
public function enableDebug($bool=true) {
    $enabled = (bool)$bool;
    $this->debug = $enabled;
}
237 if (empty($urls)) return; 237
/**
 * Set the $minimiseMemoryUse flag (stored as given, without casting).
 *
 * @param mixed $bool
 * @return void
 */
public function minimiseMemoryUse($bool = true)
{
    $this->minimiseMemoryUse = $bool;
}
241 if ($this->method == self::METHOD_REQUEST_POOL) { 241
/**
 * Set the value of $maxParallelRequests (stored as given).
 *
 * @param int $max
 * @return void
 */
public function setMaxParallelRequests($max)
{
    $this->maxParallelRequests = $max;
}
245 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); 245
/**
 * Sanitise and validate an http(s) URL.
 *
 * FILTER_FLAG_SCHEME_REQUIRED is no longer passed: FILTER_VALIDATE_URL
 * already implies it, and the constant is deprecated as of PHP 7.3 and
 * removed in PHP 8.0.
 *
 * @param string $url
 * @return string|false the sanitised URL, or false when it is not a valid http(s) URL
 */
public function validateUrl($url) {
    $url = filter_var($url, FILTER_SANITIZE_URL);
    $test = filter_var($url, FILTER_VALIDATE_URL);
    // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
    if ($test === false) {
        $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
    }
    if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
        return $url;
    }
    return false;
}
259 } 259
/**
 * Fetch all URLs, then keep re-fetching whatever is left in the redirect
 * queue, for at most $maxRedirects extra rounds.
 *
 * @param array $urls
 * @return void
 */
public function fetchAll(array $urls) {
    $this->fetchAllOnce($urls, false);
    for ($redirects = 1;
         !empty($this->redirectQueue) && $redirects <= $this->maxRedirects;
         $redirects++) {
        $this->debug("Following redirects #$redirects...");
        $this->fetchAllOnce($this->redirectQueue, true);
    }
}
268 } else { 268
269 $_meth = HttpRequest::METH_GET; 269 // fetch all URLs without following redirects
270 unset($this->requests[$orig]['wrongGuess']); 270 public function fetchAllOnce(array $urls, $isRedirect=false) {
271 } 271 if (!$isRedirect) $urls = array_unique($urls);
272 $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); 272 if (empty($urls)) return;
273 // send cookies, if we have any 273
274 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 274 //////////////////////////////////////////////////////
275 $this->debug("......sending cookies: $cookies"); 275 // parallel (HttpRequestPool)
276 $httpRequest->addHeaders(array('Cookie' => $cookies)); 276 if ($this->method == self::METHOD_REQUEST_POOL) {
277 } 277 $this->debug('Starting parallel fetch (HttpRequestPool)');
278 //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); 278 try {
279 $httpRequest->addHeaders($this->getUserAgent($req_url, true)); 279 while (count($urls) > 0) {
280 // add referer for picky sites 280 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
281 $httpRequest->addheaders(array('Referer' => $this->referer)); 281 $subset = array_splice($urls, 0, $this->maxParallelRequests);
282 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); 282 $pool = new HttpRequestPool();
283 $this->requests[$orig]['original_url'] = $orig; 283 foreach ($subset as $orig => $url) {
284 $pool->attach($httpRequest); 284 if (!$isRedirect) $orig = $url;
285 } 285 unset($this->redirectQueue[$orig]);
286 } 286 $this->debug("...$url");
287 // did we get anything into the pool? 287 if (!$isRedirect && isset($this->requests[$url])) {
288 if (count($pool) > 0) { 288 $this->debug("......in memory");
289 $this->debug('Sending request...'); 289 /*
290 try { 290 } elseif ($this->isCached($url)) {
291 $pool->send(); 291 $this->debug("......is cached");
292 } catch (HttpRequestPoolException $e) { 292 if (!$this->minimiseMemoryUse) {
293 // do nothing 293 $this->requests[$url] = $this->getCached($url);
294 } 294 }
295 $this->debug('Received responses'); 295 */
296 foreach($subset as $orig => $url) { 296 } else {
297 if (!$isRedirect) $orig = $url; 297 $this->debug("......adding to pool");
298 $request = $this->requests[$orig]['httpRequest']; 298 $req_url = $this->rewriteUrls($url);
299 //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); 299 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
300 // getResponseHeader() doesn't return status line, so, for consistency... 300 $req_url = $this->removeFragment($req_url);
301 $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); 301 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
302 // check content type 302 $_meth = HttpRequest::METH_HEAD;
303 // TODO: use getResponseHeader('content-type') or getResponseInfo() 303 } else {
304 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 304 $_meth = HttpRequest::METH_GET;
305 $this->requests[$orig]['body'] = ''; 305 unset($this->requests[$orig]['wrongGuess']);
306 $_header_only_type = true; 306 }
307 $this->debug('Header only type returned'); 307 $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
308 } else { 308 // send cookies, if we have any
309 $this->requests[$orig]['body'] = $request->getResponseBody(); 309 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
310 $_header_only_type = false; 310 $this->debug("......sending cookies: $cookies");
311 } 311 $httpRequest->addHeaders(array('Cookie' => $cookies));
312 $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); 312 }
313 $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); 313 //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
314 // is redirect? 314 $httpRequest->addHeaders($this->getUserAgent($req_url, true));
315 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { 315 // add referer for picky sites
316 $redirectURL = $request->getResponseHeader('location'); 316 $httpRequest->addheaders(array('Referer' => $this->referer));
317 if (!preg_match('!^https?://!i', $redirectURL)) { 317 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
318 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 318 $this->requests[$orig]['original_url'] = $orig;
319 } 319 $pool->attach($httpRequest);
320 if ($this->validateURL($redirectURL)) { 320 }
321 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 321 }
322 // store any cookies 322 // did we get anything into the pool?
323 $cookies = $request->getResponseHeader('set-cookie'); 323 if (count($pool) > 0) {
324 if ($cookies && !is_array($cookies)) $cookies = array($cookies); 324 $this->debug('Sending request...');
325 if ($cookies) $this->cookieJar->storeCookies($url, $cookies); 325 try {
326 $this->redirectQueue[$orig] = $redirectURL; 326 $pool->send();
327 } else { 327 } catch (HttpRequestPoolException $e) {
328 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 328 // do nothing
329 } 329 }
330 } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { 330 $this->debug('Received responses');
331 // the response content-type did not match our 'header only' types, 331 foreach($subset as $orig => $url) {
332 // but we'd issues a HEAD request because we assumed it would. So 332 if (!$isRedirect) $orig = $url;
333 // let's queue a proper GET request for this item... 333 $request = $this->requests[$orig]['httpRequest'];
334 $this->debug('Wrong guess at content-type, queing GET request'); 334 //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
335 $this->requests[$orig]['wrongGuess'] = true; 335 // getResponseHeader() doesn't return status line, so, for consistency...
336 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; 336 $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
337 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 337 // check content type
338 // check for <meta name='fragment' content='!'/> 338 // TODO: use getResponseHeader('content-type') or getResponseInfo()
339 // for AJAX sites, e.g. Blogger with its dynamic views templates. 339 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
340 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 340 $this->requests[$orig]['body'] = '';
341 if (isset($this->requests[$orig]['body'])) { 341 $_header_only_type = true;
342 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 342 $this->debug('Header only type returned');
343 if ($redirectURL) { 343 } else {
344 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 344 $this->requests[$orig]['body'] = $request->getResponseBody();
345 $this->redirectQueue[$orig] = $redirectURL; 345 $_header_only_type = false;
346 } 346 }
347 } 347 $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
348 } 348 $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
349 //die($url.' -multi- '.$request->getResponseInfo('effective_url')); 349 // is redirect?
350 $pool->detach($request); 350 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
351 unset($this->requests[$orig]['httpRequest'], $request); 351 $redirectURL = $request->getResponseHeader('location');
352 /* 352 if (!preg_match('!^https?://!i', $redirectURL)) {
353 if ($this->minimiseMemoryUse) { 353 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
354 if ($this->cache($url)) { 354 }
355 unset($this->requests[$url]); 355 if ($this->validateURL($redirectURL)) {
356 } 356 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
357 } 357 // store any cookies
358 */ 358 $cookies = $request->getResponseHeader('set-cookie');
359 } 359 if ($cookies && !is_array($cookies)) $cookies = array($cookies);
360 } 360 if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
361 } 361 $this->redirectQueue[$orig] = $redirectURL;
362 } catch (HttpException $e) { 362 } else {
363 $this->debug($e); 363 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
364 return false; 364 }
365 } 365 } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
366 } 366 // the response content-type did not match our 'header only' types,
367 367 // but we'd issues a HEAD request because we assumed it would. So
368 ////////////////////////////////////////////////////////// 368 // let's queue a proper GET request for this item...
369 // parallel (curl_multi_*) 369 $this->debug('Wrong guess at content-type, queing GET request');
370 elseif ($this->method == self::METHOD_CURL_MULTI) { 370 $this->requests[$orig]['wrongGuess'] = true;
371 $this->debug('Starting parallel fetch (curl_multi_*)'); 371 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
372 while (count($urls) > 0) { 372 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
373 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); 373 // check for <meta name='fragment' content='!'/>
374 $subset = array_splice($urls, 0, $this->maxParallelRequests); 374 // for AJAX sites, e.g. Blogger with its dynamic views templates.
375 $pool = new RollingCurl(array($this, 'handleCurlResponse')); 375 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
376 $pool->window_size = count($subset); 376 if (isset($this->requests[$orig]['body'])) {
377 377 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
378 foreach ($subset as $orig => $url) { 378 if ($redirectURL) {
379 if (!$isRedirect) $orig = $url; 379 $this->redirectQueue[$orig] = $redirectURL;
380 unset($this->redirectQueue[$orig]); 380 }
381 $this->debug("...$url"); 381 }
382 if (!$isRedirect && isset($this->requests[$url])) { 382 }
383 $this->debug("......in memory"); 383 //die($url.' -multi- '.$request->getResponseInfo('effective_url'));
384 /* 384 $pool->detach($request);
385 } elseif ($this->isCached($url)) { 385 unset($this->requests[$orig]['httpRequest'], $request);
386 $this->debug("......is cached"); 386 /*
387 if (!$this->minimiseMemoryUse) { 387 if ($this->minimiseMemoryUse) {
388 $this->requests[$url] = $this->getCached($url); 388 if ($this->cache($url)) {
389 } 389 unset($this->requests[$url]);
390 */ 390 }
391 } else { 391 }
392 $this->debug("......adding to pool"); 392 */
393 $req_url = $this->rewriteUrls($url); 393 }
394 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 394 }
395 $req_url = $this->removeFragment($req_url); 395 }
396 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { 396 } catch (HttpException $e) {
397 $_meth = 'HEAD'; 397 $this->debug($e);
398 } else { 398 return false;
399 $_meth = 'GET'; 399 }
400 unset($this->requests[$orig]['wrongGuess']); 400 }
401 } 401
402 $headers = array(); 402 //////////////////////////////////////////////////////////
403 //$headers[] = 'User-Agent: '.$this->userAgent; 403 // parallel (curl_multi_*)
404 $headers[] = $this->getUserAgent($req_url); 404 elseif ($this->method == self::METHOD_CURL_MULTI) {
405 // add referer for picky sites 405 $this->debug('Starting parallel fetch (curl_multi_*)');
406 $headers[] = 'Referer: '.$this->referer; 406 while (count($urls) > 0) {
407 // send cookies, if we have any 407 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
408 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 408 $subset = array_splice($urls, 0, $this->maxParallelRequests);
409 $this->debug("......sending cookies: $cookies"); 409 $pool = new RollingCurl(array($this, 'handleCurlResponse'));
410 $headers[] = 'Cookie: '.$cookies; 410 $pool->window_size = count($subset);
411 } 411
412 $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( 412 foreach ($subset as $orig => $url) {
413 CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], 413 if (!$isRedirect) $orig = $url;
414 CURLOPT_TIMEOUT => $this->requestOptions['timeout'] 414 unset($this->redirectQueue[$orig]);
415 )); 415 $this->debug("...$url");
416 $httpRequest->set_original_url($orig); 416 if (!$isRedirect && isset($this->requests[$url])) {
417 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); 417 $this->debug("......in memory");
418 $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? 418 /*
419 $pool->add($httpRequest); 419 } elseif ($this->isCached($url)) {
420 } 420 $this->debug("......is cached");
421 } 421 if (!$this->minimiseMemoryUse) {
422 // did we get anything into the pool? 422 $this->requests[$url] = $this->getCached($url);
423 if (count($pool) > 0) { 423 }
424 $this->debug('Sending request...'); 424 */
425 $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] 425 } else {
426 $this->debug('Received responses'); 426 $this->debug("......adding to pool");
427 foreach($subset as $orig => $url) { 427 $req_url = $this->rewriteUrls($url);
428 if (!$isRedirect) $orig = $url; 428 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
429 // $this->requests[$orig]['headers'] 429 $req_url = $this->removeFragment($req_url);
430 // $this->requests[$orig]['body'] 430 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
431 // $this->requests[$orig]['effective_url'] 431 $_meth = 'HEAD';
432 // check content type 432 } else {
433 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 433 $_meth = 'GET';
434 $this->requests[$orig]['body'] = ''; 434 unset($this->requests[$orig]['wrongGuess']);
435 $_header_only_type = true; 435 }
436 $this->debug('Header only type returned'); 436 $headers = array();
437 } else { 437 //$headers[] = 'User-Agent: '.$this->userAgent;
438 $_header_only_type = false; 438 $headers[] = $this->getUserAgent($req_url);
439 } 439 // add referer for picky sites
440 $status_code = $this->requests[$orig]['status_code']; 440 $headers[] = 'Referer: '.$this->referer;
441 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { 441 // send cookies, if we have any
442 $redirectURL = $this->requests[$orig]['location']; 442 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
443 if (!preg_match('!^https?://!i', $redirectURL)) { 443 $this->debug("......sending cookies: $cookies");
444 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 444 $headers[] = 'Cookie: '.$cookies;
445 } 445 }
446 if ($this->validateURL($redirectURL)) { 446 $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
447 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 447 CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
448 // store any cookies 448 CURLOPT_TIMEOUT => $this->requestOptions['timeout']
449 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); 449 ));
450 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); 450 $httpRequest->set_original_url($orig);
451 $this->redirectQueue[$orig] = $redirectURL; 451 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
452 } else { 452 $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
453 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 453 $pool->add($httpRequest);
454 } 454 }
455 } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { 455 }
456 // the response content-type did not match our 'header only' types, 456 // did we get anything into the pool?
457 // but we'd issues a HEAD request because we assumed it would. So 457 if (count($pool) > 0) {
458 // let's queue a proper GET request for this item... 458 $this->debug('Sending request...');
459 $this->debug('Wrong guess at content-type, queing GET request'); 459 $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
460 $this->requests[$orig]['wrongGuess'] = true; 460 $this->debug('Received responses');
461 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; 461 foreach($subset as $orig => $url) {
462 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 462 if (!$isRedirect) $orig = $url;
463 // check for <meta name='fragment' content='!'/> 463 // $this->requests[$orig]['headers']
464 // for AJAX sites, e.g. Blogger with its dynamic views templates. 464 // $this->requests[$orig]['body']
465 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 465 // $this->requests[$orig]['effective_url']
466 if (isset($this->requests[$orig]['body'])) { 466 // check content type
467 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 467 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
468 if ($redirectURL) { 468 $this->requests[$orig]['body'] = '';
469 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 469 $_header_only_type = true;
470 $this->redirectQueue[$orig] = $redirectURL; 470 $this->debug('Header only type returned');
471 } 471 } else {
472 } 472 $_header_only_type = false;
473 } 473 }
474 // die($url.' -multi- '.$request->getResponseInfo('effective_url')); 474 $status_code = $this->requests[$orig]['status_code'];
475 unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); 475 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
476 } 476 $redirectURL = $this->requests[$orig]['location'];
477 } 477 if (!preg_match('!^https?://!i', $redirectURL)) {
478 } 478 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
479 } 479 }
480 480 if ($this->validateURL($redirectURL)) {
481 ////////////////////////////////////////////////////// 481 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
482 // sequential (file_get_contents) 482 // store any cookies
483 else { 483 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
484 $this->debug('Starting sequential fetch (file_get_contents)'); 484 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
485 $this->debug('Processing set of '.count($urls)); 485 $this->redirectQueue[$orig] = $redirectURL;
486 foreach ($urls as $orig => $url) { 486 } else {
487 if (!$isRedirect) $orig = $url; 487 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
488 unset($this->redirectQueue[$orig]); 488 }
489 $this->debug("...$url"); 489 } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
490 if (!$isRedirect && isset($this->requests[$url])) { 490 // the response content-type did not match our 'header only' types,
491 $this->debug("......in memory"); 491 // but we'd issues a HEAD request because we assumed it would. So
492 /* 492 // let's queue a proper GET request for this item...
493 } elseif ($this->isCached($url)) { 493 $this->debug('Wrong guess at content-type, queing GET request');
494 $this->debug("......is cached"); 494 $this->requests[$orig]['wrongGuess'] = true;
495 if (!$this->minimiseMemoryUse) { 495 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
496 $this->requests[$url] = $this->getCached($url); 496 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
497 } 497 // check for <meta name='fragment' content='!'/>
498 */ 498 // for AJAX sites, e.g. Blogger with its dynamic views templates.
499 } else { 499 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
500 $this->debug("Sending request for $url"); 500 if (isset($this->requests[$orig]['body'])) {
501 $this->requests[$orig]['original_url'] = $orig; 501 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
502 $req_url = $this->rewriteUrls($url); 502 if ($redirectURL) {
503 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 503 $this->redirectQueue[$orig] = $redirectURL;
504 $req_url = $this->removeFragment($req_url); 504 }
505 // send cookies, if we have any 505 }
506 $httpContext = $this->httpContext; 506 }
507 $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; 507 // die($url.' -multi- '.$request->getResponseInfo('effective_url'));
508 // add referer for picky sites 508 unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
509 $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; 509 }
510 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 510 }
511 $this->debug("......sending cookies: $cookies"); 511 }
512 $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; 512 }
513 } 513
514 if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { 514 //////////////////////////////////////////////////////
515 $this->debug('Received response'); 515 // sequential (file_get_contents)
516 // get status code 516 else {
517 if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { 517 $this->debug('Starting sequential fetch (file_get_contents)');
518 $this->debug('Error: no status code found'); 518 $this->debug('Processing set of '.count($urls));
519 // TODO: handle error - no status code 519 foreach ($urls as $orig => $url) {
520 } else { 520 if (!$isRedirect) $orig = $url;
521 $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); 521 unset($this->redirectQueue[$orig]);
522 // check content type 522 $this->debug("...$url");
523 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 523 if (!$isRedirect && isset($this->requests[$url])) {
524 $this->requests[$orig]['body'] = ''; 524 $this->debug("......in memory");
525 } else { 525 /*
526 $this->requests[$orig]['body'] = $html; 526 } elseif ($this->isCached($url)) {
527 } 527 $this->debug("......is cached");
528 $this->requests[$orig]['effective_url'] = $req_url; 528 if (!$this->minimiseMemoryUse) {
529 $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; 529 $this->requests[$url] = $this->getCached($url);
530 unset($match); 530 }
531 // handle redirect 531 */
532 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { 532 } else {
533 $this->requests[$orig]['location'] = trim($match[1]); 533 $this->debug("Sending request for $url");
534 } 534 $this->requests[$orig]['original_url'] = $orig;
535 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { 535 $req_url = $this->rewriteUrls($url);
536 $redirectURL = $this->requests[$orig]['location']; 536 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
537 if (!preg_match('!^https?://!i', $redirectURL)) { 537 $req_url = $this->removeFragment($req_url);
538 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 538 // send cookies, if we have any
539 } 539 $httpContext = $this->httpContext;
540 if ($this->validateURL($redirectURL)) { 540 $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
541 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 541 // add referer for picky sites
542 // store any cookies 542 $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
543 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); 543 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
544 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); 544 $this->debug("......sending cookies: $cookies");
545 $this->redirectQueue[$orig] = $redirectURL; 545 $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
546 } else { 546 }
547 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 547 if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
548 } 548 $this->debug('Received response');
549 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 549 // get status code
550 // check for <meta name='fragment' content='!'/> 550 if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
551 // for AJAX sites, e.g. Blogger with its dynamic views templates. 551 $this->debug('Error: no status code found');
552 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 552 // TODO: handle error - no status code
553 if (isset($this->requests[$orig]['body'])) { 553 } else {
554 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 554 $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
555 if ($redirectURL) { 555 // check content type
556 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 556 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
557 $this->redirectQueue[$orig] = $redirectURL; 557 $this->requests[$orig]['body'] = '';
558 } 558 } else {
559 } 559 $this->requests[$orig]['body'] = $html;
560 } 560 }
561 } 561 $this->requests[$orig]['effective_url'] = $req_url;
562 } else { 562 $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
563 $this->debug('Error retrieving URL'); 563 unset($match);
564 //print_r($req_url); 564 // handle redirect
565 //print_r($http_response_header); 565 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
566 //print_r($html); 566 $this->requests[$orig]['location'] = trim($match[1]);
567 567 }
568 // TODO: handle error - failed to retrieve URL 568 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
569 } 569 $redirectURL = $this->requests[$orig]['location'];
570 } 570 if (!preg_match('!^https?://!i', $redirectURL)) {
571 } 571 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
572 } 572 }
573 } 573 if ($this->validateURL($redirectURL)) {
574 574 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
575 public function handleCurlResponse($response, $info, $request) { 575 // store any cookies
576 $orig = $request->url_original; 576 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
577 $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); 577 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
578 $this->requests[$orig]['body'] = substr($response, $info['header_size']); 578 $this->redirectQueue[$orig] = $redirectURL;
579 $this->requests[$orig]['method'] = $request->method; 579 } else {
580 $this->requests[$orig]['effective_url'] = $info['url']; 580 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
581 $this->requests[$orig]['status_code'] = (int)$info['http_code']; 581 }
582 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { 582 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
583 $this->requests[$orig]['location'] = trim($match[1]); 583 // check for <meta name='fragment' content='!'/>
584 } 584 // for AJAX sites, e.g. Blogger with its dynamic views templates.
585 } 585 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
586 586 if (isset($this->requests[$orig]['body'])) {
587 protected function headersToString(array $headers, $associative=true) { 587 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
588 if (!$associative) { 588 if ($redirectURL) {
589 return implode("\n", $headers); 589 $this->redirectQueue[$orig] = $redirectURL;
590 } else { 590 }
591 $str = ''; 591 }
592 foreach ($headers as $key => $val) { 592 }
593 if (is_array($val)) { 593 }
594 foreach ($val as $v) $str .= "$key: $v\n"; 594 } else {
595 } else { 595 $this->debug('Error retrieving URL');
596 $str .= "$key: $val\n"; 596 //print_r($req_url);
597 } 597 //print_r($http_response_header);
598 } 598 //print_r($html);
599 return rtrim($str); 599
600 } 600 // TODO: handle error - failed to retrieve URL
601 } 601 }
602 602 }
603 public function get($url, $remove=false, $gzdecode=true) { 603 }
604 $url = "$url"; 604 }
605 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { 605 }
606 $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); 606
607 $response = $this->requests[$url]; 607 public function handleCurlResponse($response, $info, $request) {
608 /* 608 $orig = $request->url_original;
609 } elseif ($this->isCached($url)) { 609 $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
610 $this->debug("URL already fetched - in disk cache ($url)"); 610 $this->requests[$orig]['body'] = substr($response, $info['header_size']);
611 $response = $this->getCached($url); 611 $this->requests[$orig]['method'] = $request->method;
612 $this->requests[$url] = $response; 612 $this->requests[$orig]['effective_url'] = $info['url'];
613 */ 613 $this->requests[$orig]['status_code'] = (int)$info['http_code'];
614 } else { 614 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
615 $this->debug("Fetching URL ($url)"); 615 $this->requests[$orig]['location'] = trim($match[1]);
616 $this->fetchAll(array($url)); 616 }
617 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { 617 }
618 $response = $this->requests[$url]; 618
619 } else { 619 protected function headersToString(array $headers, $associative=true) {
620 $this->debug("Request failed"); 620 if (!$associative) {
621 $response = false; 621 return implode("\n", $headers);
622 } 622 } else {
623 } 623 $str = '';
624 /* 624 foreach ($headers as $key => $val) {
625 if ($this->minimiseMemoryUse && $response) { 625 if (is_array($val)) {
626 $this->cache($url); 626 foreach ($val as $v) $str .= "$key: $v\n";
627 unset($this->requests[$url]); 627 } else {
628 } 628 $str .= "$key: $val\n";
629 */ 629 }
630 if ($remove && $response) unset($this->requests[$url]); 630 }
631 if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { 631 return rtrim($str);
632 if ($html = gzdecode($response['body'])) { 632 }
633 $response['body'] = $html; 633 }
634 } 634
635 } 635 public function get($url, $remove=false, $gzdecode=true) {
636 return $response; 636 $url = "$url";
637 } 637 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
638 638 $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
639 public function parallelSupport() { 639 $response = $this->requests[$url];
640 return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); 640 /*
641 } 641 } elseif ($this->isCached($url)) {
642 642 $this->debug("URL already fetched - in disk cache ($url)");
643 private function headerOnlyType($headers) { 643 $response = $this->getCached($url);
644 if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { 644 $this->requests[$url] = $response;
645 // look for full mime type (e.g. image/jpeg) or just type (e.g. image) 645 */
646 $match[1] = strtolower(trim($match[1])); 646 } else {
647 $match[2] = strtolower(trim($match[2])); 647 $this->debug("Fetching URL ($url)");
648 foreach (array($match[1], $match[2]) as $mime) { 648 $this->fetchAll(array($url));
649 if (in_array($mime, $this->headerOnlyTypes)) return true; 649 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
650 } 650 $response = $this->requests[$url];
651 } 651 } else {
652 return false; 652 $this->debug("Request failed");
653 } 653 $response = false;
654 654 }
655 private function possibleUnsupportedType($url) { 655 }
656 $path = @parse_url($url, PHP_URL_PATH); 656 /*
657 if ($path && strpos($path, '.') !== false) { 657 if ($this->minimiseMemoryUse && $response) {
658 $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); 658 $this->cache($url);
659 return in_array($ext, $this->headerOnlyClues); 659 unset($this->requests[$url]);
660 } 660 }
661 return false; 661 */
662 } 662 if ($remove && $response) unset($this->requests[$url]);
663} 663 if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {
664 664 if ($html = gzdecode($response['body'])) {
665// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 665 $response['body'] = $html;
666if (!function_exists('gzdecode')) { 666 }
667 function gzdecode($data,&$filename='',&$error='',$maxlength=null) 667 }
668 { 668 return $response;
669 $len = strlen($data); 669 }
670 if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { 670
671 $error = "Not in GZIP format."; 671 public function parallelSupport() {
672 return null; // Not GZIP format (See RFC 1952) 672 return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
673 } 673 }
674 $method = ord(substr($data,2,1)); // Compression method 674
675 $flags = ord(substr($data,3,1)); // Flags 675 private function headerOnlyType($headers) {
676 if ($flags & 31 != $flags) { 676 if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {
677 $error = "Reserved bits not allowed."; 677 // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
678 return null; 678 $match[1] = strtolower(trim($match[1]));
679 } 679 $match[2] = strtolower(trim($match[2]));
680 // NOTE: $mtime may be negative (PHP integer limitations) 680 foreach (array($match[1], $match[2]) as $mime) {
681 $mtime = unpack("V", substr($data,4,4)); 681 if (in_array($mime, $this->headerOnlyTypes)) return true;
682 $mtime = $mtime[1]; 682 }
683 $xfl = substr($data,8,1); 683 }
684 $os = substr($data,8,1); 684 return false;
685 $headerlen = 10; 685 }
686 $extralen = 0; 686
687 $extra = ""; 687 private function possibleUnsupportedType($url) {
688 if ($flags & 4) { 688 $path = @parse_url($url, PHP_URL_PATH);
689 // 2-byte length prefixed EXTRA data in header 689 if ($path && strpos($path, '.') !== false) {
690 if ($len - $headerlen - 2 < 8) { 690 $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
691 return false; // invalid 691 return in_array($ext, $this->headerOnlyClues);
692 } 692 }
693 $extralen = unpack("v",substr($data,8,2)); 693 return false;
694 $extralen = $extralen[1]; 694 }
695 if ($len - $headerlen - 2 - $extralen < 8) { 695}
696 return false; // invalid 696
697 } 697// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
698 $extra = substr($data,10,$extralen); 698if (!function_exists('gzdecode')) {
699 $headerlen += 2 + $extralen; 699 function gzdecode($data,&$filename='',&$error='',$maxlength=null)
700 } 700 {
701 $filenamelen = 0; 701 $len = strlen($data);
702 $filename = ""; 702 if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
703 if ($flags & 8) { 703 $error = "Not in GZIP format.";
704 // C-style string 704 return null; // Not GZIP format (See RFC 1952)
705 if ($len - $headerlen - 1 < 8) { 705 }
706 return false; // invalid 706 $method = ord(substr($data,2,1)); // Compression method
707 } 707 $flags = ord(substr($data,3,1)); // Flags
708 $filenamelen = strpos(substr($data,$headerlen),chr(0)); 708 if ($flags & 31 != $flags) {
709 if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { 709 $error = "Reserved bits not allowed.";
710 return false; // invalid 710 return null;
711 } 711 }
712 $filename = substr($data,$headerlen,$filenamelen); 712 // NOTE: $mtime may be negative (PHP integer limitations)
713 $headerlen += $filenamelen + 1; 713 $mtime = unpack("V", substr($data,4,4));
714 } 714 $mtime = $mtime[1];
715 $commentlen = 0; 715 $xfl = substr($data,8,1);
716 $comment = ""; 716 $os = substr($data,8,1);
717 if ($flags & 16) { 717 $headerlen = 10;
718 // C-style string COMMENT data in header 718 $extralen = 0;
719 if ($len - $headerlen - 1 < 8) { 719 $extra = "";
720 return false; // invalid 720 if ($flags & 4) {
721 } 721 // 2-byte length prefixed EXTRA data in header
722 $commentlen = strpos(substr($data,$headerlen),chr(0)); 722 if ($len - $headerlen - 2 < 8) {
723 if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { 723 return false; // invalid
724 return false; // Invalid header format 724 }
725 } 725 $extralen = unpack("v",substr($data,8,2));
726 $comment = substr($data,$headerlen,$commentlen); 726 $extralen = $extralen[1];
727 $headerlen += $commentlen + 1; 727 if ($len - $headerlen - 2 - $extralen < 8) {
728 } 728 return false; // invalid
729 $headercrc = ""; 729 }
730 if ($flags & 2) { 730 $extra = substr($data,10,$extralen);
731 // 2-bytes (lowest order) of CRC32 on header present 731 $headerlen += 2 + $extralen;
732 if ($len - $headerlen - 2 < 8) { 732 }
733 return false; // invalid 733 $filenamelen = 0;
734 } 734 $filename = "";
735 $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; 735 if ($flags & 8) {
736 $headercrc = unpack("v", substr($data,$headerlen,2)); 736 // C-style string
737 $headercrc = $headercrc[1]; 737 if ($len - $headerlen - 1 < 8) {
738 if ($headercrc != $calccrc) { 738 return false; // invalid
739 $error = "Header checksum failed."; 739 }
740 return false; // Bad header CRC 740 $filenamelen = strpos(substr($data,$headerlen),chr(0));
741 } 741 if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
742 $headerlen += 2; 742 return false; // invalid
743 } 743 }
744 // GZIP FOOTER 744 $filename = substr($data,$headerlen,$filenamelen);
745 $datacrc = unpack("V",substr($data,-8,4)); 745 $headerlen += $filenamelen + 1;
746 $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); 746 }
747 $isize = unpack("V",substr($data,-4)); 747 $commentlen = 0;
748 $isize = $isize[1]; 748 $comment = "";
749 // decompression: 749 if ($flags & 16) {
750 $bodylen = $len-$headerlen-8; 750 // C-style string COMMENT data in header
751 if ($bodylen < 1) { 751 if ($len - $headerlen - 1 < 8) {
752 // IMPLEMENTATION BUG! 752 return false; // invalid
753 return null; 753 }
754 } 754 $commentlen = strpos(substr($data,$headerlen),chr(0));
755 $body = substr($data,$headerlen,$bodylen); 755 if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
756 $data = ""; 756 return false; // Invalid header format
757 if ($bodylen > 0) { 757 }
758 switch ($method) { 758 $comment = substr($data,$headerlen,$commentlen);
759 case 8: 759 $headerlen += $commentlen + 1;
760 // Currently the only supported compression method: 760 }
761 $data = gzinflate($body,$maxlength); 761 $headercrc = "";
762 break; 762 if ($flags & 2) {
763 default: 763 // 2-bytes (lowest order) of CRC32 on header present
764 $error = "Unknown compression method."; 764 if ($len - $headerlen - 2 < 8) {
765 return false; 765 return false; // invalid
766 } 766 }
767 } // zero-byte body content is allowed 767 $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
768 // Verify CRC32 768 $headercrc = unpack("v", substr($data,$headerlen,2));
769 $crc = sprintf("%u",crc32($data)); 769 $headercrc = $headercrc[1];
770 $crcOK = $crc == $datacrc; 770 if ($headercrc != $calccrc) {
771 $lenOK = $isize == strlen($data); 771 $error = "Header checksum failed.";
772 if (!$lenOK || !$crcOK) { 772 return false; // Bad header CRC
773 $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); 773 }
774 return false; 774 $headerlen += 2;
775 } 775 }
776 return $data; 776 // GZIP FOOTER
777 } 777 $datacrc = unpack("V",substr($data,-8,4));
778} 778 $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
779?> \ No newline at end of file 779 $isize = unpack("V",substr($data,-4));
780 $isize = $isize[1];
781 // decompression:
782 $bodylen = $len-$headerlen-8;
783 if ($bodylen < 1) {
784 // IMPLEMENTATION BUG!
785 return null;
786 }
787 $body = substr($data,$headerlen,$bodylen);
788 $data = "";
789 if ($bodylen > 0) {
790 switch ($method) {
791 case 8:
792 // Currently the only supported compression method:
793 $data = gzinflate($body,$maxlength);
794 break;
795 default:
796 $error = "Unknown compression method.";
797 return false;
798 }
799 } // zero-byte body content is allowed
800 // Verify CRC32
801 $crc = sprintf("%u",crc32($data));
802 $crcOK = $crc == $datacrc;
803 $lenOK = $isize == strlen($data);
804 if (!$lenOK || !$crcOK) {
805 $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
806 return false;
807 }
808 return $data;
809 }
810} \ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
index ecd46d5f..c524a1ee 100644
--- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
@@ -1,79 +1,78 @@
1<?php 1<?php
2/** 2/**
3 * Humble HTTP Agent extension for SimplePie_File 3 * Humble HTTP Agent extension for SimplePie_File
4 * 4 *
5 * This class is designed to extend and override SimplePie_File 5 * This class is designed to extend and override SimplePie_File
6 * in order to prevent duplicate HTTP requests being sent out. 6 * in order to prevent duplicate HTTP requests being sent out.
7 * The idea is to initialise an instance of Humble HTTP Agent 7 * The idea is to initialise an instance of Humble HTTP Agent
8 * and attach it to a static class variable of this class. 8 * and attach it to a static class variable of this class.
9 * SimplePie will then automatically initialise this class 9 * SimplePie will then automatically initialise this class
10 * 10 *
11 * @date 2011-02-28 11 * @date 2011-02-28
12 */ 12 */
13 13
14class SimplePie_HumbleHttpAgent extends SimplePie_File 14class SimplePie_HumbleHttpAgent extends SimplePie_File
15{ 15{
16 protected static $agent; 16 protected static $agent;
17 var $url; 17 var $url;
18 var $useragent; 18 var $useragent;
19 var $success = true; 19 var $success = true;
20 var $headers = array(); 20 var $headers = array();
21 var $body; 21 var $body;
22 var $status_code; 22 var $status_code;
23 var $redirects = 0; 23 var $redirects = 0;
24 var $error; 24 var $error;
25 var $method = SIMPLEPIE_FILE_SOURCE_NONE; 25 var $method = SIMPLEPIE_FILE_SOURCE_NONE;
26 26
27 public static function set_agent(HumbleHttpAgent $agent) { 27 public static function set_agent(HumbleHttpAgent $agent) {
28 self::$agent = $agent; 28 self::$agent = $agent;
29 } 29 }
30 30
31 public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { 31 public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
32 if (class_exists('idna_convert')) 32 if (class_exists('idna_convert'))
33 { 33 {
34 $idn = new idna_convert(); 34 $idn = new idna_convert();
35 $parsed = SimplePie_Misc::parse_url($url); 35 $parsed = SimplePie_Misc::parse_url($url);
36 $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); 36 $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
37 } 37 }
38 $this->url = $url; 38 $this->url = $url;
39 $this->useragent = $useragent; 39 $this->useragent = $useragent;
40 if (preg_match('/^http(s)?:\/\//i', $url)) 40 if (preg_match('/^http(s)?:\/\//i', $url))
41 { 41 {
42 if (!is_array($headers)) 42 if (!is_array($headers))
43 { 43 {
44 $headers = array(); 44 $headers = array();
45 } 45 }
46 $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; 46 $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
47 $headers2 = array(); 47 $headers2 = array();
48 foreach ($headers as $key => $value) { 48 foreach ($headers as $key => $value) {
49 $headers2[] = "$key: $value"; 49 $headers2[] = "$key: $value";
50 } 50 }
51 //TODO: allow for HTTP headers 51 //TODO: allow for HTTP headers
52 // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); 52 // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);
53 53
54 $response = self::$agent->get($url); 54 $response = self::$agent->get($url);
55 55
56 if ($response === false || !isset($response['status_code'])) { 56 if ($response === false || !isset($response['status_code'])) {
57 $this->error = 'failed to fetch URL'; 57 $this->error = 'failed to fetch URL';
58 $this->success = false; 58 $this->success = false;
59 } else { 59 } else {
60 // The extra lines at the end are there to satisfy SimplePie's HTTP parser. 60 // The extra lines at the end are there to satisfy SimplePie's HTTP parser.
61 // The class expects a full HTTP message, whereas we're giving it only 61 // The class expects a full HTTP message, whereas we're giving it only
62 // headers - the new lines indicate the start of the body. 62 // headers - the new lines indicate the start of the body.
63 $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); 63 $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
64 if ($parser->parse()) { 64 if ($parser->parse()) {
65 $this->headers = $parser->headers; 65 $this->headers = $parser->headers;
66 //$this->body = $parser->body; 66 //$this->body = $parser->body;
67 $this->body = $response['body']; 67 $this->body = $response['body'];
68 $this->status_code = $parser->status_code; 68 $this->status_code = $parser->status_code;
69 } 69 }
70 } 70 }
71 } 71 }
72 else 72 else
73 { 73 {
74 $this->error = 'invalid URL'; 74 $this->error = 'invalid URL';
75 $this->success = false; 75 $this->success = false;
76 } 76 }
77 } 77 }
78} 78} \ No newline at end of file
79?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
index 09b11546..382d869c 100644
--- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
@@ -6,23 +6,24 @@
6 * Attempts to detect the language of a sample of text by correlating ranked 6 * Attempts to detect the language of a sample of text by correlating ranked
7 * 3-gram frequencies to a table of 3-gram frequencies of known languages. 7 * 3-gram frequencies to a table of 3-gram frequencies of known languages.
8 * 8 *
9 * Implements a version of a technique originally proposed by Cavnar & Trenkle 9 * Implements a version of a technique originally proposed by Cavnar & Trenkle
10 * (1994): "N-Gram-Based Text Categorization" 10 * (1994): "N-Gram-Based Text Categorization"
11 * 11 *
12 * PHP versions 4 and 5 12 * PHP version 5
13 * 13 *
14 * @category Text 14 * @category Text
15 * @package Text_LanguageDetect 15 * @package Text_LanguageDetect
16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
17 * @copyright 2005-2006 Nicholas Pisarro 17 * @copyright 2005-2006 Nicholas Pisarro
18 * @license http://www.debian.org/misc/bsd.license BSD 18 * @license http://www.debian.org/misc/bsd.license BSD
19 * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ 19 * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
20 * @link http://pear.php.net/package/Text_LanguageDetect/ 20 * @link http://pear.php.net/package/Text_LanguageDetect/
21 * @link http://langdetect.blogspot.com/ 21 * @link http://langdetect.blogspot.com/
22 */ 22 */
23 23
24//require_once 'PEAR.php'; 24require_once 'LanguageDetect/Exception.php';
25require_once 'Parser.php'; 25require_once 'LanguageDetect/Parser.php';
26require_once 'LanguageDetect/ISO639.php';
26 27
27/** 28/**
28 * Language detection class 29 * Language detection class
@@ -41,9 +42,10 @@ require_once 'Parser.php';
41 * 42 *
42 * echo "Supported languages:\n"; 43 * echo "Supported languages:\n";
43 * 44 *
44 * $langs = $l->getLanguages(); 45 * try {
45 * if (PEAR::isError($langs)) { 46 * $langs = $l->getLanguages();
46 * die($langs->getMessage()); 47 * } catch (Text_LanguageDetect_Exception $e) {
48 * die($e->getMessage());
47 * } 49 * }
48 * 50 *
49 * sort($langs); 51 * sort($langs);
@@ -54,38 +56,38 @@ require_once 'Parser.php';
54 * } 56 * }
55 * </code> 57 * </code>
56 * 58 *
57 * @category Text 59 * @category Text
58 * @package Text_LanguageDetect 60 * @package Text_LanguageDetect
59 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 61 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
60 * @copyright 2005 Nicholas Pisarro 62 * @copyright 2005 Nicholas Pisarro
61 * @license http://www.debian.org/misc/bsd.license BSD 63 * @license http://www.debian.org/misc/bsd.license BSD
62 * @version Release: @package_version@ 64 * @version Release: @package_version@
63 * @todo allow users to generate their own language models 65 * @link http://pear.php.net/package/Text_LanguageDetect/
66 * @todo allow users to generate their own language models
64 */ 67 */
65
66class Text_LanguageDetect 68class Text_LanguageDetect
67{ 69{
68 /** 70 /**
69 * The filename that stores the trigram data for the detector 71 * The filename that stores the trigram data for the detector
70 * 72 *
71 * If this value starts with a slash (/) or a dot (.) the value of 73 * If this value starts with a slash (/) or a dot (.) the value of
72 * $this->_data_dir will be ignored 74 * $this->_data_dir will be ignored
73 * 75 *
74 * @var string 76 * @var string
75 * @access private 77 * @access private
76 */ 78 */
77 var $_db_filename = './lang.dat'; 79 var $_db_filename = 'lang.dat';
78 80
79 /** 81 /**
80 * The filename that stores the unicode block definitions 82 * The filename that stores the unicode block definitions
81 * 83 *
82 * If this value starts with a slash (/) or a dot (.) the value of 84 * If this value starts with a slash (/) or a dot (.) the value of
83 * $this->_data_dir will be ignored 85 * $this->_data_dir will be ignored
84 * 86 *
85 * @var string 87 * @var string
86 * @access private 88 * @access private
87 */ 89 */
88 var $_unicode_db_filename = './unicode_blocks.dat'; 90 var $_unicode_db_filename = 'unicode_blocks.dat';
89 91
90 /** 92 /**
91 * The data directory 93 * The data directory
@@ -99,11 +101,8 @@ class Text_LanguageDetect
99 101
100 /** 102 /**
101 * The trigram data for comparison 103 * The trigram data for comparison
102 *
103 * Will be loaded on start from $this->_db_filename
104 * 104 *
105 * May be set to a PEAR_Error object if there is an error during its 105 * Will be loaded on start from $this->_db_filename
106 * initialization
107 * 106 *
108 * @var array 107 * @var array
109 * @access private 108 * @access private
@@ -120,7 +119,7 @@ class Text_LanguageDetect
120 119
121 /** 120 /**
122 * The size of the trigram data arrays 121 * The size of the trigram data arrays
123 * 122 *
124 * @var int 123 * @var int
125 * @access private 124 * @access private
126 */ 125 */
@@ -140,7 +139,7 @@ class Text_LanguageDetect
140 139
141 /** 140 /**
142 * Whether or not to simulate perl's Language::Guess exactly 141 * Whether or not to simulate perl's Language::Guess exactly
143 * 142 *
144 * @access private 143 * @access private
145 * @var bool 144 * @var bool
146 * @see setPerlCompatible() 145 * @see setPerlCompatible()
@@ -165,18 +164,24 @@ class Text_LanguageDetect
165 var $_clusters; 164 var $_clusters;
166 165
167 /** 166 /**
167 * Which type of "language names" are accepted and returned:
168 *
169 * 0 - language name ("english")
170 * 2 - 2-letter ISO 639-1 code ("en")
171 * 3 - 3-letter ISO 639-2 code ("eng")
172 */
173 var $_name_mode = 0;
174
175 /**
168 * Constructor 176 * Constructor
169 * 177 *
170 * Will attempt to load the language database. If it fails, you will get 178 * Will attempt to load the language database. If it fails, you will get
171 * a PEAR_Error object returned when you try to use detect() 179 * an exception.
172 *
173 */ 180 */
174 function Text_LanguageDetect($db=null, $unicode_db=null) 181 function __construct()
175 { 182 {
176 if (isset($db)) $this->_db_filename = $db;
177 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
178
179 $data = $this->_readdb($this->_db_filename); 183 $data = $this->_readdb($this->_db_filename);
184 $this->_checkTrigram($data['trigram']);
180 $this->_lang_db = $data['trigram']; 185 $this->_lang_db = $data['trigram'];
181 186
182 if (isset($data['trigram-unicodemap'])) { 187 if (isset($data['trigram-unicodemap'])) {
@@ -186,29 +191,32 @@ class Text_LanguageDetect
186 // Not yet implemented: 191 // Not yet implemented:
187 if (isset($data['trigram-clusters'])) { 192 if (isset($data['trigram-clusters'])) {
188 $this->_clusters = $data['trigram-clusters']; 193 $this->_clusters = $data['trigram-clusters'];
189 } 194 }
190 } 195 }
191 196
192 /** 197 /**
193 * Returns the path to the location of the database 198 * Returns the path to the location of the database
194 * 199 *
195 * @access private 200 * @param string $fname File name to load
196 * @return string expected path to the language model database 201 *
202 * @return string expected path to the language model database
203 * @access private
197 */ 204 */
198 function _get_data_loc($fname) 205 function _get_data_loc($fname)
199 { 206 {
200 return $fname; 207 return dirname(__FILE__).'/'.$fname;
201 } 208 }
202 209
203 /** 210 /**
204 * Loads the language trigram database from filename 211 * Loads the language trigram database from filename
205 * 212 *
206 * Trigram database should be a serialize()'d array 213 * Trigram database should be a serialize()'d array
207 * 214 *
208 * @access private 215 * @param string $fname the filename where the data is stored
209 * @param string $fname the filename where the data is stored 216 *
210 * @return array the language model data 217 * @return array the language model data
211 * @throws PEAR_Error 218 * @throws Text_LanguageDetect_Exception
219 * @access private
212 */ 220 */
213 function _readdb($fname) 221 function _readdb($fname)
214 { 222 {
@@ -217,79 +225,74 @@ class Text_LanguageDetect
217 225
218 // input check 226 // input check
219 if (!file_exists($fname)) { 227 if (!file_exists($fname)) {
220 throw new Exception('Language database does not exist.'); 228 throw new Text_LanguageDetect_Exception(
229 'Language database does not exist: ' . $fname,
230 Text_LanguageDetect_Exception::DB_NOT_FOUND
231 );
221 } elseif (!is_readable($fname)) { 232 } elseif (!is_readable($fname)) {
222 throw new Exception('Language database is not readable.'); 233 throw new Text_LanguageDetect_Exception(
234 'Language database is not readable: ' . $fname,
235 Text_LanguageDetect_Exception::DB_NOT_READABLE
236 );
223 } 237 }
224 238
225 if (function_exists('file_get_contents')) { 239 return unserialize(file_get_contents($fname));
226 return unserialize(file_get_contents($fname));
227 } else {
228 // if you don't have file_get_contents(),
229 // then this is the next fastest way
230 ob_start();
231 readfile($fname);
232 $contents = ob_get_contents();
233 ob_end_clean();
234 return unserialize($contents);
235 }
236 } 240 }
237 241
238 242
239 /** 243 /**
240 * Checks if this object is ready to detect languages 244 * Checks if this object is ready to detect languages
241 * 245 *
242 * @access private 246 * @param array $trigram Trigram data from database
243 * @param mixed &$err error object to be returned by reference, if any 247 *
244 * @return bool true if no errors 248 * @return void
249 * @access private
245 */ 250 */
246 function _setup_ok(&$err) 251 function _checkTrigram($trigram)
247 { 252 {
248 if (!is_array($this->_lang_db)) { 253 if (!is_array($trigram)) {
249 if (ini_get('magic_quotes_runtime')) { 254 if (ini_get('magic_quotes_runtime')) {
250 throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); 255 throw new Text_LanguageDetect_Exception(
251 } else { 256 'Error loading database. Try turning magic_quotes_runtime off.',
252 throw new Exception('Language database is not an array.'); 257 Text_LanguageDetect_Exception::MAGIC_QUOTES
258 );
253 } 259 }
254 return false; 260 throw new Text_LanguageDetect_Exception(
255 261 'Language database is not an array.',
256 } elseif (empty($this->_lang_db)) { 262 Text_LanguageDetect_Exception::DB_NOT_ARRAY
257 throw new Exception('Language database has no elements.'); 263 );
258 return false; 264 } elseif (empty($trigram)) {
259 265 throw new Text_LanguageDetect_Exception(
260 } else { 266 'Language database has no elements.',
261 return true; 267 Text_LanguageDetect_Exception::DB_EMPTY
268 );
262 } 269 }
263 } 270 }
264 271
265 /** 272 /**
266 * Omits languages 273 * Omits languages
267 * 274 *
268 * Pass this function the name of or an array of names of 275 * Pass this function the name of or an array of names of
269 * languages that you don't want considered 276 * languages that you don't want considered
270 * 277 *
271 * If you're only expecting a limited set of languages, this can greatly 278 * If you're only expecting a limited set of languages, this can greatly
272 * speed up processing 279 * speed up processing
273 * 280 *
274 * @access public 281 * @param mixed $omit_list language name or array of names to omit
275 * @param mixed $omit_list language name or array of names to omit 282 * @param bool $include_only if true will include (rather than
276 * @param bool $include_only if true will include (rather than 283 * exclude) only those in the list
277 * exclude) only those in the list 284 *
278 * @return int number of languages successfully deleted 285 * @return int number of languages successfully deleted
279 * @throws PEAR_Error 286 * @throws Text_LanguageDetect_Exception
280 */ 287 */
281 function omitLanguages($omit_list, $include_only = false) 288 public function omitLanguages($omit_list, $include_only = false)
282 { 289 {
283
284 // setup check
285 if (!$this->_setup_ok($err)) {
286 return $err;
287 }
288
289 $deleted = 0; 290 $deleted = 0;
290 291
291 // deleting the given languages 292 $omit_list = $this->_convertFromNameMode($omit_list);
293
292 if (!$include_only) { 294 if (!$include_only) {
295 // deleting the given languages
293 if (!is_array($omit_list)) { 296 if (!is_array($omit_list)) {
294 $omit_list = strtolower($omit_list); // case desensitize 297 $omit_list = strtolower($omit_list); // case desensitize
295 if (isset($this->_lang_db[$omit_list])) { 298 if (isset($this->_lang_db[$omit_list])) {
@@ -301,12 +304,12 @@ class Text_LanguageDetect
301 if (isset($this->_lang_db[$omit_lang])) { 304 if (isset($this->_lang_db[$omit_lang])) {
302 unset($this->_lang_db[$omit_lang]); 305 unset($this->_lang_db[$omit_lang]);
303 $deleted++; 306 $deleted++;
304 } 307 }
305 } 308 }
306 } 309 }
307 310
308 // deleting all except the given languages
309 } else { 311 } else {
312 // deleting all except the given languages
310 if (!is_array($omit_list)) { 313 if (!is_array($omit_list)) {
311 $omit_list = array($omit_list); 314 $omit_list = array($omit_list);
312 } 315 }
@@ -327,7 +330,7 @@ class Text_LanguageDetect
327 // reset the cluster cache if the number of languages changes 330 // reset the cluster cache if the number of languages changes
328 // this will then have to be recalculated 331 // this will then have to be recalculated
329 if (isset($this->_clusters) && $deleted > 0) { 332 if (isset($this->_clusters) && $deleted > 0) {
330 unset($this->_clusters); 333 $this->_clusters = null;
331 } 334 }
332 335
333 return $deleted; 336 return $deleted;
@@ -339,49 +342,40 @@ class Text_LanguageDetect
339 * 342 *
340 * @access public 343 * @access public
341 * @return int the number of languages 344 * @return int the number of languages
342 * @throws PEAR_Error 345 * @throws Text_LanguageDetect_Exception
343 */ 346 */
344 function getLanguageCount() 347 function getLanguageCount()
345 { 348 {
346 if (!$this->_setup_ok($err)) { 349 return count($this->_lang_db);
347 return $err;
348 } else {
349 return count($this->_lang_db);
350 }
351 } 350 }
352 351
353 /** 352 /**
354 * Returns true if a given language exists 353 * Checks if the language with the given name exists in the database
355 * 354 *
356 * If passed an array of names, will return true only if all exist 355 * @param mixed $lang Language name or array of language names
357 * 356 *
358 * @access public 357 * @return bool true if language model exists
359 * @param mixed $lang language name or array of language names
360 * @return bool true if language model exists
361 * @throws PEAR_Error
362 */ 358 */
363 function languageExists($lang) 359 public function languageExists($lang)
364 { 360 {
365 if (!$this->_setup_ok($err)) { 361 $lang = $this->_convertFromNameMode($lang);
366 return $err;
367 } else {
368 // string
369 if (is_string($lang)) {
370 return isset($this->_lang_db[strtolower($lang)]);
371
372 // array
373 } elseif (is_array($lang)) {
374 foreach ($lang as $test_lang) {
375 if (!isset($this->_lang_db[strtolower($test_lang)])) {
376 return false;
377 }
378 }
379 return true;
380 362
381 // other (error) 363 if (is_string($lang)) {
382 } else { 364 return isset($this->_lang_db[strtolower($lang)]);
383 throw new Exception('Unknown type passed to languageExists()'); 365
366 } elseif (is_array($lang)) {
367 foreach ($lang as $test_lang) {
368 if (!isset($this->_lang_db[strtolower($test_lang)])) {
369 return false;
370 }
384 } 371 }
372 return true;
373
374 } else {
375 throw new Text_LanguageDetect_Exception(
376 'Unsupported parameter type passed to languageExists()',
377 Text_LanguageDetect_Exception::PARAM_TYPE
378 );
385 } 379 }
386 } 380 }
387 381
@@ -389,25 +383,24 @@ class Text_LanguageDetect
389 * Returns the list of detectable languages 383 * Returns the list of detectable languages
390 * 384 *
391 * @access public 385 * @access public
392 * @return array the names of the languages known to this object 386 * @return array the names of the languages known to this object
393 * @throws PEAR_Error 387 * @throws Text_LanguageDetect_Exception
394 */ 388 */
395 function getLanguages() 389 function getLanguages()
396 { 390 {
397 if (!$this->_setup_ok($err)) { 391 return $this->_convertToNameMode(
398 return $err; 392 array_keys($this->_lang_db)
399 } else { 393 );
400 return array_keys($this->_lang_db);
401 }
402 } 394 }
403 395
404 /** 396 /**
405 * Make this object behave like Language::Guess 397 * Make this object behave like Language::Guess
406 * 398 *
407 * @access public 399 * @param bool $setting false to turn off perl compatibility
408 * @param bool $setting false to turn off perl compatibility 400 *
401 * @return void
409 */ 402 */
410 function setPerlCompatible($setting = true) 403 public function setPerlCompatible($setting = true)
411 { 404 {
412 if (is_bool($setting)) { // input check 405 if (is_bool($setting)) { // input check
413 $this->_perl_compatible = $setting; 406 $this->_perl_compatible = $setting;
@@ -422,6 +415,21 @@ class Text_LanguageDetect
422 } 415 }
423 416
424 /** 417 /**
418 * Sets how language names are accepted and returned.
419 *
420 * @param integer $name_mode One of the following modes:
421 * 0 - language name ("english")
422 * 2 - 2-letter ISO 639-1 code ("en")
423 * 3 - 3-letter ISO 639-2 code ("eng")
424 *
425 * @return void
426 */
427 function setNameMode($name_mode)
428 {
429 $this->_name_mode = $name_mode;
430 }
431
432 /**
425 * Whether to use unicode block ranges in detection 433 * Whether to use unicode block ranges in detection
426 * 434 *
427 * Should speed up most detections if turned on (default is on). In some 435 * Should speed up most detections if turned on (default is on). In some
@@ -429,10 +437,11 @@ class Text_LanguageDetect
429 * in languages that use latin scripts. In other cases it should speed up 437 * in languages that use latin scripts. In other cases it should speed up
430 * detection noticeably. 438 * detection noticeably.
431 * 439 *
432 * @access public 440 * @param bool $setting false to turn off
433 * @param bool $setting false to turn off 441 *
442 * @return void
434 */ 443 */
435 function useUnicodeBlocks($setting = true) 444 public function useUnicodeBlocks($setting = true)
436 { 445 {
437 if (is_bool($setting)) { 446 if (is_bool($setting)) {
438 $this->_use_unicode_narrowing = $setting; 447 $this->_use_unicode_narrowing = $setting;
@@ -442,15 +451,15 @@ class Text_LanguageDetect
442 /** 451 /**
443 * Converts a piece of text into trigrams 452 * Converts a piece of text into trigrams
444 * 453 *
445 * Superceded by the Text_LanguageDetect_Parser class 454 * @param string $text text to convert
446 * 455 *
447 * @access private 456 * @return array array of trigram frequencies
448 * @param string $text text to convert 457 * @access private
449 * @return array array of trigram frequencies 458 * @deprecated Superceded by the Text_LanguageDetect_Parser class
450 */ 459 */
451 function _trigram($text) 460 function _trigram($text)
452 { 461 {
453 $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); 462 $s = new Text_LanguageDetect_Parser($text);
454 $s->prepareTrigram(); 463 $s->prepareTrigram();
455 $s->prepareUnicode(false); 464 $s->prepareUnicode(false);
456 $s->setPadStart(!$this->_perl_compatible); 465 $s->setPadStart(!$this->_perl_compatible);
@@ -463,11 +472,12 @@ class Text_LanguageDetect
463 * 472 *
464 * Thresholds (cuts off) the list at $this->_threshold 473 * Thresholds (cuts off) the list at $this->_threshold
465 * 474 *
466 * @access protected 475 * @param array $arr array of trigram
467 * @param array $arr array of trgram 476 *
468 * @return array ranks of trigrams 477 * @return array ranks of trigrams
478 * @access protected
469 */ 479 */
470 function _arr_rank(&$arr) 480 function _arr_rank($arr)
471 { 481 {
472 482
473 // sorts alphabetically first as a standard way of breaking rank ties 483 // sorts alphabetically first as a standard way of breaking rank ties
@@ -494,14 +504,17 @@ class Text_LanguageDetect
494 504
495 /** 505 /**
496 * Sorts an array by value breaking ties alphabetically 506 * Sorts an array by value breaking ties alphabetically
497 * 507 *
498 * @access private 508 * @param array &$arr the array to sort
499 * @param array &$arr the array to sort 509 *
510 * @return void
511 * @access private
500 */ 512 */
501 function _bub_sort(&$arr) 513 function _bub_sort(&$arr)
502 { 514 {
503 // should do the same as this perl statement: 515 // should do the same as this perl statement:
504 // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } 516 // sort { $trigrams{$b} == $trigrams{$a}
517 // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
505 518
506 // needs to sort by both key and value at once 519 // needs to sort by both key and value at once
507 // using the key to break ties for the value 520 // using the key to break ties for the value
@@ -528,13 +541,14 @@ class Text_LanguageDetect
528 /** 541 /**
529 * Sort function used by bubble sort 542 * Sort function used by bubble sort
530 * 543 *
531 * Callback function for usort(). 544 * Callback function for usort().
532 * 545 *
533 * @access private 546 * @param array $a first param passed by usort()
534 * @param array first param passed by usort() 547 * @param array $b second param passed by usort()
535 * @param array second param passed by usort() 548 *
536 * @return int 1 if $a is greater, -1 if not 549 * @return int 1 if $a is greater, -1 if not
537 * @see _bub_sort() 550 * @see _bub_sort()
551 * @access private
538 */ 552 */
539 function _sort_func($a, $b) 553 function _sort_func($a, $b)
540 { 554 {
@@ -542,12 +556,12 @@ class Text_LanguageDetect
542 list($a_key, $a_value) = $a; 556 list($a_key, $a_value) = $a;
543 list($b_key, $b_value) = $b; 557 list($b_key, $b_value) = $b;
544 558
545 // if the values are the same, break ties using the key
546 if ($a_value == $b_value) { 559 if ($a_value == $b_value) {
560 // if the values are the same, break ties using the key
547 return strcmp($a_key, $b_key); 561 return strcmp($a_key, $b_key);
548 562
549 // if not, just sort normally
550 } else { 563 } else {
564 // if not, just sort normally
551 if ($a_value > $b_value) { 565 if ($a_value > $b_value) {
552 return -1; 566 return -1;
553 } else { 567 } else {
@@ -559,23 +573,24 @@ class Text_LanguageDetect
559 } 573 }
560 574
561 /** 575 /**
562 * Calculates a linear rank-order distance statistic between two sets of 576 * Calculates a linear rank-order distance statistic between two sets of
563 * ranked trigrams 577 * ranked trigrams
564 * 578 *
565 * Sums the differences in rank for each trigram. If the trigram does not 579 * Sums the differences in rank for each trigram. If the trigram does not
566 * appear in both, consider it a difference of $this->_threshold. 580 * appear in both, consider it a difference of $this->_threshold.
567 * 581 *
568 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite 582 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
569 * its simplicity it has been shown to be highly accurate for language 583 * its simplicity it has been shown to be highly accurate for language
570 * identification tasks. 584 * identification tasks.
571 * 585 *
572 * @access private 586 * @param array $arr1 the reference set of trigram ranks
573 * @param array $arr1 the reference set of trigram ranks 587 * @param array $arr2 the target set of trigram ranks
574 * @param array $arr2 the target set of trigram ranks 588 *
575 * @return int the sum of the differences between the ranks of 589 * @return int the sum of the differences between the ranks of
576 * the two trigram sets 590 * the two trigram sets
591 * @access private
577 */ 592 */
578 function _distance(&$arr1, &$arr2) 593 function _distance($arr1, $arr2)
579 { 594 {
580 $sumdist = 0; 595 $sumdist = 0;
581 596
@@ -598,14 +613,15 @@ class Text_LanguageDetect
598 613
599 /** 614 /**
600 * Normalizes the score returned by _distance() 615 * Normalizes the score returned by _distance()
601 * 616 *
602 * Different if perl compatible or not 617 * Different if perl compatible or not
603 * 618 *
604 * @access private 619 * @param int $score the score from _distance()
605 * @param int $score the score from _distance() 620 * @param int $base_count the number of trigrams being considered
606 * @param int $base_count the number of trigrams being considered 621 *
607 * @return float the normalized score 622 * @return float the normalized score
608 * @see _distance() 623 * @see _distance()
624 * @access private
609 */ 625 */
610 function _normalize_score($score, $base_count = null) 626 function _normalize_score($score, $base_count = null)
611 { 627 {
@@ -630,29 +646,24 @@ class Text_LanguageDetect
630 * 646 *
631 * If perl compatible, the score is 300-0, 0 being most similar. 647 * If perl compatible, the score is 300-0, 0 being most similar.
632 * Otherwise, it's 0-1 with 1 being most similar. 648 * Otherwise, it's 0-1 with 1 being most similar.
633 * 649 *
634 * The $sample text should be at least a few sentences in length; 650 * The $sample text should be at least a few sentences in length;
635 * should be ascii-7 or utf8 encoded, if another and the mbstring extension 651 * should be ascii-7 or utf8 encoded, if another and the mbstring extension
636 * is present it will try to detect and convert. However, experience has 652 * is present it will try to detect and convert. However, experience has
637 * shown that mb_detect_encoding() *does not work very well* with at least 653 * shown that mb_detect_encoding() *does not work very well* with at least
638 * some types of encoding. 654 * some types of encoding.
639 * 655 *
640 * @access public 656 * @param string $sample a sample of text to compare.
641 * @param string $sample a sample of text to compare. 657 * @param int $limit if specified, return an array of the most likely
642 * @param int $limit if specified, return an array of the most likely 658 * $limit languages and their scores.
643 * $limit languages and their scores. 659 *
644 * @return mixed sorted array of language scores, blank array if no 660 * @return mixed sorted array of language scores, blank array if no
645 * useable text was found, or PEAR_Error if error 661 * useable text was found
646 * with the object setup 662 * @see _distance()
647 * @see _distance() 663 * @throws Text_LanguageDetect_Exception
648 * @throws PEAR_Error
649 */ 664 */
650 function detect($sample, $limit = 0) 665 public function detect($sample, $limit = 0)
651 { 666 {
652 if (!$this->_setup_ok($err)) {
653 return $err;
654 }
655
656 // input check 667 // input check
657 if (!Text_LanguageDetect_Parser::validateString($sample)) { 668 if (!Text_LanguageDetect_Parser::validateString($sample)) {
658 return array(); 669 return array();
@@ -660,36 +671,27 @@ class Text_LanguageDetect
660 671
661 // check char encoding 672 // check char encoding
662 // (only if mbstring extension is compiled and PHP > 4.0.6) 673 // (only if mbstring extension is compiled and PHP > 4.0.6)
663 if (function_exists('mb_detect_encoding') 674 if (function_exists('mb_detect_encoding')
664 && function_exists('mb_convert_encoding')) { 675 && function_exists('mb_convert_encoding')
665 676 ) {
666 // mb_detect_encoding isn't very reliable, to say the least 677 // mb_detect_encoding isn't very reliable, to say the least
667 // detection should still work with a sufficient sample of ascii characters 678 // detection should still work with a sufficient sample
679 // of ascii characters
668 $encoding = mb_detect_encoding($sample); 680 $encoding = mb_detect_encoding($sample);
669 681
670 // mb_detect_encoding() will return FALSE if detection fails 682 // mb_detect_encoding() will return FALSE if detection fails
671 // don't attempt conversion if that's the case 683 // don't attempt conversion if that's the case
672 if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { 684 if ($encoding != 'ASCII' && $encoding != 'UTF-8'
673 685 && $encoding !== false
674 if (function_exists('mb_list_encodings')) { 686 ) {
675 687 // verify the encoding exists in mb_list_encodings
676 // verify the encoding exists in mb_list_encodings 688 if (in_array($encoding, mb_list_encodings())) {
677 if (in_array($encoding, mb_list_encodings())) { 689 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
678 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
679 }
680
681 // if the previous condition failed:
682 // somehow we detected an encoding that also we don't support
683
684 } else {
685 // php 4 doesnt have mb_list_encodings()
686 // so attempt with error suppression
687 $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);
688 } 690 }
689 } 691 }
690 } 692 }
691 693
692 $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); 694 $sample_obj = new Text_LanguageDetect_Parser($sample);
693 $sample_obj->prepareTrigram(); 695 $sample_obj->prepareTrigram();
694 if ($this->_use_unicode_narrowing) { 696 if ($this->_use_unicode_narrowing) {
695 $sample_obj->prepareUnicode(); 697 $sample_obj->prepareUnicode();
@@ -713,7 +715,10 @@ class Text_LanguageDetect
713 if (is_array($blocks)) { 715 if (is_array($blocks)) {
714 $present_blocks = array_keys($blocks); 716 $present_blocks = array_keys($blocks);
715 } else { 717 } else {
716 throw new Exception('Error during block detection'); 718 throw new Text_LanguageDetect_Exception(
719 'Error during block detection',
720 Text_LanguageDetect_Exception::BLOCK_DETECTION
721 );
717 } 722 }
718 723
719 $possible_langs = array(); 724 $possible_langs = array();
@@ -731,30 +736,30 @@ class Text_LanguageDetect
731 } 736 }
732 737
733 // could also try an intersect operation rather than a union 738 // could also try an intersect operation rather than a union
734 // in other words, choose languages whose trigrams contain 739 // in other words, choose languages whose trigrams contain
735 // ALL of the unicode blocks found in this sample 740 // ALL of the unicode blocks found in this sample
736 // would improve speed but would be completely thrown off by an 741 // would improve speed but would be completely thrown off by an
737 // unexpected character, like an umlaut appearing in english text 742 // unexpected character, like an umlaut appearing in english text
738 743
739 $possible_langs = array_intersect( 744 $possible_langs = array_intersect(
740 array_keys($this->_lang_db), 745 array_keys($this->_lang_db),
741 array_unique($possible_langs) 746 array_unique($possible_langs)
742 ); 747 );
743 748
744 // needs to intersect it with the keys of _lang_db in case 749 // needs to intersect it with the keys of _lang_db in case
745 // languages have been omitted 750 // languages have been omitted
746 751
747 // or just try 'em all
748 } else { 752 } else {
753 // or just try 'em all
749 $possible_langs = array_keys($this->_lang_db); 754 $possible_langs = array_keys($this->_lang_db);
750 } 755 }
751 756
752 757
753 foreach ($possible_langs as $lang) { 758 foreach ($possible_langs as $lang) {
754 $scores[$lang] = 759 $scores[$lang] = $this->_normalize_score(
755 $this->_normalize_score( 760 $this->_distance($this->_lang_db[$lang], $trigram_freqs),
756 $this->_distance($this->_lang_db[$lang], $trigram_freqs), 761 $trigram_count
757 $trigram_count); 762 );
758 } 763 }
759 764
760 unset($sample_obj); 765 unset($sample_obj);
@@ -772,7 +777,6 @@ class Text_LanguageDetect
772 $limited_scores = array(); 777 $limited_scores = array();
773 778
774 $i = 0; 779 $i = 0;
775
776 foreach ($scores as $key => $value) { 780 foreach ($scores as $key => $value) {
777 if ($i++ >= $limit) { 781 if ($i++ >= $limit) {
778 break; 782 break;
@@ -781,9 +785,9 @@ class Text_LanguageDetect
781 $limited_scores[$key] = $value; 785 $limited_scores[$key] = $value;
782 } 786 }
783 787
784 return $limited_scores; 788 return $this->_convertToNameMode($limited_scores, true);
785 } else { 789 } else {
786 return $scores; 790 return $this->_convertToNameMode($scores, true);
787 } 791 }
788 } 792 }
789 793
@@ -791,35 +795,33 @@ class Text_LanguageDetect
791 * Returns only the most similar language to the text sample 795 * Returns only the most similar language to the text sample
792 * 796 *
793 * Calls $this->detect() and returns only the top result 797 * Calls $this->detect() and returns only the top result
794 * 798 *
795 * @access public 799 * @param string $sample text to detect the language of
796 * @param string $sample text to detect the language of 800 *
797 * @return string the name of the most likely language 801 * @return string the name of the most likely language
798 * or null if no language is similar 802 * or null if no language is similar
799 * @see detect() 803 * @see detect()
800 * @throws PEAR_Error 804 * @throws Text_LanguageDetect_Exception
801 */ 805 */
802 function detectSimple($sample) 806 public function detectSimple($sample)
803 { 807 {
804 $scores = $this->detect($sample, 1); 808 $scores = $this->detect($sample, 1);
805 809
806 // if top language has the maximum possible score, 810 // if top language has the maximum possible score,
807 // then the top score will have been picked at random 811 // then the top score will have been picked at random
808 if ( !is_array($scores) 812 if (!is_array($scores) || empty($scores)
809 || empty($scores) 813 || current($scores) == $this->_max_score
810 || current($scores) == $this->_max_score) { 814 ) {
811
812 return null; 815 return null;
813
814 } else { 816 } else {
815 return ucfirst(key($scores)); 817 return key($scores);
816 } 818 }
817 } 819 }
818 820
819 /** 821 /**
820 * Returns an array containing the most similar language and a confidence 822 * Returns an array containing the most similar language and a confidence
821 * rating 823 * rating
822 * 824 *
823 * Confidence is a simple measure calculated from the similarity score 825 * Confidence is a simple measure calculated from the similarity score
824 * minus the similarity score from the next most similar language 826 * minus the similarity score from the next most similar language
825 * divided by the highest possible score. Languages that have closely 827 * divided by the highest possible score. Languages that have closely
@@ -827,46 +829,43 @@ class Text_LanguageDetect
827 * confidence scores. 829 * confidence scores.
828 * 830 *
829 * The similarity score answers the question "How likely is the text the 831 * The similarity score answers the question "How likely is the text the
830 * returned language regardless of the other languages considered?" The 832 * returned language regardless of the other languages considered?" The
831 * confidence score is one way of answering the question "how likely is the 833 * confidence score is one way of answering the question "how likely is the
832 * text the detected language relative to the rest of the language model 834 * text the detected language relative to the rest of the language model
833 * set?" 835 * set?"
834 * 836 *
835 * To see how similar languages are a priori, see languageSimilarity() 837 * To see how similar languages are a priori, see languageSimilarity()
836 * 838 *
837 * @access public 839 * @param string $sample text for which language will be detected
838 * @param string $sample text for which language will be detected 840 *
839 * @return array most similar language, score and confidence rating 841 * @return array most similar language, score and confidence rating
840 * or null if no language is similar 842 * or null if no language is similar
841 * @see detect() 843 * @see detect()
842 * @throws PEAR_Error 844 * @throws Text_LanguageDetect_Exception
843 */ 845 */
844 function detectConfidence($sample) 846 public function detectConfidence($sample)
845 { 847 {
846 $scores = $this->detect($sample, 2); 848 $scores = $this->detect($sample, 2);
847 849
848 // if most similar language has the max score, it 850 // if most similar language has the max score, it
849 // will have been picked at random 851 // will have been picked at random
850 if ( !is_array($scores) 852 if (!is_array($scores) || empty($scores)
851 || empty($scores) 853 || current($scores) == $this->_max_score
852 || current($scores) == $this->_max_score) { 854 ) {
853
854 return null; 855 return null;
855 } 856 }
856 857
857 $arr['language'] = ucfirst(key($scores)); 858 $arr['language'] = key($scores);
858 $arr['similarity'] = current($scores); 859 $arr['similarity'] = current($scores);
859 if (next($scores) !== false) { // if false then no next element 860 if (next($scores) !== false) { // if false then no next element
860 // the goal is to return a higher value if the distance between 861 // the goal is to return a higher value if the distance between
861 // the similarity of the first score and the second score is high 862 // the similarity of the first score and the second score is high
862 863
863 if ($this->_perl_compatible) { 864 if ($this->_perl_compatible) {
864 865 $arr['confidence'] = (current($scores) - $arr['similarity'])
865 $arr['confidence'] = 866 / $this->_max_score;
866 (current($scores) - $arr['similarity']) / $this->_max_score;
867 867
868 } else { 868 } else {
869
870 $arr['confidence'] = $arr['similarity'] - current($scores); 869 $arr['confidence'] = $arr['similarity'] - current($scores);
871 870
872 } 871 }
@@ -882,32 +881,26 @@ class Text_LanguageDetect
882 * Returns the distribution of unicode blocks in a given utf8 string 881 * Returns the distribution of unicode blocks in a given utf8 string
883 * 882 *
884 * For the block name of a single char, use unicodeBlockName() 883 * For the block name of a single char, use unicodeBlockName()
885 * 884 *
886 * @access public 885 * @param string $str input string. Must be ascii or utf8
887 * @param string $str input string. Must be ascii or utf8 886 * @param bool $skip_symbols if true, skip ascii digits, symbols and
888 * @param bool $skip_symbols if true, skip ascii digits, symbols and 887 * non-printing characters. Includes spaces,
889 * non-printing characters. Includes spaces, 888 * newlines and common punctutation characters.
890 * newlines and common punctutation characters. 889 *
891 * @return array 890 * @return array
892 * @throws PEAR_Error 891 * @throws Text_LanguageDetect_Exception
893 */ 892 */
894 function detectUnicodeBlocks($str, $skip_symbols) 893 public function detectUnicodeBlocks($str, $skip_symbols)
895 { 894 {
896 // input check 895 $skip_symbols = (bool)$skip_symbols;
897 if (!is_bool($skip_symbols)) { 896 $str = (string)$str;
898 throw new Exception('Second parameter must be boolean');
899 }
900
901 if (!is_string($str)) {
902 throw new Exception('First parameter was not a string');
903 }
904 897
905 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 898 $sample_obj = new Text_LanguageDetect_Parser($str);
906 $sample_obj->prepareUnicode(); 899 $sample_obj->prepareUnicode();
907 $sample_obj->prepareTrigram(false); 900 $sample_obj->prepareTrigram(false);
908 $sample_obj->setUnicodeSkipSymbols($skip_symbols); 901 $sample_obj->setUnicodeSkipSymbols($skip_symbols);
909 $sample_obj->analyze(); 902 $sample_obj->analyze();
910 $blocks =& $sample_obj->getUnicodeBlocks(); 903 $blocks = $sample_obj->getUnicodeBlocks();
911 unset($sample_obj); 904 unset($sample_obj);
912 return $blocks; 905 return $blocks;
913 } 906 }
@@ -915,38 +908,37 @@ class Text_LanguageDetect
915 /** 908 /**
916 * Returns the block name for a given unicode value 909 * Returns the block name for a given unicode value
917 * 910 *
918 * If passed a string, will assume it is being passed a UTF8-formatted 911 * If passed a string, will assume it is being passed a UTF8-formatted
919 * character and will automatically convert. Otherwise it will assume it 912 * character and will automatically convert. Otherwise it will assume it
920 * is being passed a numeric unicode value. 913 * is being passed a numeric unicode value.
921 * 914 *
922 * Make sure input is of the correct type! 915 * Make sure input is of the correct type!
923 * 916 *
924 * @access public
925 * @param mixed $unicode unicode value or utf8 char 917 * @param mixed $unicode unicode value or utf8 char
918 *
926 * @return mixed the block name string or false if not found 919 * @return mixed the block name string or false if not found
927 * @throws PEAR_Error 920 * @throws Text_LanguageDetect_Exception
928 */ 921 */
929 function unicodeBlockName($unicode) { 922 public function unicodeBlockName($unicode)
923 {
930 if (is_string($unicode)) { 924 if (is_string($unicode)) {
931 // assume it is being passed a utf8 char, so convert it 925 // assume it is being passed a utf8 char, so convert it
932 926 if (self::utf8strlen($unicode) > 1) {
933 // input check 927 throw new Text_LanguageDetect_Exception(
934 if ($this->utf8strlen($unicode) > 1) { 928 'Pass a single char only to this method',
935 throw new Exception('Pass this function only a single char'); 929 Text_LanguageDetect_Exception::PARAM_TYPE
930 );
936 } 931 }
937
938 $unicode = $this->_utf8char2unicode($unicode); 932 $unicode = $this->_utf8char2unicode($unicode);
939 933
940 if ($unicode == -1) {
941 throw new Exception('Malformatted char');
942 }
943
944 // input check
945 } elseif (!is_int($unicode)) { 934 } elseif (!is_int($unicode)) {
946 throw new Exception('Input must be of type string or int.'); 935 throw new Text_LanguageDetect_Exception(
936 'Input must be of type string or int.',
937 Text_LanguageDetect_Exception::PARAM_TYPE
938 );
947 } 939 }
948 940
949 $blocks =& $this->_read_unicode_block_db(); 941 $blocks = $this->_read_unicode_block_db();
950 942
951 $result = $this->_unicode_block_name($unicode, $blocks); 943 $result = $this->_unicode_block_name($unicode, $blocks);
952 944
@@ -964,14 +956,17 @@ class Text_LanguageDetect
964 * the public interface for this function, which does input checks which 956 * the public interface for this function, which does input checks which
965 * this function omits for speed. 957 * this function omits for speed.
966 * 958 *
967 * @access protected 959 * @param int $unicode the unicode value
968 * @param int $unicode the unicode value 960 * @param array $blocks the block database
969 * @param array &$blocks the block database 961 * @param int $block_count the number of defined blocks in the database
970 * @param int $block_count the number of defined blocks in the database 962 *
971 * @see unicodeBlockName() 963 * @return mixed Block name, -1 if it failed
964 * @see unicodeBlockName()
965 * @access protected
972 */ 966 */
973 function _unicode_block_name($unicode, &$blocks, $block_count = -1) { 967 function _unicode_block_name($unicode, $blocks, $block_count = -1)
974 // for a reference, see 968 {
969 // for a reference, see
975 // http://www.unicode.org/Public/UNIDATA/Blocks.txt 970 // http://www.unicode.org/Public/UNIDATA/Blocks.txt
976 971
977 // assume that ascii characters are the most common 972 // assume that ascii characters are the most common
@@ -994,35 +989,36 @@ class Text_LanguageDetect
994 while ($low <= $high) { 989 while ($low <= $high) {
995 $mid = floor(($low + $high) / 2); 990 $mid = floor(($low + $high) / 2);
996 991
997 // if it's lower than the lower bound
998 if ($unicode < $blocks[$mid][0]) { 992 if ($unicode < $blocks[$mid][0]) {
993 // if it's lower than the lower bound
999 $high = $mid - 1; 994 $high = $mid - 1;
1000 995
1001 // if it's higher than the upper bound
1002 } elseif ($unicode > $blocks[$mid][1]) { 996 } elseif ($unicode > $blocks[$mid][1]) {
997 // if it's higher than the upper bound
1003 $low = $mid + 1; 998 $low = $mid + 1;
1004 999
1005 // found it
1006 } else { 1000 } else {
1001 // found it
1007 return $blocks[$mid]; 1002 return $blocks[$mid];
1008 } 1003 }
1009 } 1004 }
1010 1005
1011 // failed to find the block 1006 // failed to find the block
1012 return -1; 1007 return -1;
1013 1008
1014 // todo: differentiate when it's out of range or when it falls 1009 // todo: differentiate when it's out of range or when it falls
1015 // into an unassigned range? 1010 // into an unassigned range?
1016 } 1011 }
1017 1012
1018 /** 1013 /**
1019 * Brings up the unicode block database 1014 * Brings up the unicode block database
1020 * 1015 *
1021 * @access protected
1022 * @return array the database of unicode block definitions 1016 * @return array the database of unicode block definitions
1023 * @throws PEAR_Error 1017 * @throws Text_LanguageDetect_Exception
1018 * @access protected
1024 */ 1019 */
1025 function &_read_unicode_block_db() { 1020 function _read_unicode_block_db()
1021 {
1026 // since the unicode definitions are always going to be the same, 1022 // since the unicode definitions are always going to be the same,
1027 // might as well share the memory for the db with all other instances 1023 // might as well share the memory for the db with all other instances
1028 // of this class 1024 // of this class
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect
1037 1033
1038 /** 1034 /**
1039 * Calculate the similarities between the language models 1035 * Calculate the similarities between the language models
1040 * 1036 *
1041 * Use this function to see how similar languages are to each other. 1037 * Use this function to see how similar languages are to each other.
1042 * 1038 *
1043 * If passed 2 language names, will return just those languages compared. 1039 * If passed 2 language names, will return just those languages compared.
1044 * If passed 1 language name, will return that language compared to 1040 * If passed 1 language name, will return that language compared to
1045 * all others. 1041 * all others.
1046 * If passed none, will return an array of every language model compared 1042 * If passed none, will return an array of every language model compared
1047 * to every other one. 1043 * to every other one.
1048 * 1044 *
1049 * @access public 1045 * @param string $lang1 the name of the first language to be compared
1050 * @param string $lang1 the name of the first language to be compared 1046 * @param string $lang2 the name of the second language to be compared
1051 * @param string $lang2 the name of the second language to be compared 1047 *
1052 * @return array scores of every language compared 1048 * @return array scores of every language compared
1053 * or the score of just the provided languages 1049 * or the score of just the provided languages
1054 * or null if one of the supplied languages does not exist 1050 * or null if one of the supplied languages does not exist
1055 * @throws PEAR_Error 1051 * @throws Text_LanguageDetect_Exception
1056 */ 1052 */
1057 function languageSimilarity($lang1 = null, $lang2 = null) 1053 public function languageSimilarity($lang1 = null, $lang2 = null)
1058 { 1054 {
1059 if (!$this->_setup_ok($err)) { 1055 $lang1 = $this->_convertFromNameMode($lang1);
1060 return $err; 1056 $lang2 = $this->_convertFromNameMode($lang2);
1061 }
1062
1063 if ($lang1 != null) { 1057 if ($lang1 != null) {
1064 $lang1 = strtolower($lang1); 1058 $lang1 = strtolower($lang1);
1065 1059
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect
1069 } 1063 }
1070 1064
1071 if ($lang2 != null) { 1065 if ($lang2 != null) {
1072 1066 if (!isset($this->_lang_db[$lang2])) {
1073 // can't only set the second param 1067 // check if language model exists
1074 if ($lang1 == null) {
1075 return null;
1076 // check if language model exists
1077 } elseif (!isset($this->_lang_db[$lang2])) {
1078 return null; 1068 return null;
1079 } 1069 }
1080 1070
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect
1088 ) 1078 )
1089 ); 1079 );
1090 1080
1091
1092 // compare just $lang1 to all languages
1093 } else { 1081 } else {
1082 // compare just $lang1 to all languages
1094 $return_arr = array(); 1083 $return_arr = array();
1095 foreach ($this->_lang_db as $key => $value) { 1084 foreach ($this->_lang_db as $key => $value) {
1096 if ($key != $lang1) { // don't compare a language to itself 1085 if ($key != $lang1) {
1086 // don't compare a language to itself
1097 $return_arr[$key] = $this->_normalize_score( 1087 $return_arr[$key] = $this->_normalize_score(
1098 $this->_distance($this->_lang_db[$lang1], $value)); 1088 $this->_distance($this->_lang_db[$lang1], $value)
1089 );
1099 } 1090 }
1100 } 1091 }
1101 asort($return_arr); 1092 asort($return_arr);
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect
1104 } 1095 }
1105 1096
1106 1097
1107 // compare all languages to each other
1108 } else { 1098 } else {
1099 // compare all languages to each other
1109 $return_arr = array(); 1100 $return_arr = array();
1110 foreach (array_keys($this->_lang_db) as $lang1) { 1101 foreach (array_keys($this->_lang_db) as $lang1) {
1111 foreach (array_keys($this->_lang_db) as $lang2) { 1102 foreach (array_keys($this->_lang_db) as $lang2) {
1112
1113 // skip comparing languages to themselves 1103 // skip comparing languages to themselves
1114 if ($lang1 != $lang2) { 1104 if ($lang1 != $lang2) {
1115
1116 // don't re-calculate what's already been done
1117 if (isset($return_arr[$lang2][$lang1])) {
1118 1105
1119 $return_arr[$lang1][$lang2] = 1106 if (isset($return_arr[$lang2][$lang1])) {
1120 $return_arr[$lang2][$lang1]; 1107 // don't re-calculate what's already been done
1108 $return_arr[$lang1][$lang2]
1109 = $return_arr[$lang2][$lang1];
1121 1110
1122 // calculate
1123 } else { 1111 } else {
1124 1112 // calculate
1125 $return_arr[$lang1][$lang2] = 1113 $return_arr[$lang1][$lang2]
1126 $this->_normalize_score( 1114 = $this->_normalize_score(
1127 $this->_distance( 1115 $this->_distance(
1128 $this->_lang_db[$lang1], 1116 $this->_lang_db[$lang1],
1129 $this->_lang_db[$lang2] 1117 $this->_lang_db[$lang2]
1130 ) 1118 )
1131 ); 1119 );
1132 1120
1133 } 1121 }
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect
1150 * 1138 *
1151 * @access public 1139 * @access public
1152 * @return array language cluster data 1140 * @return array language cluster data
1153 * @throws PEAR_Error 1141 * @throws Text_LanguageDetect_Exception
1154 * @see languageSimilarity() 1142 * @see languageSimilarity()
1155 * @deprecated this function will eventually be removed and placed into 1143 * @deprecated this function will eventually be removed and placed into
1156 * the model generation class 1144 * the model generation class
1157 */ 1145 */
1158 function clusterLanguages() 1146 function clusterLanguages()
1159 { 1147 {
1160 // todo: set the maximum number of clusters 1148 // todo: set the maximum number of clusters
1161
1162 // setup check
1163 if (!$this->_setup_ok($err)) {
1164 return $err;
1165 }
1166
1167 // return cached result, if any 1149 // return cached result, if any
1168 if (isset($this->_clusters)) { 1150 if (isset($this->_clusters)) {
1169 return $this->_clusters; 1151 return $this->_clusters;
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect
1177 1159
1178 foreach ($langs as $lang) { 1160 foreach ($langs as $lang) {
1179 if (!isset($this->_lang_db[$lang])) { 1161 if (!isset($this->_lang_db[$lang])) {
1180 throw new Exception("missing $lang!\n"); 1162 throw new Text_LanguageDetect_Exception(
1163 "missing $lang!",
1164 Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
1165 );
1181 } 1166 }
1182 } 1167 }
1183 1168
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect
1186 $langs[$lang1] = $lang1; 1171 $langs[$lang1] = $lang1;
1187 unset($langs[$old_key]); 1172 unset($langs[$old_key]);
1188 } 1173 }
1189 1174
1175 $result_data = $really_map = array();
1176
1190 $i = 0; 1177 $i = 0;
1191 while (count($langs) > 2 && $i++ < 200) { 1178 while (count($langs) > 2 && $i++ < 200) {
1192 $highest_score = -1; 1179 $highest_score = -1;
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect
1194 $highest_key2 = ''; 1181 $highest_key2 = '';
1195 foreach ($langs as $lang1) { 1182 foreach ($langs as $lang1) {
1196 foreach ($langs as $lang2) { 1183 foreach ($langs as $lang2) {
1197 if ( $lang1 != $lang2 1184 if ($lang1 != $lang2
1198 && $arr[$lang1][$lang2] > $highest_score) { 1185 && $arr[$lang1][$lang2] > $highest_score
1186 ) {
1199 $highest_score = $arr[$lang1][$lang2]; 1187 $highest_score = $arr[$lang1][$lang2];
1200 $highest_key1 = $lang1; 1188 $highest_key1 = $lang1;
1201 $highest_key2 = $lang2; 1189 $highest_key2 = $lang2;
1202 } 1190 }
1203 } 1191 }
1204 } 1192 }
1205 1193
1206 if (!$highest_key1) { 1194 if (!$highest_key1) {
1207 // should not ever happen 1195 // should not ever happen
1208 throw new Exception("no highest key? (step: $i)"); 1196 throw new Text_LanguageDetect_Exception(
1197 "no highest key? (step: $i)",
1198 Text_LanguageDetect_Exception::NO_HIGHEST_KEY
1199 );
1209 } 1200 }
1210 1201
1211 if ($highest_score == 0) { 1202 if ($highest_score == 0) {
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect
1217 $sum1 = array_sum($arr[$highest_key1]); 1208 $sum1 = array_sum($arr[$highest_key1]);
1218 $sum2 = array_sum($arr[$highest_key2]); 1209 $sum2 = array_sum($arr[$highest_key2]);
1219 1210
1220 // use the score for the one that is most similar to the rest of 1211 // use the score for the one that is most similar to the rest of
1221 // the field as the score for the group 1212 // the field as the score for the group
1222 // todo: could try averaging or "centroid" method instead 1213 // todo: could try averaging or "centroid" method instead
1223 // seems like that might make more sense 1214 // seems like that might make more sense
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect
1248 $really_lang = $replaceme; 1239 $really_lang = $replaceme;
1249 while (isset($really_map[$really_lang])) { 1240 while (isset($really_map[$really_lang])) {
1250 $really_lang = $really_map[$really_lang]; 1241 $really_lang = $really_map[$really_lang];
1251 } 1242 }
1252 $really_map[$newkey] = $really_lang; 1243 $really_map[$newkey] = $really_lang;
1253 1244
1254 1245
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect
1259 $arr[$key1][$newkey] = $arr[$key1][$key2]; 1250 $arr[$key1][$newkey] = $arr[$key1][$key2];
1260 unset($arr[$key1][$key2]); 1251 unset($arr[$key1][$key2]);
1261 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] 1252 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
1262 } 1253 }
1263 1254
1264 if ($key1 == $replaceme) { 1255 if ($key1 == $replaceme) {
1265 $arr[$newkey][$key2] = $arr[$key1][$key2]; 1256 $arr[$newkey][$key2] = $arr[$key1][$key2];
1266 unset($arr[$key1][$key2]); 1257 unset($arr[$key1][$key2]);
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect
1273 } 1264 }
1274 } 1265 }
1275 } 1266 }
1276 1267
1277 1268
1278 unset($langs[$highest_key1]); 1269 unset($langs[$highest_key1]);
1279 unset($langs[$highest_key2]); 1270 unset($langs[$highest_key2]);
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect
1293 } 1284 }
1294 1285
1295 $return_val = array( 1286 $return_val = array(
1296 'open_forks' => $langs, 1287 'open_forks' => $langs,
1297 // the top level of clusters 1288 // the top level of clusters
1298 // clusters that are mutually exclusive 1289 // clusters that are mutually exclusive
1299 // or specified by a specific maximum 1290 // or specified by a specific maximum
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect
1323 * use, and it may disappear or its functionality may change in future 1314 * use, and it may disappear or its functionality may change in future
1324 * releases without notice. 1315 * releases without notice.
1325 * 1316 *
1326 * This compares the sample text to the top level of clusters. If the 1317 * This compares the sample text to the top level of clusters. If the
1327 * sample is similar to the cluster it will drop down and compare it to the 1318 * sample is similar to the cluster it will drop down and compare it to the
1328 * languages in the cluster, and so on until it hits a leaf node. 1319 * languages in the cluster, and so on until it hits a leaf node.
1329 * 1320 *
1330 * this should find the language in considerably fewer compares 1321 * this should find the language in considerably fewer compares
1331 * (the equivalent of a binary search), however clusterLanguages() is costly 1322 * (the equivalent of a binary search), however clusterLanguages() is costly
1332 * and the loss of accuracy from this technique is significant. 1323 * and the loss of accuracy from this technique is significant.
1333 * 1324 *
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect
1337 * was very large, however in such cases some method of Bayesian inference 1328 * was very large, however in such cases some method of Bayesian inference
1338 * might be more helpful. 1329 * might be more helpful.
1339 * 1330 *
1340 * @see clusterLanguages() 1331 * @param string $str input string
1341 * @access public 1332 *
1342 * @param string $str input string 1333 * @return array language scores (only those compared)
1343 * @return array language scores (only those compared) 1334 * @throws Text_LanguageDetect_Exception
1344 * @throws PEAR_Error 1335 * @see clusterLanguages()
1345 */ 1336 */
1346 function clusteredSearch($str) 1337 public function clusteredSearch($str)
1347 { 1338 {
1348
1349 // input check 1339 // input check
1350 if (!Text_LanguageDetect_Parser::validateString($str)) { 1340 if (!Text_LanguageDetect_Parser::validateString($str)) {
1351 return array(); 1341 return array();
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect
1359 $dendogram_data = $result['fork_data']; 1349 $dendogram_data = $result['fork_data'];
1360 $dendogram_alias = $result['name_map']; 1350 $dendogram_alias = $result['name_map'];
1361 1351
1362 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 1352 $sample_obj = new Text_LanguageDetect_Parser($str);
1363 $sample_obj->prepareTrigram(); 1353 $sample_obj->prepareTrigram();
1364 $sample_obj->setPadStart(!$this->_perl_compatible); 1354 $sample_obj->setPadStart(!$this->_perl_compatible);
1365 $sample_obj->analyze(); 1355 $sample_obj->analyze();
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect
1372 } 1362 }
1373 1363
1374 $i = 0; // counts the number of steps 1364 $i = 0; // counts the number of steps
1375 1365
1376 foreach ($dendogram_start as $lang) { 1366 foreach ($dendogram_start as $lang) {
1377 if (isset($dendogram_alias[$lang])) { 1367 if (isset($dendogram_alias[$lang])) {
1378 $lang_key = $dendogram_alias[$lang]; 1368 $lang_key = $dendogram_alias[$lang];
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect
1382 1372
1383 $scores[$lang] = $this->_normalize_score( 1373 $scores[$lang] = $this->_normalize_score(
1384 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1374 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1385 $sample_count); 1375 $sample_count
1376 );
1386 1377
1387 $i++; 1378 $i++;
1388 } 1379 }
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect
1411 1402
1412 $scores[$lang] = $this->_normalize_score( 1403 $scores[$lang] = $this->_normalize_score(
1413 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1404 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1414 $sample_count); 1405 $sample_count
1406 );
1415 1407
1416 //todo: does not need to do same comparison again 1408 //todo: does not need to do same comparison again
1417 } 1409 }
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect
1428 1420
1429 $diff = $scores[$cur_key] - $scores[$loser_key]; 1421 $diff = $scores[$cur_key] - $scores[$loser_key];
1430 1422
1431 // $cur_key ({$dendogram_alias[$cur_key]}) wins 1423 // $cur_key ({$dendogram_alias[$cur_key]}) wins
1432 // over $loser_key ({$dendogram_alias[$loser_key]}) 1424 // over $loser_key ({$dendogram_alias[$loser_key]})
1433 // with a difference of $diff 1425 // with a difference of $diff
1434 } 1426 }
1435 1427
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect
1439 // which paths the algorithm decided to take along the tree 1431 // which paths the algorithm decided to take along the tree
1440 1432
1441 // but sometimes the last item is only the second highest 1433 // but sometimes the last item is only the second highest
1442 if ( ($this->_perl_compatible && (end($scores) > prev($scores))) 1434 if (($this->_perl_compatible && (end($scores) > prev($scores)))
1443 || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { 1435 || (!$this->_perl_compatible && (end($scores) < prev($scores)))
1444 1436 ) {
1445 $real_last_score = current($scores); 1437 $real_last_score = current($scores);
1446 $real_last_key = key($scores); 1438 $real_last_key = key($scores);
1447 1439
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect
1449 unset($scores[$real_last_key]); 1441 unset($scores[$real_last_key]);
1450 $scores[$real_last_key] = $real_last_score; 1442 $scores[$real_last_key] = $real_last_score;
1451 } 1443 }
1452 1444
1453 1445
1454 if (!$this->_perl_compatible) { 1446 if (!$this->_perl_compatible) {
1455 $scores = array_reverse($scores, true); 1447 $scores = array_reverse($scores, true);
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect
1464 * 1456 *
1465 * Returns the numbers of characters (not bytes) in a utf8 string 1457 * Returns the numbers of characters (not bytes) in a utf8 string
1466 * 1458 *
1467 * @static 1459 * @param string $str string to get the length of
1468 * @access public 1460 *
1469 * @param string $str string to get the length of 1461 * @return int number of chars
1470 * @return int number of chars
1471 */ 1462 */
1472 function utf8strlen($str) 1463 public static function utf8strlen($str)
1473 { 1464 {
1474 // utf8_decode() will convert unknown chars to '?', which is actually 1465 // utf8_decode() will convert unknown chars to '?', which is actually
1475 // ideal for counting. 1466 // ideal for counting.
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect
1482 /** 1473 /**
1483 * Returns the unicode value of a utf8 char 1474 * Returns the unicode value of a utf8 char
1484 * 1475 *
1485 * @access protected 1476 * @param string $char a utf8 (possibly multi-byte) char
1486 * @param string $char a utf8 (possibly multi-byte) char 1477 *
1487 * @return int unicode value or -1 if malformatted 1478 * @return int unicode value
1479 * @access protected
1480 * @link http://en.wikipedia.org/wiki/UTF-8
1488 */ 1481 */
function _utf8char2unicode($char)
{
    // Decodes one UTF-8 encoded character (1 to 4 bytes) into its Unicode
    // code point. Returns an int for 1-4 byte input and null for any other
    // byte length (empty string or more than four bytes).
    //
    // Bytes are read with square-bracket offsets: the curly-brace form
    // ($char{0}) is deprecated since PHP 7.4 and a parse error in PHP 8.

    // strlen() counts bytes, so for a single character this is the length
    // of its UTF-8 encoding.
    switch (strlen($char)) {
    case 1:
        // normal ASCII-7 byte
        // 0xxxxxxx --> 0xxxxxxx
        return ord($char[0]);

    case 2:
        // 2 byte unicode
        // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
        $z = (ord($char[0]) & 0x000001F) << 6;
        $x = (ord($char[1]) & 0x0000003F);
        return ($z | $x);

    case 3:
        // 3 byte unicode
        // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
        $z  = (ord($char[0]) & 0x0000000F) << 12;
        $x1 = (ord($char[1]) & 0x0000003F) << 6;
        $x2 = (ord($char[2]) & 0x0000003F);
        return ($z | $x1 | $x2);

    case 4:
        // 4 byte unicode
        // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
        // 000zzzzz xxxxxxxx xxxxxxxx
        $z1 = (ord($char[0]) & 0x00000007) << 18;
        $z2 = (ord($char[1]) & 0x0000003F) << 12;
        $x1 = (ord($char[2]) & 0x0000003F) << 6;
        $x2 = (ord($char[3]) & 0x0000003F);
        return ($z1 | $z2 | $x1 | $x2);
    }

    // Any other byte length cannot be decoded here. The switch has no
    // default branch, so the result is null; it is spelled out so the
    // non-int return is visible to callers.
    return null;
}
1534 1517
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect
1536 * utf8-safe fast character iterator 1519 * utf8-safe fast character iterator
1537 * 1520 *
1538 * Will get the next character starting from $counter, which will then be 1521 * Will get the next character starting from $counter, which will then be
1539 * incremented. If a multi-byte char the bytes will be concatenated and 1522 * incremented. If a multi-byte char the bytes will be concatenated and
1540 * $counter will be incremented by the number of bytes in the char. 1523 * $counter will be incremented by the number of bytes in the char.
1541 * 1524 *
1542 * @access private 1525 * @param string $str the string being iterated over
1543 * @param string &$str the string being iterated over 1526 * @param int &$counter the iterator, will increment by reference
1544 * @param int &$counter the iterator, will increment by reference 1527 * @param bool $special_convert whether to do special conversions
1545 * @param bool $special_convert whether to do special conversions 1528 *
1546 * @return char the next (possibly multi-byte) char from $counter 1529 * @return char the next (possibly multi-byte) char from $counter
1530 * @access private
1547 */ 1531 */
1548 function _next_char(&$str, &$counter, $special_convert = false) 1532 static function _next_char($str, &$counter, $special_convert = false)
1549 { 1533 {
1550
1551 $char = $str{$counter++}; 1534 $char = $str{$counter++};
1552 $ord = ord($char); 1535 $ord = ord($char);
1553 1536
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect
1556 1539
1557 // normal ascii one byte char 1540 // normal ascii one byte char
1558 if ($ord <= 127) { 1541 if ($ord <= 127) {
1559
1560 // special conversions needed for this package 1542 // special conversions needed for this package
1561 // (that only apply to regular ascii characters) 1543 // (that only apply to regular ascii characters)
1562 // lower case, and convert all non-alphanumeric characters 1544 // lower case, and convert all non-alphanumeric characters
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect
1571 1553
1572 return $char; 1554 return $char;
1573 1555
1574 // multi-byte chars
1575 } elseif ($ord >> 5 == 6) { // two-byte char 1556 } elseif ($ord >> 5 == 6) { // two-byte char
1557 // multi-byte chars
1576 $nextchar = $str{$counter++}; // get next byte 1558 $nextchar = $str{$counter++}; // get next byte
1577 1559
1578 // lower-casing of non-ascii characters is still incomplete 1560 // lower-casing of non-ascii characters is still incomplete
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect
1582 if ($ord == 195) { 1564 if ($ord == 195) {
1583 $nextord = ord($nextchar); 1565 $nextord = ord($nextchar);
1584 $nextord_adj = $nextord + 64; 1566 $nextord_adj = $nextord + 64;
1585 // for a reference, see 1567 // for a reference, see
1586 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html 1568 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
1587 1569
1588 // &Agrave; - &THORN; but not &times; 1570 // &Agrave; - &THORN; but not &times;
1589 if ( $nextord_adj >= 192 1571 if ($nextord_adj >= 192
1590 && $nextord_adj <= 222 1572 && $nextord_adj <= 222
1591 && $nextord_adj != 215) { 1573 && $nextord_adj != 215
1592 1574 ) {
1593 $nextchar = chr($nextord + 32); 1575 $nextchar = chr($nextord + 32);
1594 } 1576 }
1595 1577
1596 // lower case cyrillic alphabet
1597 } elseif ($ord == 208) { 1578 } elseif ($ord == 208) {
1579 // lower case cyrillic alphabet
1598 $nextord = ord($nextchar); 1580 $nextord = ord($nextchar);
1599 // if A - Pe 1581 // if A - Pe
1600 if ($nextord >= 144 && $nextord <= 159) { 1582 if ($nextord >= 144 && $nextord <= 159) {
1601 // lower case 1583 // lower case
1602 $nextchar = chr($nextord + 32); 1584 $nextchar = chr($nextord + 32);
1603 1585
1604 // if Er - Ya
1605 } elseif ($nextord >= 160 && $nextord <= 175) { 1586 } elseif ($nextord >= 160 && $nextord <= 175) {
1587 // if Er - Ya
1606 // lower case 1588 // lower case
1607 $char = chr(209); // == $ord++ 1589 $char = chr(209); // == $ord++
1608 $nextchar = chr($nextord - 32); 1590 $nextchar = chr($nextord - 32);
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect
1611 } 1593 }
1612 1594
1613 // tag on next byte 1595 // tag on next byte
1614 return $char . $nextchar; 1596 return $char . $nextchar;
1615
1616 } elseif ($ord >> 4 == 14) { // three-byte char 1597 } elseif ($ord >> 4 == 14) { // three-byte char
1617 1598
1618 // tag on next 2 bytes 1599 // tag on next 2 bytes
1619 return $char . $str{$counter++} . $str{$counter++}; 1600 return $char . $str{$counter++} . $str{$counter++};
1620 1601
1621 } elseif ($ord >> 3 == 30) { // four-byte char 1602 } elseif ($ord >> 3 == 30) { // four-byte char
1622 1603
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect
1628 } 1609 }
1629 } 1610 }
1630 1611
1631} 1612 /**
1613 * Converts a $lang input parameter from the configured mode
1614 * to the language name that is used internally.
1615 *
1616 * Works for strings and arrays.
1617 *
1618 * @param string|array $lang A language description ("english"/"en"/"eng")
1619 * @param boolean $convertKey If $lang is an array, setting $convertKey
1620 * converts the keys to the language name.
1621 *
1622 * @return string|array Language name
1623 */
function _convertFromNameMode($lang, $convertKey = false)
{
    // Mode 0: the caller's value is passed through untouched.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Mode 2 selects the two-letter lookup; every other non-zero mode
    // uses the three-letter lookup.
    $lookup = ($this->_name_mode == 2) ? 'code2ToName' : 'code3ToName';

    // A single code: translate it directly. The (string) cast turns an
    // unknown code (null from the lookup) into an empty string.
    if (is_string($lang)) {
        return (string)Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // A list/map: translate either the keys or the values, never both.
    $converted = array();
    foreach ($lang as $code => $item) {
        if (!$convertKey) {
            $converted[$code] = (string)Text_LanguageDetect_ISO639::$lookup($item);
            continue;
        }

        $name = (string)Text_LanguageDetect_ISO639::$lookup($code);
        $converted[$name] = $item;
    }

    return $converted;
}
1632 1651
1633/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 1652 /**
1653 * Converts a $lang output parameter from the language name that is
1654 * used internally to the configured mode.
1655 *
1656 * Works for strings and arrays.
1657 *
1658 * @param string|array $lang A language description ("english"/"en"/"eng")
1659 * @param boolean $convertKey If $lang is an array, setting $convertKey
1660 * converts the keys to the configured mode.
1661 *
1662 * @return string|array Language name or code in the configured mode
1663 */
function _convertToNameMode($lang, $convertKey = false)
{
    // Mode 0: internal language names are returned as they are.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Mode 2 selects the two-letter lookup; every other non-zero mode
    // uses the three-letter lookup.
    $lookup = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    // A single name: translate it directly (the lookup result is returned
    // uncast, so an unknown name yields null).
    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // A list/map: translate either the keys or the values, never both.
    $converted = array();
    foreach ($lang as $name => $item) {
        if (!$convertKey) {
            $converted[$name] = Text_LanguageDetect_ISO639::$lookup($item);
            continue;
        }

        $code = Text_LanguageDetect_ISO639::$lookup($name);
        $converted[$code] = $item;
    }

    return $converted;
}
1691}
1634 1692
1635?> 1693/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
new file mode 100644
index 00000000..196d994f
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
1<?php
class Text_LanguageDetect_Exception extends Exception
{
    // These constants are meant to be passed as the exception code
    // (second constructor argument) alongside the message.

    // ---- 10-14: language database ----

    /** The database file could not be found. */
    const DB_NOT_FOUND = 10;

    /** The database file was found but cannot be read. */
    const DB_NOT_READABLE = 11;

    /** The database file is empty. */
    const DB_EMPTY = 12;

    /** The database contents are not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are activated. */
    const MAGIC_QUOTES = 14;

    // ---- 20-21: method parameters ----

    /** A parameter of an invalid type was passed to a method. */
    const PARAM_TYPE = 20;

    /** A parameter contains an invalid character. */
    const INVALID_CHAR = 21;

    // ---- 30: languages ----

    /** The requested language is not in the database. */
    const UNKNOWN_LANGUAGE = 30;

    // ---- 40: unicode blocks ----

    /** An error occurred during block detection. */
    const BLOCK_DETECTION = 40;

    // ---- 50: clustering ----

    /** An error occurred while clustering languages. */
    const NO_HIGHEST_KEY = 50;
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
new file mode 100644
index 00000000..05b0590d
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
1<?php
2/**
3 * Part of Text_LanguageDetect
4 *
5 * PHP version 5
6 *
7 * @category Text
8 * @package Text_LanguageDetect
9 * @author Christian Weiske <cweiske@php.net>
10 * @copyright 2011 Christian Weiske <cweiske@php.net>
11 * @license http://www.debian.org/misc/bsd.license BSD
12 * @version SVN: $Id$
13 * @link http://pear.php.net/package/Text_LanguageDetect/
14 */
15
16/**
17 * Provides a mapping between the languages from lang.dat and the
18 * ISO 639-1 and ISO-639-2 codes.
19 *
20 * Note that this class contains only languages that exist in lang.dat.
21 *
22 * @category Text
23 * @package Text_LanguageDetect
24 * @author Christian Weiske <cweiske@php.net>
25 * @copyright 2011 Christian Weiske <cweiske@php.net>
26 * @license http://www.debian.org/misc/bsd.license BSD
27 * @link http://www.loc.gov/standards/iso639-2/php/code_list.php
28 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Must stay the exact inverse of $nameToCode3 so that a code produced
     * by nameToCode3() can be resolved again by code3ToName().
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // 'ron' matches $nameToCode3['romanian']; the previous key 'rom'
        // is the ISO 639-2 code for Romany and broke the round trip.
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Two-letter language code (e.g. "sv"), or NULL if
     *                     the name is unknown or has no 2-letter code
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);
        // isset() is also false for the explicit NULL entries, so languages
        // without a 2-letter code take this branch as well.
        if (!isset(self::$nameToCode2[$lang])) {
            return null;
        }
        return self::$nameToCode2[$lang];
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Three-letter language code (e.g. "swe"), or NULL
     *                     if the name is unknown
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode3[$lang])) {
            return null;
        }
        return self::$nameToCode3[$lang];
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Two-letter language code (e.g. "sv")
     *
     * @return string|null English language name like "swedish", or NULL if
     *                     the code is unknown
     */
    public static function code2ToName($code)
    {
        // Look up the lower-cased code; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code2ToName[$code])) {
            return null;
        }
        return self::$code2ToName[$code];
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Three-letter language code (e.g. "swe")
     *
     * @return string|null English language name like "swedish", or NULL if
     *                     the code is unknown
     */
    public static function code3ToName($code)
    {
        // Look up the lower-cased code; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code3ToName[$code])) {
            return null;
        }
        return self::$code3ToName[$code];
    }
}
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
index 7f15fa98..fb0e1e20 100644
--- a/inc/3rdparty/libraries/language-detect/Parser.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -8,7 +8,7 @@
8 * @author Nicholas Pisarro 8 * @author Nicholas Pisarro
9 * @copyright 2006 9 * @copyright 2006
10 * @license BSD 10 * @license BSD
11 * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ 11 * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
12 * @link http://pear.php.net/package/Text_LanguageDetect/ 12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 * @link http://langdetect.blogspot.com/ 13 * @link http://langdetect.blogspot.com/
14 */ 14 */
@@ -28,7 +28,7 @@
28 * @author Nicholas Pisarro 28 * @author Nicholas Pisarro
29 * @copyright 2006 29 * @copyright 2006
30 * @license BSD 30 * @license BSD
31 * @version release: 0.2.3 31 * @version release: 0.3.0
32 */ 32 */
33class Text_LanguageDetect_Parser extends Text_LanguageDetect 33class Text_LanguageDetect_Parser extends Text_LanguageDetect
34{ 34{
@@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
102 * @access private 102 * @access private
103 * @param string $string string to be parsed 103 * @param string $string string to be parsed
104 */ 104 */
105 function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) { 105 function Text_LanguageDetect_Parser($string) {
106 if (isset($db)) $this->_db_filename = $db;
107 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
108 $this->_string = $string; 106 $this->_string = $string;
109 } 107 }
110 108
111 /** 109 /**
112 * Returns true if a string is suitable for parsing 110 * Returns true if a string is suitable for parsing
113 * 111 *
114 * @static
115 * @access public
116 * @param string $str input string to test 112 * @param string $str input string to test
117 * @return bool true if acceptable, false if not 113 * @return bool true if acceptable, false if not
118 */ 114 */
119 function validateString($str) { 115 public static function validateString($str) {
120 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { 116 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
121 return true; 117 return true;
122 } else { 118 } else {
@@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
222 218
223 // unicode startup 219 // unicode startup
224 if ($this->_compile_unicode) { 220 if ($this->_compile_unicode) {
225 $blocks =& $this->_read_unicode_block_db(); 221 $blocks = $this->_read_unicode_block_db();
226
227 $block_count = count($blocks); 222 $block_count = count($blocks);
228 223
229 $skipped_count = 0; 224 $skipped_count = 0;
@@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
349 } 344 }
350} 345}
351 346
352/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 347/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
353
354?>
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php
index 2e8991cc..d0f09d74 100644
--- a/inc/3rdparty/libraries/readability/Readability.php
+++ b/inc/3rdparty/libraries/readability/Readability.php
@@ -1,1138 +1,1138 @@
1<?php 1<?php
2/** 2/**
3* Arc90's Readability ported to PHP for FiveFilters.org 3* Arc90's Readability ported to PHP for FiveFilters.org
4* Based on readability.js version 1.7.1 (without multi-page support) 4* Based on readability.js version 1.7.1 (without multi-page support)
5* Updated to allow HTML5 parsing with html5lib 5* Updated to allow HTML5 parsing with html5lib
6* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds 6* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
7* ------------------------------------------------------ 7* ------------------------------------------------------
8* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js 8* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
9* Arc90's project URL: http://lab.arc90.com/experiments/readability/ 9* Arc90's project URL: http://lab.arc90.com/experiments/readability/
10* JS Source: http://code.google.com/p/arc90labs-readability 10* JS Source: http://code.google.com/p/arc90labs-readability
11* Ported by: Keyvan Minoukadeh, http://www.keyvan.net 11* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
12* More information: http://fivefilters.org/content-only/ 12* More information: http://fivefilters.org/content-only/
13* License: Apache License, Version 2.0 13* License: Apache License, Version 2.0
14* Requires: PHP5 14* Requires: PHP5
15* Date: 2012-09-19 15* Date: 2012-09-19
16* 16*
17* Differences between the PHP port and the original 17* Differences between the PHP port and the original
18* ------------------------------------------------------ 18* ------------------------------------------------------
19* Arc90's Readability is designed to run in the browser. It works on the DOM 19* Arc90's Readability is designed to run in the browser. It works on the DOM
20* tree (the parsed HTML) after the page's CSS styles have been applied and 20* tree (the parsed HTML) after the page's CSS styles have been applied and
21* Javascript code executed. This PHP port does not run inside a browser. 21* Javascript code executed. This PHP port does not run inside a browser.
22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot
23* rely on CSS or Javascript support. As such, the results will not always 23* rely on CSS or Javascript support. As such, the results will not always
24* match Arc90's Readability. (For example, if a web page contains CSS style 24* match Arc90's Readability. (For example, if a web page contains CSS style
25* rules or Javascript code which hide certain HTML elements from display, 25* rules or Javascript code which hide certain HTML elements from display,
26* Arc90's Readability will dismiss those from consideration but our PHP port, 26* Arc90's Readability will dismiss those from consideration but our PHP port,
27* unable to understand CSS or Javascript, will not know any better.) 27* unable to understand CSS or Javascript, will not know any better.)
28* 28*
29* Another significant difference is that the aim of Arc90's Readability is 29* Another significant difference is that the aim of Arc90's Readability is
30* to re-present the main content block of a given web page so users can 30* to re-present the main content block of a given web page so users can
31* read it more easily in their browsers. Correct identification, clean up, 31* read it more easily in their browsers. Correct identification, clean up,
32* and separation of the content block is only a part of this process. 32* and separation of the content block is only a part of this process.
33* This PHP port is only concerned with this part, it does not include code 33* This PHP port is only concerned with this part, it does not include code
34* that relates to presentation in the browser - Arc90 already do 34* that relates to presentation in the browser - Arc90 already do
35* that extremely well, and for PDF output there's FiveFilters.org's 35* that extremely well, and for PDF output there's FiveFilters.org's
36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/. 36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
37* 37*
38* Finally, this class contains methods that might be useful for developers 38* Finally, this class contains methods that might be useful for developers
39* working on HTML document fragments. So without deviating too much from 39* working on HTML document fragments. So without deviating too much from
40* the original code (which I don't want to do because it makes debugging 40* the original code (which I don't want to do because it makes debugging
41* and updating more difficult), I've tried to make it a little more 41* and updating more difficult), I've tried to make it a little more
42* developer friendly. You should be able to use the methods here on 42* developer friendly. You should be able to use the methods here on
43* existing DOMElement objects without passing an entire HTML document to 43* existing DOMElement objects without passing an entire HTML document to
44* be parsed. 44* be parsed.
45*/ 45*/
46 46
47// This class allows us to do JavaScript like assignements to innerHTML 47// This class allows us to do JavaScript like assignements to innerHTML
48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); 48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
49 49
50// Alternative usage (for testing only!) 50// Alternative usage (for testing only!)
51// uncomment the lines below and call Readability.php in your browser 51// uncomment the lines below and call Readability.php in your browser
52// passing it the URL of the page you'd like content from, e.g.: 52// passing it the URL of the page you'd like content from, e.g.:
53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php 53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
54 54
55/* 55/*
56if (!isset($_GET['url']) || $_GET['url'] == '') { 56if (!isset($_GET['url']) || $_GET['url'] == '') {
57 die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); 57 die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
58} 58}
59$url = $_GET['url']; 59$url = $_GET['url'];
60if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; 60if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
61$html = file_get_contents($url); 61$html = file_get_contents($url);
62$r = new Readability($html, $url); 62$r = new Readability($html, $url);
63$r->init(); 63$r->init();
64echo $r->articleContent->innerHTML; 64echo $r->articleContent->innerHTML;
65*/ 65*/
66 66
67class Readability 67class Readability
68{ 68{
69 public $version = '1.7.1-without-multi-page'; 69 public $version = '1.7.1-without-multi-page';
70 public $convertLinksToFootnotes = false; 70 public $convertLinksToFootnotes = false;
71 public $revertForcedParagraphElements = true; 71 public $revertForcedParagraphElements = true;
72 public $articleTitle; 72 public $articleTitle;
73 public $articleContent; 73 public $articleContent;
74 public $dom; 74 public $dom;
75 public $url = null; // optional - URL where HTML was retrieved 75 public $url = null; // optional - URL where HTML was retrieved
76 public $debug = false; 76 public $debug = false;
77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19 77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19
78 protected $body = null; // 78 protected $body = null; //
79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later 79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. 80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
81 protected $success = false; // indicates whether we were able to extract or not 81 protected $success = false; // indicates whether we were able to extract or not
82 82
83 /** 83 /**
84 * All of the regular expressions in use within readability. 84 * All of the regular expressions in use within readability.
85 * Defined up here so we don't instantiate them repeatedly in loops. 85 * Defined up here so we don't instantiate them repeatedly in loops.
86 **/ 86 **/
87 public $regexps = array( 87 public $regexps = array(
88 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', 88 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
89 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 89 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
90 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', 90 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
91 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', 91 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
92 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', 92 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
93 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', 93 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
94 'replaceFonts' => '/<(\/?)font[^>]*>/i', 94 'replaceFonts' => '/<(\/?)font[^>]*>/i',
95 // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() 95 // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
96 'normalize' => '/\s{2,}/', 96 'normalize' => '/\s{2,}/',
97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', 97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', 98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' 99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
100 ); 100 );
101 101
102 /* constants */ 102 /* constants */
103 const FLAG_STRIP_UNLIKELYS = 1; 103 const FLAG_STRIP_UNLIKELYS = 1;
104 const FLAG_WEIGHT_CLASSES = 2; 104 const FLAG_WEIGHT_CLASSES = 2;
105 const FLAG_CLEAN_CONDITIONALLY = 4; 105 const FLAG_CLEAN_CONDITIONALLY = 4;
106 106
/**
 * Create instance of Readability.
 *
 * The raw markup is cleaned up as a string (runs of <br> become paragraph
 * breaks, <font> tags become <span>), converted to HTML entities and then
 * parsed into $this->dom. JSLikeHTMLElement is registered as the element
 * class of the resulting document.
 *
 * @param string UTF-8 encoded string
 * @param string (optional) URL associated with HTML (used for footnotes)
 * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
 */
function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;

    // String-level clean-up has to happen here, while we still hold raw HTML.
    $markup = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    $markup = preg_replace($this->regexps['replaceFonts'], '<$1span>', $markup);
    $markup = mb_convert_encoding($markup, 'HTML-ENTITIES', "UTF-8");
    if (trim($markup) == '') {
        $markup = '<html></html>';
    }

    // Try html5lib only when explicitly requested.
    $parsed = false;
    if ($parser == 'html5lib') {
        $this->dom = HTML5_Parser::parse($markup);
        $parsed = (bool) $this->dom;
    }

    // Fall back to libxml when html5lib was not requested or returned nothing.
    if (!$parsed) {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($markup);
    }

    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130 130
/**
 * Accessor for the article title element.
 *
 * Returns whatever is currently stored in $this->articleTitle; init()
 * assigns that property.
 *
 * @return DOMElement
 */
public function getTitle()
{
    return $this->articleTitle;
}
138 138
/**
 * Accessor for the article content element.
 *
 * Returns whatever is currently stored in $this->articleContent; init()
 * assigns that property.
 *
 * @return DOMElement
 */
public function getContent()
{
    return $this->articleContent;
}
146 146
/**
 * Runs readability.
 *
 * Workflow:
 *  1. Prep the document by removing script tags, css, etc.
 *  2. Build readability's DOM tree.
 *  3. Grab the article content from the current dom tree.
 *  4. Replace the current DOM tree with the new one.
 *  5. Read peacefully.
 *
 * On completion $this->articleTitle and $this->articleContent hold the
 * extracted elements. When nothing could be extracted, the content element
 * is a placeholder <div> carrying an apology message.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    // Nothing to work with if the parser produced no root element.
    if (!isset($this->dom->documentElement)) {
        return false;
    }
    $this->removeScripts($this->dom);

    // Assume a successful outcome; cleared below if extraction fails.
    $this->success = true;

    $bodyElems = $this->dom->getElementsByTagName('body');
    if ($bodyElems->length > 0) {
        $firstBody = $bodyElems->item(0);
        // Keep the body markup around in case it has to be re-used later.
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    /* Build readability's DOM tree */
    $overlay = $this->dom->createElement('div');
    $overlay->setAttribute('id', 'readOverlay');
    $innerDiv = $this->dom->createElement('div');
    $innerDiv->setAttribute('id', 'readInner');

    $articleTitle = $this->getArticleTitle();
    $articleContent = $this->grabArticle();

    if (!$articleContent) {
        // Extraction failed: report it and substitute a placeholder element.
        $this->success = false;
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('id', 'readability-content');
        $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    /* Glue the structure of our document together. */
    $innerDiv->appendChild($articleTitle);
    $innerDiv->appendChild($articleContent);
    $overlay->appendChild($innerDiv);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($articleContent);

    // Expose the results through the public instance variables.
    $this->articleTitle = $articleTitle;
    $this->articleContent = $articleContent;

    return $this->success;
}
219 219
/**
 * Debug helper: echoes the message, prefixed with "* " and followed by a
 * newline, but only while $this->debug is enabled.
 *
 * @param mixed $msg value to print
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }
    echo '* ', $msg, "\n";
}
226 226
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only converts links to footnotes, and only when
 * $convertLinksToFootnotes is enabled and the source URL does not match
 * wikipedia.org.
 *
 * @param DOMElement
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }
    // Pages whose URL matches wikipedia.org are left without footnotes.
    if (preg_match('/wikipedia\.org/', @$this->url)) {
        return;
    }
    $this->addFootnotes($articleContent);
}
238 238
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's <title> and tries to strip a site
 * name separated by " | ", " - " or ": ". Very long or very short titles
 * are replaced by the page's only <h1>, if there is exactly one. If the
 * result has four words or fewer, the untouched <title> text is used.
 *
 * @return DOMElement
 */
protected function getArticleTitle()
{
    $origTitle = '';
    try {
        $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    } catch (Exception $e) {
        // No usable <title>: carry on with an empty string.
    }
    $curTitle = $origTitle;

    if (preg_match('/ [\|\-] /', $curTitle)) {
        // "Headline | Site" or "Headline - Site": keep the part before the
        // last separator, unless that leaves fewer than three words.
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
        $wordCount = count(explode(' ', $curTitle));
        if ($wordCount < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // "Site: Headline": keep the part after the last colon, falling
        // back to everything after the first colon when that is too short.
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
        $wordCount = count(explode(' ', $curTitle));
        if ($wordCount < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    } else {
        $titleLength = strlen($curTitle);
        if ($titleLength > 150 || $titleLength < 15) {
            // Implausible length: prefer the page's single <h1>, if any.
            $hOnes = $this->dom->getElementsByTagName('h1');
            if ($hOnes->length == 1) {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }
    }

    $curTitle = trim($curTitle);

    // A result of four words or fewer is discarded in favour of the full title.
    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    $articleTitle->innerHTML = $curTitle;

    return $articleTitle;
}
288 288
/**
 * Prepare the HTML document for readability to scrape it.
 *
 * Makes sure $this->body exists (creating and attaching a <body> when none
 * was found), tags it with the id "readabilityBody" and removes every
 * <style> element from the document.
 *
 * @return void
 **/
protected function prepDocument()
{
    // In some cases a body element can't be found (if the HTML is totally
    // hosed for example), so we create one and append it to the document.
    if ($this->body == null) {
        $fallbackBody = $this->dom->createElement('body');
        $this->body = $fallbackBody;
        $this->dom->documentElement->appendChild($fallbackBody);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    // Remove all style tags, walking the node list from its end backwards.
    $styleTags = $this->dom->getElementsByTagName('style');
    $index = $styleTags->length;
    while (--$index >= 0) {
        $tag = $styleTags->item($index);
        $tag->parentNode->removeChild($tag);
    }

    // The original JS also turns double <br>s into <p>s and <font> into
    // <span> at this point. In PHP that is done in the constructor, where
    // the raw HTML string is still available before it is parsed into a DOM.
}
320 320
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * Every link inside $articleContent that is not skipped gets a superscript
 * "[n]" reference inserted directly after it, and a matching <li> entry is
 * added to a "References" list. The list is appended to $articleContent only
 * if at least one footnote was produced. Links are skipped when their class
 * contains "readability-DoNotFootnote" or when their text matches the
 * 'skipFootnoteLink' pattern.
 *
 * @param DOMElement $articleContent element whose links are converted (modified in place)
 * @return void
 **/
public function addFootnotes($articleContent) {
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    // The length is re-read on every pass: the reference links inserted below
    // are <a> elements inside $articleContent too. They carry the
    // DoNotFootnote class, so they are skipped when the loop reaches them.
    for ($i = 0; $i < $articleLinks->length; $i++)
    {
        $articleLink = $articleLinks->item($i);
        // Clone before the original link is modified further down, so the
        // footnote copy does not inherit the added style/name attributes.
        $footnoteLink = $articleLink->cloneNode(true);
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
        $linkText = $this->getInnerText($articleLink);

        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $linkCount++;

        // Helper nodes are only created for links that really get a footnote.
        $refLink = $this->dom->createElement('a');
        $footnote = $this->dom->createElement('li');

        /** Add a superscript reference after the article link */
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so a single
        // call covers both "link is the last child" and "link has a next
        // sibling" without comparing DOM objects.
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            // The host comes from page-supplied URLs and is spliced into
            // markup, so escape it; ordinary host names are unaffected.
            $footnote->innerHTML = $footnote->innerHTML . '<small> (' . htmlspecialchars($linkDomain, ENT_QUOTES, 'UTF-8') . ')</small>';
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387 387
/**
 * Turn every <p class="readability-styled"> found below the given element
 * back into a plain text node carrying the paragraph's text content.
 *
 * @param DOMElement $articleContent element whose descendants are searched
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styledParagraphs = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Work from the last match back to the first.
    for ($remaining = $styledParagraphs->length; $remaining > 0; $remaining--) {
        $paragraph = $styledParagraphs->item($remaining - 1);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407 407
/**
 * Prepare the article node for display.
 *
 * Runs the style and line-break clean-up helpers, optionally reverts the
 * 'readability-styled' paragraphs, removes or conditionally removes a set of
 * element types (form, object, h1, a lone h2, iframe, table, ul, div),
 * drops paragraphs that have neither text nor media, and finally strips
 * <br> tags that sit directly in front of a <p>.
 *
 * @param DOMElement $articleContent the extracted article container (modified in place)
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: those with no img/embed/object/iframe and no inner text */
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
    {
        // Fetch the paragraph once instead of calling item($i) for every check.
        $paragraph = $articleParagraphs->item($i);

        $imgCount    = $paragraph->getElementsByTagName('img')->length;
        $embedCount  = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;
        $iframeCount = $paragraph->getElementsByTagName('iframe')->length;

        if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($paragraph, false) == '')
        {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        $withoutLeadingBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        // preg_replace() returns null when PCRE fails (e.g. a backtrack limit
        // is hit). Assigning that result would replace the cleaned article
        // with null, so the content is only written back on success.
        if ($withoutLeadingBreaks !== null) {
            $articleContent->innerHTML = $withoutLeadingBreaks;
        } else {
            $this->dbg('Cleaning innerHTML of breaks failed: preg_replace() returned null. Leaving content unchanged.');
        }
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466 466
467 /** 467 /**
468 * Initialize a node with the readability object. Also checks the 468 * Initialize a node with the readability object. Also checks the
469 * className/id for special names to add to its score. 469 * className/id for special names to add to its score.
470 * 470 *
471 * @param Element 471 * @param Element
472 * @return void 472 * @return void
473 **/ 473 **/
474 protected function initializeNode($node) { 474 protected function initializeNode($node) {
475 $readability = $this->dom->createAttribute('readability'); 475 $readability = $this->dom->createAttribute('readability');
476 $readability->value = 0; // this is our contentScore 476 $readability->value = 0; // this is our contentScore
477 $node->setAttributeNode($readability); 477 $node->setAttributeNode($readability);
478 478
479 switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case 479 switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case
480 case 'DIV': 480 case 'DIV':
481 $readability->value += 5; 481 $readability->value += 5;
482 break; 482 break;
483 483
484 case 'PRE': 484 case 'PRE':
485 case 'TD': 485 case 'TD':
486 case 'BLOCKQUOTE': 486 case 'BLOCKQUOTE':
487 $readability->value += 3; 487 $readability->value += 3;
488 break; 488 break;
489 489
490 case 'ADDRESS': 490 case 'ADDRESS':
491 case 'OL': 491 case 'OL':
492 case 'UL': 492 case 'UL':
493 case 'DL': 493 case 'DL':
494 case 'DD': 494 case 'DD':
495 case 'DT': 495 case 'DT':
496 case 'LI': 496 case 'LI':
497 case 'FORM': 497 case 'FORM':
498 $readability->value -= 3; 498 $readability->value -= 3;
499 break; 499 break;
500 500
501 case 'H1': 501 case 'H1':
502 case 'H2': 502 case 'H2':
503 case 'H3': 503 case 'H3':
504 case 'H4': 504 case 'H4':
505 case 'H5': 505 case 'H5':
506 case 'H6': 506 case 'H6':
507 case 'TH': 507 case 'TH':
508 $readability->value -= 5; 508 $readability->value -= 5;
509 break; 509 break;
510 } 510 }
511 $readability->value += $this->getClassWeight($node); 511 $readability->value += $this->getClassWeight($node);
512 } 512 }
513 513
/***
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * Steps, in order:
 *  1. Walk all elements: optionally remove "unlikely" nodes, queue P/TD/PRE nodes for scoring,
 *     turn DIVs without block-level children into P, and wrap loose text inside other DIVs in
 *     <p class="readability-styled">.
 *  2. Score every queued node and add the score to its parent (full) and grandparent (half).
 *  3. Pick the candidate with the highest link-density-scaled score, falling back to a DIV that
 *     wraps the whole page.
 *  4. Collect the top candidate and related siblings into a new div#readability-content.
 *  5. Clean the result with prepArticle(); if fewer than 250 characters remain, restore the body
 *     from the cache and retry with one more flag switched off.
 *
 * @param DOMNode|null $page node to search; defaults to the whole document ($this->dom)
 * @return DOMElement|false the article container, or false when every retry came up short
 **/
protected function grabArticle($page=null) {
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) $page = $this->dom;
    $allElements = $page->getElementsByTagName('*');
    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
     * TODO: Shouldn't this be a reverse traversal?
     **/
    $node = null;
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);
        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            )
            {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                // Step back so the element that moved into this position is not skipped.
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // Step back so this position is looked at again; the new <p>
                    // is then queued by the P/TD/PRE check above. The replaced
                    // <div> is deliberately NOT queued: replaceChild() detached
                    // it, and the scoring loop skips nodes without a parent.
                    $nodeIndex--;
                }
                catch(Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            }
            else
            {
                /* EXPERIMENTAL */
                // Wrap each direct text child in <p class="readability-styled"> so it can be scored.
                // revertReadabilityStyledElements() can turn these back into text nodes afterwards.
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType == 3) { // XML_TEXT_NODE
                        $p = $this->dom->createElement('p');
                        // nodeValue is already entity-decoded text. Add it as a text node
                        // rather than through innerHTML, so characters such as '<' or '&'
                        // are kept as text and never re-parsed as markup.
                        $p->appendChild($this->dom->createTextNode($childNode->nodeValue));
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();
    foreach ($nodesToScore as $nodeToScore) {
        $parentNode = $nodeToScore->parentNode;
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodeToScore);

        // Skip nodes that are detached or whose parent is not an element.
        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if(strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability'))
        {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
        {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        $contentScore = 0;

        /* Add a point for the paragraph itself as a base. */
        $contentScore++;

        /* Add points for the comma-separated pieces of this paragraph (number of commas + 1) */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    for ($c=0, $cl=count($candidates); $c < $cl; $c++)
    {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidates[$c]->getAttributeNode('readability');
        $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));

        $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);

        if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidates[$c];
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
    {
        $topCandidate = $this->dom->createElement('div');
        if ($page instanceof DOMDocument) {
            if (!isset($page->documentElement)) {
                // we don't have a body either? what a mess! :)
            } else {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);

    // The fallback div above stays unattached when the document has no root
    // element, so the parent may be missing; in that case there are no siblings.
    $topCandidateParent = $topCandidate->parentNode;
    $siblingNodes = $topCandidateParent ? $topCandidateParent->childNodes : null;
    $siblingCount = $siblingNodes ? $siblingNodes->length : 0;

    for ($s=0, $sl=$siblingCount; $s < $sl; $s++)
    {
        $siblingNode = $siblingNodes->item($s);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode === $topCandidate)
        {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
        }

        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
        {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength  = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25)
            {
                $append = true;
            }
            else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
            {
                $append = true;
            }
        }

        if ($append)
        {
            $this->dbg('Appending node: ' . $siblingNode->nodeName);

            $nodeToAppend = null;
            $sibNodeName = strtoupper($siblingNode->nodeName);
            if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                $nodeToAppend = $this->dom->createElement('div');
                try {
                    $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                    $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                }
                catch(Exception $e)
                {
                    $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                    $nodeToAppend = $siblingNode;
                    // The sibling itself will be moved out of the list, so compensate.
                    $s--;
                    $sl--;
                }
            } else {
                $nodeToAppend = $siblingNode;
                // The sibling itself will be moved out of the list, so compensate.
                $s--;
                $sl--;
            }

            /* To ensure a node does not interfere with readability styles, remove its classnames */
            $nodeToAppend->removeAttribute('class');

            /* Append sibling and subtract from our list because it removes the node when you append to another node */
            $articleContent->appendChild($nodeToAppend);
        }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250)
    {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
        $this->body->innerHTML = $this->bodyCache;

        // Switch off one flag per retry, in this fixed order, and start over on the restored body.
        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            return $this->grabArticle($this->body);
        }
        else {
            return false;
        }
    }
    return $articleContent;
}
818 818
/**
 * Detach every <script> element found beneath the given node.
 *
 * @param DOMElement $doc node to search (anything exposing getElementsByTagName())
 * @return void
 */
public function removeScripts($doc) {
    // getElementsByTagName() hands back a live list, so take a snapshot
    // before detaching anything and walk it from the last match to the first.
    $scriptNodes = iterator_to_array($doc->getElementsByTagName('script'), false);
    foreach (array_reverse($scriptNodes) as $scriptNode) {
        $scriptNode->parentNode->removeChild($scriptNode);
    }
}
832 832
/**
 * Return the trimmed text content of a node.
 *
 * When $normalizeSpaces is true, every match of the 'normalize' pattern
 * (held in $this->regexps) is additionally replaced by a single space.
 * A node without a textContent property, or with empty text, yields ''.
 *
 * @param DOMElement $e
 * @param boolean $normalizeSpaces (default: true)
 * @return string
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
856 856
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * The text is obtained through getInnerText() with its default
 * whitespace normalisation.
 *
 * @param DOMElement $e
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $innerText = $this->getInnerText($e);
    return substr_count($innerText, $s);
}
867 867
/**
 * Remove the style attribute from $e itself and from every element under it.
 *
 * Non-object input is ignored. getElementsByTagName('*') only yields
 * descendants, so the root is handled separately; the root is only touched
 * when it is a DOMElement (a DOMDocument, for instance, has no attributes).
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    if (!is_object($e)) return;

    // The descendant walk below never visits $e, so strip the root here.
    if ($e instanceof DOMElement) {
        $e->removeAttribute('style');
    }

    foreach ($e->getElementsByTagName('*') as $elem) {
        $elem->removeAttribute('style');
    }
}
881 881
/**
 * Compute the share of a node's text that sits inside <a> elements.
 *
 * The result is the summed inner-text length of all descendant links
 * divided by the inner-text length of the node. A node with no text
 * yields 0 instead of dividing by zero.
 *
 * @param DOMElement $e
 * @return number (float)
 */
public function getLinkDensity($e) {
    $totalLength = strlen($this->getInnerText($e));

    $anchorLength = 0;
    foreach ($e->getElementsByTagName('a') as $anchor) {
        $anchorLength += strlen($this->getInnerText($anchor));
    }

    if ($totalLength <= 0) {
        return 0;
    }
    return $anchorLength / $totalLength;
}
903 903
/**
 * Score an element by its class and id attributes.
 *
 * Each of the two attributes, when present and non-empty, loses 25 points
 * if it matches the 'negative' pattern and gains 25 points if it matches
 * the 'positive' pattern (both checks are applied independently).
 * Returns 0 outright while FLAG_WEIGHT_CLASSES is not active.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $score = 0;

    /* Same scoring rule for the class name first, then the ID */
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }
        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }
        if (preg_match($this->regexps['negative'], $attrValue)) {
            $score -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $score += 25;
        }
    }

    return $score;
}
941 941
/**
 * Replace every match of the 'killBreaks' pattern in a node's markup
 * with a single '<br />'.
 *
 * Reads and writes the node's innerHTML property.
 * NOTE(review): innerHTML is not part of PHP's stock DOMElement; presumably
 * a project DOM subclass supplies it - confirm.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
953 953
/**
 * Remove all descendants of $e that have the tag name $tag.
 *
 * For iframe, object and embed tags an element is kept when either its
 * attribute values or its inner markup match the 'video' pattern
 * (per the original notes: youtube/vimeo embeds are preserved).
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $candidates = $e->getElementsByTagName($tag);
    $mayBeVideo = in_array($tag, array('iframe', 'object', 'embed'));

    // Walk the live list backwards so removals do not shift unvisited entries.
    for ($idx = $candidates->length - 1; $idx >= 0; $idx--) {
        $node = $candidates->item($idx);

        if ($mayBeVideo) {
            /* Join all attribute values so a single match covers them */
            $joinedValues = '';
            foreach ($node->attributes as $attr) {
                $joinedValues .= $attr->value . '|';
            }

            /* Keep the element if its attributes, or failing that its contents, look like a video */
            if (preg_match($this->regexps['video'], $joinedValues)
                || preg_match($this->regexps['video'], $node->innerHTML)) {
                continue;
            }
        }

        $node->parentNode->removeChild($node);
    }
}
989 989
/**
 * Remove descendants of $e with tag name $tag when they look "fishy".
 *
 * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. For each match:
 *  - it is removed at once when class weight plus its 'readability'
 *    attribute score is negative;
 *  - otherwise, if its text has fewer than 10 commas, a chain of heuristics
 *    (image/paragraph ratio, list items, inputs, content length, link
 *    density, video embeds) decides; the first rule that fires removes it.
 * $this->lightClean selects the more lenient rule set.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /**
     * Traverse backwards so nodes can be removed without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($i = $tagsList->length - 1; $i >= 0; $i--) {
        $node = $tagsList->item($i);
        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        // Comma-rich elements are left untouched.
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        /**
         * Not very many commas: gather counts for typical embedded elements
         * and look for ominous signs.
         **/
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        // Offset of -100: the <li> rule only fires when the count exceeds $p by more than 100.
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;
        $a = $node->getElementsByTagName('a')->length;

        // Only <embed>/<iframe> elements whose src matches the video pattern are counted.
        $embedCount = 0;
        foreach (array('embed', 'iframe') as $embedTag) {
            foreach ($node->getElementsByTagName($embedTag) as $embed) {
                if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                    $embedCount++;
                }
            }
        }

        $linkDensity = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        // Message of the first rule that fires; null means "keep the element".
        $reason = null;

        if ($this->lightClean) {
            $this->dbg('Light clean...');
            if (($img > $p) && ($img > 4)) {
                $reason = ' more than 4 images and more image elements than paragraph elements';
            } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                $reason = ' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
            } else if ($embedCount > 3) {
                $reason = ' more than 3 embeds';
            }
        } else {
            $this->dbg('Standard clean...');
            if ($img > $p) {
                $reason = ' more image elements than paragraph elements';
            } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 25 && ($img === 0 || $img > 2)) {
                $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($weight >= 25 && $linkDensity > 0.5) {
                $reason = ' weight above 25 but link density greater than 0.5';
            } else if (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
            }
        }

        if ($reason !== null) {
            $this->dbg($reason);
            $node->parentNode->removeChild($node);
        }
    }
}
1108 1108
/**
 * Drop spurious <h1> and <h2> headers from an element.
 *
 * A header is removed when its class weight is negative or when its
 * link density is above 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headingTag) {
        $headings = $e->getElementsByTagName($headingTag);
        // Backwards, so removing from the live list is safe.
        for ($pos = $headings->length - 1; $pos >= 0; $pos--) {
            $heading = $headings->item($pos);
            $isSpurious = $this->getClassWeight($heading) < 0
                || $this->getLinkDensity($heading) > 0.33;
            if ($isSpurious) {
                $heading->parentNode->removeChild($heading);
            }
        }
    }
}
1125 1125
/**
 * Tell whether the given flag bit(s) are set in $this->flags.
 *
 * @param int $flag bit mask to test
 * @return boolean true when the masked value is greater than zero
 */
public function flagIsActive($flag) {
    $masked = $this->flags & $flag;
    return $masked > 0;
}
1129 1129
/**
 * Switch the given flag bit(s) on in $this->flags.
 *
 * @param int $flag bit mask to set
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1133 1133
/**
 * Switch the given flag bit(s) off in $this->flags.
 *
 * @param int $flag bit mask to clear
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1137} 1137}
1138?> \ No newline at end of file 1138?> \ No newline at end of file
diff --git a/inc/3rdparty/makefulltextfeed.php b/inc/3rdparty/makefulltextfeed.php
index 4faad6d9..7a56be8c 100755
--- a/inc/3rdparty/makefulltextfeed.php
+++ b/inc/3rdparty/makefulltextfeed.php
@@ -3,8 +3,8 @@
3// Author: Keyvan Minoukadeh 3// Author: Keyvan Minoukadeh
4// Copyright (c) 2013 Keyvan Minoukadeh 4// Copyright (c) 2013 Keyvan Minoukadeh
5// License: AGPLv3 5// License: AGPLv3
6// Version: 3.1 6// Version: 3.2
7// Date: 2013-03-05 7// Date: 2013-05-13
8// More info: http://fivefilters.org/content-only/ 8// More info: http://fivefilters.org/content-only/
9// Help: http://help.fivefilters.org 9// Help: http://help.fivefilters.org
10 10
@@ -25,12 +25,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
25 25
26// Usage 26// Usage
27// ----- 27// -----
28// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org 28// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
29// The following options can be passed in the querystring: 29// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
30// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))
31// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)
32// * API key: key=[api key] (optional, refer to config.php)
33// * Max entries to process: max=[max number of items] (optional)
34 30
35error_reporting(E_ALL ^ E_NOTICE); 31error_reporting(E_ALL ^ E_NOTICE);
36ini_set("display_errors", 1); 32ini_set("display_errors", 1);
@@ -76,8 +72,8 @@ header('X-Robots-Tag: noindex, nofollow');
76//////////////////////////////// 72////////////////////////////////
77// Check if service is enabled 73// Check if service is enabled
78//////////////////////////////// 74////////////////////////////////
79if (!$options->enabled) { 75if (!$options->enabled) {
80 die('The full-text RSS service is currently disabled'); 76 die('The full-text RSS service is currently disabled');
81} 77}
82 78
83//////////////////////////////// 79////////////////////////////////
@@ -121,8 +117,8 @@ $options->smart_cache = $options->smart_cache && function_exists('apc_inc');
121//////////////////////////////// 117////////////////////////////////
122// Check for feed URL 118// Check for feed URL
123//////////////////////////////// 119////////////////////////////////
124if (!isset($_GET['url'])) { 120if (!isset($_GET['url'])) {
125 die('No URL supplied'); 121 die('No URL supplied');
126} 122}
127$url = trim($_GET['url']); 123$url = trim($_GET['url']);
128if (strtolower(substr($url, 0, 7)) == 'feed://') { 124if (strtolower(substr($url, 0, 7)) == 'feed://') {
@@ -161,10 +157,12 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
161 if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']); 157 if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
162 if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']); 158 if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
163 if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); 159 if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
164 if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']); 160 if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);
165 if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); 161 if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
166 if (isset($_GET['xss'])) $redirect .= '&xss'; 162 if (isset($_GET['xss'])) $redirect .= '&xss';
167 if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; 163 if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
164 if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
165 if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
168 if (isset($_GET['debug'])) $redirect .= '&debug'; 166 if (isset($_GET['debug'])) $redirect .= '&debug';
169 if ($debug_mode) { 167 if ($debug_mode) {
170 debug('Redirecting to hide access key, follow URL below to continue'); 168 debug('Redirecting to hide access key, follow URL below to continue');
@@ -177,7 +175,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
177 175
178/////////////////////////////////////////////// 176///////////////////////////////////////////////
179// Set timezone. 177// Set timezone.
180// Prevents warnings, but needs more testing - 178// Prevents warnings, but needs more testing -
181// perhaps if timezone is set in php.ini we 179// perhaps if timezone is set in php.ini we
182// don't need to set it at all... 180// don't need to set it at all...
183/////////////////////////////////////////////// 181///////////////////////////////////////////////
@@ -199,7 +197,7 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int
199} 197}
200$key_index = ($valid_key) ? (int)$_GET['key'] : 0; 198$key_index = ($valid_key) ? (int)$_GET['key'] : 0;
201if (!$valid_key && $options->key_required) { 199if (!$valid_key && $options->key_required) {
202 die('A valid key must be supplied'); 200 die('A valid key must be supplied');
203} 201}
204if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') { 202if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
205 die('The entered key is invalid'); 203 die('The entered key is invalid');
@@ -251,6 +249,28 @@ if ($options->favour_feed_titles == 'user') {
251} 249}
252 250
253/////////////////////////////////////////////// 251///////////////////////////////////////////////
252// Include full content in output?
253///////////////////////////////////////////////
if ($options->content === 'user') {
	// Full content stays on unless the request explicitly passes content=0.
	$contentSwitchedOff = isset($_GET['content']) && $_GET['content'] === '0';
	$options->content = !$contentSwitchedOff;
}
261
262///////////////////////////////////////////////
263// Include summaries in output?
264///////////////////////////////////////////////
if ($options->summary === 'user') {
	// Summaries stay off unless the request explicitly passes summary=1.
	$options->summary = isset($_GET['summary']) && $_GET['summary'] === '1';
}
272
273///////////////////////////////////////////////
254// Exclude items if extraction fails 274// Exclude items if extraction fails
255/////////////////////////////////////////////// 275///////////////////////////////////////////////
256if ($options->exclude_items_on_fail === 'user') { 276if ($options->exclude_items_on_fail === 'user') {
@@ -272,15 +292,6 @@ if ($options->detect_language === 'user') {
272 $detect_language = $options->detect_language; 292 $detect_language = $options->detect_language;
273} 293}
274 294
275if ($detect_language >= 2) {
276 $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',
277 'cebuano' => 'ceb', // ISO 639-2
278 'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',
279 'hawaiian' => 'haw', // ISO 639-2
280 'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',
281 'pidgin' => 'cpe', // ISO 639-2
282 'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');
283}
284$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); 295$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
285 296
286///////////////////////////////////// 297/////////////////////////////////////
@@ -330,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *');
330////////////////////////////////// 341//////////////////////////////////
331if ($options->caching) { 342if ($options->caching) {
332 debug('Caching is enabled...'); 343 debug('Caching is enabled...');
333 $cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); 344 $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
334 $check_cache = true; 345 $check_cache = true;
335 if ($options->apc && $options->smart_cache) { 346 if ($options->apc && $options->smart_cache) {
336 apc_add("cache.$cache_id", 0, 10*60); 347 apc_add("cache.$cache_id", 0, 10*60);
@@ -468,7 +479,7 @@ if ($img_url = $feed->get_image_url()) {
468//////////////////////////////////////////// 479////////////////////////////////////////////
469// Loop through feed items 480// Loop through feed items
470//////////////////////////////////////////// 481////////////////////////////////////////////
471$items = $feed->get_items(0, $max); 482$items = $feed->get_items(0, $max);
472// Request all feed items in parallel (if supported) 483// Request all feed items in parallel (if supported)
473$urls_sanitized = array(); 484$urls_sanitized = array();
474$urls = array(); 485$urls = array();
@@ -550,24 +561,43 @@ foreach ($items as $key => $item) {
550 $is_single_page = false; 561 $is_single_page = false;
551 if ($single_page_response = getSinglePage($item, $html, $effective_url)) { 562 if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
552 $is_single_page = true; 563 $is_single_page = true;
553 $html = $single_page_response['body'];
554 // remove strange things
555 $html = str_replace('</[>', '', $html);
556 $html = convert_to_utf8($html, $single_page_response['headers']);
557 $effective_url = $single_page_response['effective_url']; 564 $effective_url = $single_page_response['effective_url'];
558 debug("Retrieved single-page view from $effective_url"); 565 // check if action defined for returned Content-Type
566 $mime_info = get_mime_action_info($single_page_response['headers']);
567 if (isset($mime_info['action'])) {
568 if ($mime_info['action'] == 'exclude') {
569 continue; // skip this feed item entry
570 } elseif ($mime_info['action'] == 'link') {
571 if ($mime_info['type'] == 'image') {
572 $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
573 } else {
574 $html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
575 }
576 $extracted_title = $mime_info['name'];
577 $do_content_extraction = false;
578 }
579 }
580 if ($do_content_extraction) {
581 $html = $single_page_response['body'];
582 // remove strange things
583 $html = str_replace('</[>', '', $html);
584 $html = convert_to_utf8($html, $single_page_response['headers']);
585 debug("Retrieved single-page view from $effective_url");
586 }
559 unset($single_page_response); 587 unset($single_page_response);
560 } 588 }
589 }
590 if ($do_content_extraction) {
561 debug('--------'); 591 debug('--------');
562 debug('Attempting to extract content'); 592 debug('Attempting to extract content');
563 $extract_result = $extractor->process($html, $effective_url); 593 $extract_result = $extractor->process($html, $effective_url);
564 $readability = $extractor->readability; 594 $readability = $extractor->readability;
565 $content_block = ($extract_result) ? $extractor->getContent() : null; 595 $content_block = ($extract_result) ? $extractor->getContent() : null;
566 $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; 596 $extracted_title = ($extract_result) ? $extractor->getTitle() : '';
567 // Deal with multi-page articles 597 // Deal with multi-page articles
568 //die('Next: '.$extractor->getNextPageUrl()); 598 //die('Next: '.$extractor->getNextPageUrl());
569 $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); 599 $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());
570 if ($options->multipage && $is_multi_page) { 600 if ($options->multipage && $is_multi_page && $options->content) {
571 debug('--------'); 601 debug('--------');
572 debug('Attempting to process multi-page article'); 602 debug('Attempting to process multi-page article');
573 $multi_page_urls = array(); 603 $multi_page_urls = array();
@@ -580,7 +610,7 @@ foreach ($items as $key => $item) {
580 // check it's not what we have already! 610 // check it's not what we have already!
581 if (!in_array($next_page_url, $multi_page_urls)) { 611 if (!in_array($next_page_url, $multi_page_urls)) {
582 // it's not, so let's attempt to fetch it 612 // it's not, so let's attempt to fetch it
583 $multi_page_urls[] = $next_page_url; 613 $multi_page_urls[] = $next_page_url;
584 $_prev_ref = $http->referer; 614 $_prev_ref = $http->referer;
585 if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) { 615 if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) {
586 // make sure mime type is not something with a different action associated 616 // make sure mime type is not something with a different action associated
@@ -605,13 +635,15 @@ foreach ($items as $key => $item) {
605 // did we successfully deal with this multi-page article? 635 // did we successfully deal with this multi-page article?
606 if (empty($multi_page_content)) { 636 if (empty($multi_page_content)) {
607 debug('Failed to extract all parts of multi-page article, so not going to include them'); 637 debug('Failed to extract all parts of multi-page article, so not going to include them');
608 $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>'; 638 $_page = $readability->dom->createElement('p');
639 $_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
640 $multi_page_content[] = $_page;
609 } 641 }
610 foreach ($multi_page_content as $_page) { 642 foreach ($multi_page_content as $_page) {
611 $_page = $content_block->ownerDocument->importNode($_page, true); 643 $_page = $content_block->ownerDocument->importNode($_page, true);
612 $content_block->appendChild($_page); 644 $content_block->appendChild($_page);
613 } 645 }
614 unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url); 646 unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);
615 } 647 }
616 } 648 }
617 // use extracted title for both feed and item title if we're using single-item dummy feed 649 // use extracted title for both feed and item title if we're using single-item dummy feed
@@ -658,7 +690,7 @@ foreach ($items as $key => $item) {
658 } else { 690 } else {
659 $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML 691 $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
660 } 692 }
661 unset($content_block); 693 //unset($content_block);
662 // post-processing cleanup 694 // post-processing cleanup
663 $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html); 695 $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
664 if ($links == 'remove') { 696 if ($links == 'remove') {
@@ -671,130 +703,155 @@ foreach ($items as $key => $item) {
671 } 703 }
672 } 704 }
673 705
674 if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment 706 if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
675 $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); 707 $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
708 } else {
709 $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
710 }
711 // filter xss?
712 if ($xss_filter) {
713 debug('Filtering HTML to remove XSS');
714 $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
715 }
716
717 // add content
718 if ($options->summary === true) {
719 // get summary
720 $summary = '';
721 if (!$do_content_extraction) {
722 $summary = $html;
676 } else { 723 } else {
677 $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); 724 // Try to get first few paragraphs
678 } 725 if (isset($content_block) && ($content_block instanceof DOMElement)) {
679 // filter xss? 726 $_paras = $content_block->getElementsByTagName('p');
680 if ($xss_filter) { 727 foreach ($_paras as $_para) {
681 debug('Filtering HTML to remove XSS'); 728 $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';
682 $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); 729 if (strlen($summary) > 200) break;
683 }
684 $newitem->setDescription($html);
685
686 // set date
687 if ((int)$item->get_date('U') > 0) {
688 $newitem->setDate((int)$item->get_date('U'));
689 } elseif ($extractor->getDate()) {
690 $newitem->setDate($extractor->getDate());
691 }
692
693 // add authors
694 if ($authors = $item->get_authors()) {
695 foreach ($authors as $author) {
696 // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
697 if ($author->get_name() !== null) {
698 $newitem->addElement('dc:creator', $author->get_name());
699 } elseif ($author->get_email() !== null) {
700 $newitem->addElement('dc:creator', $author->get_email());
701 } 730 }
731 } else {
732 $summary = $html;
702 } 733 }
703 } elseif ($authors = $extractor->getAuthors()) { 734 }
704 //TODO: make sure the list size is reasonable 735 unset($_paras, $_para);
705 foreach ($authors as $author) { 736 $summary = get_excerpt($summary);
706 // TODO: xpath often selects authors from other articles linked from the page. 737 $newitem->setDescription($summary);
707 // for now choose first item 738 if ($options->content) $newitem->setElement('content:encoded', $html);
708 $newitem->addElement('dc:creator', $author); 739 } else {
709 break; 740 if ($options->content) $newitem->setDescription($html);
741 }
742
743 // set date
744 if ((int)$item->get_date('U') > 0) {
745 $newitem->setDate((int)$item->get_date('U'));
746 } elseif ($extractor->getDate()) {
747 $newitem->setDate($extractor->getDate());
748 }
749
750 // add authors
751 if ($authors = $item->get_authors()) {
752 foreach ($authors as $author) {
753 // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
754 if ($author->get_name() !== null) {
755 $newitem->addElement('dc:creator', $author->get_name());
756 } elseif ($author->get_email() !== null) {
757 $newitem->addElement('dc:creator', $author->get_email());
710 } 758 }
711 } 759 }
712 760 } elseif ($authors = $extractor->getAuthors()) {
713 // add language 761 //TODO: make sure the list size is reasonable
714 if ($detect_language) { 762 foreach ($authors as $author) {
715 $language = $extractor->getLanguage(); 763 // TODO: xpath often selects authors from other articles linked from the page.
716 if (!$language) $language = $feed->get_language(); 764 // for now choose first item
717 if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { 765 $newitem->addElement('dc:creator', $author);
718 try { 766 break;
719 if ($use_cld) { 767 }
720 // Use PHP-CLD extension 768 }
721 $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error 769
722 $res = $php_cld($text_sample); 770 // add language
723 if (is_array($res) && count($res) > 0) { 771 if ($detect_language) {
724 $language = $res[0]['code']; 772 $language = $extractor->getLanguage();
725 } 773 if (!$language) $language = $feed->get_language();
726 } else { 774 if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
727 //die('what'); 775 try {
728 // Use PEAR's Text_LanguageDetect 776 if ($use_cld) {
729 if (!isset($l)) { 777 // Use PHP-CLD extension
730 $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat'); 778 $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
731 } 779 $res = $php_cld($text_sample);
732 $l_result = $l->detect($text_sample, 1); 780 if (is_array($res) && count($res) > 0) {
733 if (count($l_result) > 0) { 781 $language = $res[0]['code'];
734 $language = $language_codes[key($l_result)]; 782 }
735 } 783 } else {
784 //die('what');
785 // Use PEAR's Text_LanguageDetect
786 if (!isset($l)) {
787 $l = new Text_LanguageDetect();
788 $l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")
789 }
790 $l_result = $l->detect($text_sample, 1);
791 if (count($l_result) > 0) {
792 $language = key($l_result);
736 } 793 }
737 } catch (Exception $e) {
738 //die('error: '.$e);
739 // do nothing
740 } 794 }
741 } 795 } catch (Exception $e) {
742 if ($language && (strlen($language) < 7)) { 796 //die('error: '.$e);
743 $newitem->addElement('dc:language', $language); 797 // do nothing
744 } 798 }
745 } 799 }
746 800 if ($language && (strlen($language) < 7)) {
747 // add MIME type (if it appeared in our exclusions lists) 801 $newitem->addElement('dc:language', $language);
748 if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
749 // add effective URL (URL after redirects)
750 if (isset($effective_url)) {
751 //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
752 //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi)
753 //temporary measure: use utf8_encode()
754 $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
755 } else {
756 $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
757 } 802 }
758 803 }
759 // add categories 804
760 if ($categories = $item->get_categories()) { 805 // add MIME type (if it appeared in our exclusions lists)
761 foreach ($categories as $category) { 806 if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
762 if ($category->get_label() !== null) { 807 // add effective URL (URL after redirects)
763 $newitem->addElement('category', $category->get_label()); 808 if (isset($effective_url)) {
764 } 809 //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
810 //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi)
811 //temporary measure: use utf8_encode()
812 $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
813 } else {
814 $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
815 }
816
817 // add categories
818 if ($categories = $item->get_categories()) {
819 foreach ($categories as $category) {
820 if ($category->get_label() !== null) {
821 $newitem->addElement('category', $category->get_label());
765 } 822 }
766 } 823 }
767 824 }
768 // check for enclosures 825
769 if ($options->keep_enclosures) { 826 // check for enclosures
770 if ($enclosures = $item->get_enclosures()) { 827 if ($options->keep_enclosures) {
771 foreach ($enclosures as $enclosure) { 828 if ($enclosures = $item->get_enclosures()) {
772 // thumbnails 829 foreach ($enclosures as $enclosure) {
773 foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { 830 // thumbnails
774 $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); 831 foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
775 } 832 $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
776 if (!$enclosure->get_link()) continue;
777 $enc = array();
778 // Media RSS spec ($enc): http://search.yahoo.com/mrss
779 // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
780 $enc['url'] = $enclosure->get_link();
781 if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
782 if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
783 if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
784 if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
785 if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
786 if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
787 if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
788 if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
789 if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
790 if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
791 if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
792 if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
793 $newitem->addElement('media:content', '', $enc);
794 } 833 }
834 if (!$enclosure->get_link()) continue;
835 $enc = array();
836 // Media RSS spec ($enc): http://search.yahoo.com/mrss
837 // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
838 $enc['url'] = $enclosure->get_link();
839 if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
840 if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
841 if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
842 if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
843 if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
844 if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
845 if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
846 if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
847 if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
848 if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
849 if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
850 if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
851 $newitem->addElement('media:content', '', $enc);
795 } 852 }
796 } 853 }
797 /* } */ 854 }
798 $output->addItem($newitem); 855 $output->addItem($newitem);
799 unset($html); 856 unset($html);
800 $item_count++; 857 $item_count++;
diff --git a/inc/3rdparty/makefulltextfeedHelpers.php b/inc/3rdparty/makefulltextfeedHelpers.php
index 1c11b8f6..4e985372 100755
--- a/inc/3rdparty/makefulltextfeedHelpers.php
+++ b/inc/3rdparty/makefulltextfeedHelpers.php
@@ -66,6 +66,38 @@ class DummySingleItem {
66// HELPER FUNCTIONS 66// HELPER FUNCTIONS
67/////////////////////////////// 67///////////////////////////////
68 68
69// Adapted from WordPress
70// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
71function get_excerpt($text, $num_words=55, $more=null) {
72 if (null === $more) $more = '&hellip;';
73 $text = strip_tags($text);
74 //TODO: Check if word count is based on single characters (East Asian characters)
75 /*
76 if (1==2) {
77 $text = trim(preg_replace("/[\n\r\t ]+/", ' ', $text), ' ');
78 preg_match_all('/./u', $text, $words_array);
79 $words_array = array_slice($words_array[0], 0, $num_words + 1);
80 $sep = '';
81 } else {
82 $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
83 $sep = ' ';
84 }
85 */
86 $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
87 $sep = ' ';
88 if (count($words_array) > $num_words) {
89 array_pop($words_array);
90 $text = implode($sep, $words_array);
91 $text = $text.$more;
92 } else {
93 $text = implode($sep, $words_array);
94 }
95 // trim whitespace at beginning or end of string
96 // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
97 $text = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
98 return $text;
99}
100
69function url_allowed($url) { 101function url_allowed($url) {
70 global $options; 102 global $options;
71 if (!empty($options->allowed_urls)) { 103 if (!empty($options->allowed_urls)) {
@@ -165,14 +197,6 @@ function convert_to_utf8($html, $header=null)
165 if (strtolower($encoding) != 'utf-8') { 197 if (strtolower($encoding) != 'utf-8') {
166 debug('Converting to UTF-8'); 198 debug('Converting to UTF-8');
167 $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); 199 $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
168 /*
169 if (function_exists('iconv')) {
170 // iconv appears to handle certain character encodings better than mb_convert_encoding
171 $html = iconv($encoding, 'utf-8', $html);
172 } else {
173 $html = mb_convert_encoding($html, 'utf-8', $encoding);
174 }
175 */
176 } 200 }
177 } 201 }
178 } 202 }
@@ -196,7 +220,7 @@ function makeAbsolute($base, $elem) {
196} 220}
197function makeAbsoluteAttr($base, $e, $attr) { 221function makeAbsoluteAttr($base, $e, $attr) {
198 if ($e->hasAttribute($attr)) { 222 if ($e->hasAttribute($attr)) {
199 // Trim leading and trailing white space. I don't really like this but 223 // Trim leading and trailing white space. I don't really like this but
200 // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" /> 224 // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
201 $url = trim(str_replace('%20', ' ', $e->getAttribute($attr))); 225 $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
202 $url = str_replace(' ', '%20', $url); 226 $url = str_replace(' ', '%20', $url);
diff --git a/inc/3rdparty/site_config/custom/dailymotion.com.txt b/inc/3rdparty/site_config/custom/dailymotion.com.txt
new file mode 100755
index 00000000..0cad808f
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/dailymotion.com.txt
@@ -0,0 +1,12 @@
1title: //title
2body: //iframe
3
4replace_string(<![CDATA[): _
5replace_string(]]>): _
6
7single_page_link: //link[@type='application/xml+oembed']
8
9prune: no
10tidy: no
11
12http://www.dailymotion.com/video/x1vk5oh_before-they-were-on-game-of-thrones_people
diff --git a/inc/3rdparty/site_config/custom/index.php b/inc/3rdparty/site_config/custom/index.php
new file mode 100644
index 00000000..a3d5f739
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/index.php
@@ -0,0 +1,3 @@
1<?php
2// this is here to prevent directory listing over the web
3?> \ No newline at end of file
diff --git a/inc/3rdparty/site_config/custom/ted.com.txt b/inc/3rdparty/site_config/custom/ted.com.txt
new file mode 100755
index 00000000..4940d2bc
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/ted.com.txt
@@ -0,0 +1,11 @@
1title: //title
2body: //div[@class='talk-article__body talk-transcript__body'] | //div[@class='media__image media__image--thumb talk-link__image']
3
4strip_id_or_class: talk-transcript__para__time
5
6single_page_link: //a[@id='hero-transcript-link']
7
8#prune: no
9tidy: no
10
11test_url: http://www.ted.com/talks/andrew_solomon_how_the_worst_moments_in_our_lives_make_us_who_we_are
diff --git a/inc/3rdparty/site_config/index.php b/inc/3rdparty/site_config/index.php
index a1b767fd..76ca8b3c 100644
--- a/inc/3rdparty/site_config/index.php
+++ b/inc/3rdparty/site_config/index.php
@@ -1,3 +1,2 @@
1<?php 1<?php
2// this is here to prevent directory listing over the web 2// this is here to prevent directory listing over the web \ No newline at end of file
3?> \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/version.txt b/inc/3rdparty/site_config/standard/version.txt
index bf0d87ab..eaf01ebd 100644
--- a/inc/3rdparty/site_config/standard/version.txt
+++ b/inc/3rdparty/site_config/standard/version.txt
@@ -1 +1 @@
4 \ No newline at end of file 2013-05-12T22:53:07Z \ No newline at end of file
diff --git a/inc/poche/Poche.class.php b/inc/poche/Poche.class.php
index 1b69cd61..37cf66a3 100755
--- a/inc/poche/Poche.class.php
+++ b/inc/poche/Poche.class.php
@@ -1142,11 +1142,12 @@ class Poche
1142 * return new purifier object with actual config 1142 * return new purifier object with actual config
1143 */ 1143 */
1144 protected function getPurifier() { 1144 protected function getPurifier() {
1145 $config = HTMLPurifier_Config::createDefault(); 1145 $config = HTMLPurifier_Config::createDefault();
1146 $config->set('Cache.SerializerPath', CACHE); 1146 $config->set('Cache.SerializerPath', CACHE);
1147 $config->set('HTML.SafeIframe', true); 1147 $config->set('HTML.SafeIframe', true);
1148 $config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/)%'); //allow YouTube and Vimeo$purifier = new HTMLPurifier($config); 1148 //allow YouTube, Vimeo and dailymotion videos
1149 1149 $config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/|www\.dailymotion\.com/embed/video/)%');
1150
1150 return new HTMLPurifier($config); 1151 return new HTMLPurifier($config);
1151 } 1152 }
1152 1153