aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--CONTRIBUTING.md2
-rwxr-xr-xinc/3rdparty/config.php104
-rw-r--r--inc/3rdparty/libraries/content-extractor/ContentExtractor.php1455
-rw-r--r--inc/3rdparty/libraries/content-extractor/SiteConfig.php681
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/libraries/feedwriter/FeedItem.php100
-rwxr-xr-xinc/3rdparty/libraries/feedwriter/FeedWriter.php15
-rw-r--r--inc/3rdparty/libraries/html5/TreeBuilder.php13
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/CookieJar.php807
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php1589
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php157
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect.php992
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php57
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php339
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php (renamed from inc/3rdparty/libraries/language-detect/Parser.php)19
-rw-r--r--inc/3rdparty/libraries/readability/Readability.php2274
-rwxr-xr-xinc/3rdparty/makefulltextfeed.php353
-rwxr-xr-xinc/3rdparty/makefulltextfeedHelpers.php42
-rwxr-xr-xinc/3rdparty/site_config/custom/dailymotion.com.txt12
-rw-r--r--inc/3rdparty/site_config/custom/index.php3
-rwxr-xr-xinc/3rdparty/site_config/custom/ted.com.txt11
-rw-r--r--inc/3rdparty/site_config/index.php5
-rw-r--r--inc/3rdparty/site_config/standard/version.txt2
-rwxr-xr-xinc/poche/Poche.class.php44
-rwxr-xr-xinc/poche/Tools.class.php8
-rwxr-xr-xinc/poche/config.inc.default.php7
-rw-r--r--inc/poche/pochePictures.php57
-rwxr-xr-xindex.php7
-rwxr-xr-xthemes/baggy/css/main.css5
-rwxr-xr-x[-rw-r--r--]themes/default/_search-form.twig14
-rwxr-xr-xthemes/default/css/style.css21
30 files changed, 4960 insertions, 4235 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 76708404..9ccb0b14 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
1# How contributing 1# How to contribute
2 2
3## You found a bug 3## You found a bug
4Please [open a new issue](https://github.com/wallabag/wallabag/issues/new). 4Please [open a new issue](https://github.com/wallabag/wallabag/issues/new).
diff --git a/inc/3rdparty/config.php b/inc/3rdparty/config.php
index e618117b..ec680d86 100755
--- a/inc/3rdparty/config.php
+++ b/inc/3rdparty/config.php
@@ -19,7 +19,7 @@ if (!isset($options)) $options = new stdClass();
19// Enable service 19// Enable service
20// ---------------------- 20// ----------------------
21// Set this to false if you want to disable the service. 21// Set this to false if you want to disable the service.
22// If set to false, no feed is produced and users will 22// If set to false, no feed is produced and users will
23// be told that the service is disabled. 23// be told that the service is disabled.
24$options->enabled = true; 24$options->enabled = true;
25 25
@@ -43,10 +43,64 @@ $options->default_entries = 5;
43// ---------------------- 43// ----------------------
44// The maximum number of feed items to process when no access key is supplied. 44// The maximum number of feed items to process when no access key is supplied.
45// This limits the user-supplied &max=x value. For example, if the user 45// This limits the user-supplied &max=x value. For example, if the user
46// asks for 20 items to be processed (&max=20), if max_entries is set to 46// asks for 20 items to be processed (&max=20), if max_entries is set to
47// 10, only 10 will be processed. 47// 10, only 10 will be processed.
48$options->max_entries = 10; 48$options->max_entries = 10;
49 49
50// Full content
51// ----------------------
52// By default Full-Text RSS includes the extracted content in the output.
53// You can exclude this from the output by passing '&content=0' in the querystring.
54//
55// Possible values...
56// Always include: true
57// Never include: false
58// Include unless user overrides (&content=0): 'user' (default)
59//
60// Note: currently this does not disable full content extraction. It simply omits it
61// from the output.
62$options->content = 'user';
63
64// Excerpts
65// ----------------------
66// By default Full-Text RSS does not include excerpts in the output.
67// You can enable this by passing '&summary=1' in the querystring.
68// This will include a plain text excerpt from the extracted content.
69//
70// Possible values...
71// Always include: true (recommended for new users)
72// Never include: false
73// Don't include unless user overrides (&summary=1): 'user' (default)
74//
75// Important: if both content and excerpts are requested, the excerpt will be
76// placed in the description element and the full content inside content:encoded.
77// If excerpts are not requested, the full content will go inside the description element.
78//
79// Why are we not returning both excerpts and content by default?
80// Mainly for backward compatibility.
81// Excerpts should appear in the feed item's description element. Previous versions
82// of Full-Text RSS did not return excerpts, so the description element was always
83// used for the full content (as recommended by the RSS advisory). When returning both,
84// we need somewhere else to place the content (content:encoded).
85// Having both enabled should not create any problems for news readers, but it may create
86// problems for developers upgrading from one of our earlier versions who may now find
87// their applications are returning excerpts instead of the full content they were
88// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,
89// excerpts must be explicitly requested in the querystring by default.
90//
91// Why not use a different element name for excerpts?
92// According to the RSS advisory:
93// "Publishers who employ summaries should store the summary in description and
94// the full content in content:encoded, ordering description first within the item.
95// On items with no summary, the full content should be stored in description."
96// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded
97//
98// For more consistent element naming, we recommend new users set this option to true.
99// The full content can still be excluded via the querystring, but the element names
100// will not change: when $options->summary = true, the description element will always
101// be reserved for the excerpt and content:encoded always for full content.
102$options->summary = 'user';
103
50// Rewrite relative URLs 104// Rewrite relative URLs
51// ---------------------- 105// ----------------------
52// With this enabled relative URLs found in the extracted content 106// With this enabled relative URLs found in the extracted content
@@ -67,7 +121,7 @@ $options->exclude_items_on_fail = 'user';
67// Enable multi-page support 121// Enable multi-page support
68// ------------------------- 122// -------------------------
69// If enabled, we will try to follow next page links on multi-page articles. 123// If enabled, we will try to follow next page links on multi-page articles.
70// Currently this only happens for sites where next_page_link has been defined 124// Currently this only happens for sites where next_page_link has been defined
71// in a site config file. 125// in a site config file.
72$options->multipage = true; 126$options->multipage = true;
73 127
@@ -125,10 +179,10 @@ $options->detect_language = 1;
125 179
126// Registration key 180// Registration key
127// --------------- 181// ---------------
128// The registration key is optional. It is not required to use Full-Text RSS, 182// The registration key is optional. It is not required to use Full-Text RSS,
129// and does not affect the normal operation of Full-Text RSS. It is currently 183// and does not affect the normal operation of Full-Text RSS. It is currently
130// only used on admin pages which help you update site patterns with the 184// only used on admin pages which help you update site patterns with the
131// latest version offered by FiveFilters.org. For these admin-related 185// latest version offered by FiveFilters.org. For these admin-related
132// tasks to complete, we will require a valid registration key. 186// tasks to complete, we will require a valid registration key.
133// If you would like one, you can purchase the latest version of Full-Text RSS 187// If you would like one, you can purchase the latest version of Full-Text RSS
134// at http://fivefilters.org/content-only/ 188// at http://fivefilters.org/content-only/
@@ -144,12 +198,12 @@ $options->registration_key = '';
144// ---------------------- 198// ----------------------
145// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials. 199// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials.
146// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages. 200// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages.
147// If no password or username is set, pages requiring admin privelages will be inaccessible. 201// If no password or username is set, pages requiring admin privelages will be inaccessible.
148// The default username is 'admin'. 202// The default username is 'admin'.
149// If overriding with an environment variable, separate username and password with a colon, e.g.: 203// If overriding with an environment variable, separate username and password with a colon, e.g.:
150// ftr_admin_credentials: admin:my-secret-password 204// ftr_admin_credentials: admin:my-secret-password
151// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password'); 205// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');
152$options->admin_credentials = array('username'=>'admin', 'password'=>'admin'); 206$options->admin_credentials = array('username'=>'admin', 'password'=>'');
153 207
154// URLs to allow 208// URLs to allow
155// ---------------------- 209// ----------------------
@@ -178,12 +232,12 @@ $options->key_required = false;
178// ---------------------- 232// ----------------------
179// By default, when processing feeds, we assume item titles in the feed 233// By default, when processing feeds, we assume item titles in the feed
180// have not been truncated. So after processing web pages, the extracted titles 234// have not been truncated. So after processing web pages, the extracted titles
181// are not used in the generated feed. If you prefer to have extracted titles in 235// are not used in the generated feed. If you prefer to have extracted titles in
182// the feed you can either set this to false, in which case we will always favour 236// the feed you can either set this to false, in which case we will always favour
183// extracted titles. Alternatively, if set to 'user' (default) we'll use the 237// extracted titles. Alternatively, if set to 'user' (default) we'll use the
184// extracted title if you pass '&use_extracted_title' in the querystring. 238// extracted title if you pass '&use_extracted_title' in the querystring.
185// Possible values: 239// Possible values:
186// * Favour feed titles: true 240// * Favour feed titles: true
187// * Favour extracted titles: false 241// * Favour extracted titles: false
188// * Favour feed titles with user override: 'user' (default) 242// * Favour feed titles with user override: 'user' (default)
189// Note: this has no effect when the input URL is to a web page - in these cases 243// Note: this has no effect when the input URL is to a web page - in these cases
@@ -192,17 +246,17 @@ $options->favour_feed_titles = 'user';
192 246
193// Access keys (password protected access) 247// Access keys (password protected access)
194// ------------------------------------ 248// ------------------------------------
195// NOTE: You do not need an API key from fivefilters.org to run your own 249// NOTE: You do not need an API key from fivefilters.org to run your own
196// copy of the code. This is here if you'd like to restrict access to 250// copy of the code. This is here if you'd like to restrict access to
197// _your_ copy. 251// _your_ copy.
198// Keys let you group users - those with a key and those without - and 252// Keys let you group users - those with a key and those without - and
199// restrict access to the service to those without a key. 253// restrict access to the service to those without a key.
200// If you want everyone to access the service in the same way, you can 254// If you want everyone to access the service in the same way, you can
201// leave the array below empty and ignore the access key options further down. 255// leave the array below empty and ignore the access key options further down.
202// The options further down let you control how the service should behave 256// The options further down let you control how the service should behave
203// in each mode. 257// in each mode.
204// Note: Explicitly including the index number (1 and 2 in the examples below) 258// Note: Explicitly including the index number (1 and 2 in the examples below)
205// is highly recommended (when generating feeds, we encode the key and 259// is highly recommended (when generating feeds, we encode the key and
206// refer to it by index number and hash). 260// refer to it by index number and hash).
207$options->api_keys = array(); 261$options->api_keys = array();
208// Example: 262// Example:
@@ -232,13 +286,13 @@ $options->max_entries_with_key = 10;
232// filter the resulting HTML for XSS attacks, making it redundant for 286// filter the resulting HTML for XSS attacks, making it redundant for
233// Full-Text RSS do the same. Similarly with frameworks/CMS which display 287// Full-Text RSS do the same. Similarly with frameworks/CMS which display
234// feed content - the content should be treated like any other user-submitted content. 288// feed content - the content should be treated like any other user-submitted content.
235// 289//
236// If you are writing an application yourself which is processing feeds generated by 290// If you are writing an application yourself which is processing feeds generated by
237// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks 291// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks
238// or enable this option. This might be useful if you are processing our generated 292// or enable this option. This might be useful if you are processing our generated
239// feeds with JavaScript on the client side - although there's client side xss 293// feeds with JavaScript on the client side - although there's client side xss
240// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer 294// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer
241// 295//
242// If enabled, we'll pass retrieved HTML content through htmLawed with 296// If enabled, we'll pass retrieved HTML content through htmLawed with
243// safe flag on and style attributes denied, see 297// safe flag on and style attributes denied, see
244// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6 298// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6
@@ -253,8 +307,8 @@ $options->xss_filter = 'user';
253// Allowed parsers 307// Allowed parsers
254// ---------------------- 308// ----------------------
255// Full-Text RSS attempts to use PHP's libxml extension to process HTML. 309// Full-Text RSS attempts to use PHP's libxml extension to process HTML.
256// While fast, on some sites it may not always produce good results. 310// While fast, on some sites it may not always produce good results.
257// For these sites, you can specify an alternative HTML parser: 311// For these sites, you can specify an alternative HTML parser:
258// parser: html5lib 312// parser: html5lib
259// The html5lib parser is bundled with Full-Text RSS. 313// The html5lib parser is bundled with Full-Text RSS.
260// see http://code.google.com/p/html5lib/ 314// see http://code.google.com/p/html5lib/
@@ -273,7 +327,7 @@ $options->cors = false;
273 327
274// Use APC user cache? 328// Use APC user cache?
275// ---------------------- 329// ----------------------
276// If enabled we will store site config files (when requested 330// If enabled we will store site config files (when requested
277// for the first time) in APC's user cache. Keys prefixed with 'sc.' 331// for the first time) in APC's user cache. Keys prefixed with 'sc.'
278// This improves performance by reducing disk access. 332// This improves performance by reducing disk access.
279// Note: this has no effect if APC is unavailable on your server. 333// Note: this has no effect if APC is unavailable on your server.
@@ -346,7 +400,7 @@ $options->rewrite_url = array(
346// Valid actions: 400// Valid actions:
347// * 'exclude' - exclude this item from the result 401// * 'exclude' - exclude this item from the result
348// * 'link' - create HTML link to the item 402// * 'link' - create HTML link to the item
349$options->content_type_exc = array( 403$options->content_type_exc = array(
350 'application/pdf' => array('action'=>'link', 'name'=>'PDF'), 404 'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
351 'image' => array('action'=>'link', 'name'=>'Image'), 405 'image' => array('action'=>'link', 'name'=>'Image'),
352 'audio' => array('action'=>'link', 'name'=>'Audio'), 406 'audio' => array('action'=>'link', 'name'=>'Audio'),
@@ -375,13 +429,13 @@ $options->cache_cleanup = 100;
375/// DO NOT CHANGE ANYTHING BELOW THIS /////////// 429/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
376///////////////////////////////////////////////// 430/////////////////////////////////////////////////
377 431
378if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1'); 432if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
379 433
380if (basename(__FILE__) == 'config.php') { 434if (basename(__FILE__) == 'config.php') {
381 if (file_exists(dirname(__FILE__).'/custom_config.php')) { 435 if (file_exists(dirname(__FILE__).'/custom_config.php')) {
382 require_once dirname(__FILE__).'/custom_config.php'; 436 require_once dirname(__FILE__).'/custom_config.php';
383 } 437 }
384 438
385 // check for environment variables - often used on cloud platforms 439 // check for environment variables - often used on cloud platforms
386 // environment variables should be prefixed with 'ftr_', e.g. 440 // environment variables should be prefixed with 'ftr_', e.g.
387 // ftr_max_entries: 1 441 // ftr_max_entries: 1
diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
index ddd33bb5..21e693e7 100644
--- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
+++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
@@ -1,728 +1,727 @@
1<?php 1<?php
2/** 2/**
3 * Content Extractor 3 * Content Extractor
4 * 4 *
5 * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 5 * Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
6 * to extract content from HTML files. 6 * to extract content from HTML files.
7 * 7 *
8 * @version 1.0 8 * @version 1.0
9 * @date 2013-02-05 9 * @date 2013-02-05
10 * @author Keyvan Minoukadeh 10 * @author Keyvan Minoukadeh
11 * @copyright 2013 Keyvan Minoukadeh 11 * @copyright 2013 Keyvan Minoukadeh
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
13 */ 13 */
14 14
15class ContentExtractor 15class ContentExtractor
16{ 16{
17 protected static $tidy_config = array( 17 protected static $tidy_config = array(
18 'clean' => true, 18 'clean' => true,
19 'output-xhtml' => true, 19 'output-xhtml' => true,
20 'logical-emphasis' => true, 20 'logical-emphasis' => true,
21 'show-body-only' => false, 21 'show-body-only' => false,
22 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 22 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
23 'new-inline-tags' => 'mark, time, meter, progress, data', 23 'new-inline-tags' => 'mark, time, meter, progress, data',
24 'wrap' => 0, 24 'wrap' => 0,
25 'drop-empty-paras' => true, 25 'drop-empty-paras' => true,
26 'drop-proprietary-attributes' => false, 26 'drop-proprietary-attributes' => false,
27 'enclose-text' => true, 27 'enclose-text' => true,
28 'enclose-block-text' => true, 28 'enclose-block-text' => true,
29 'merge-divs' => true, 29 'merge-divs' => true,
30 'merge-spans' => true, 30 'merge-spans' => true,
31 'char-encoding' => 'utf8', 31 'char-encoding' => 'utf8',
32 'hide-comments' => true 32 'hide-comments' => true
33 ); 33 );
34 protected $html; 34 protected $html;
35 protected $config; 35 protected $config;
36 protected $title; 36 protected $title;
37 protected $author = array(); 37 protected $author = array();
38 protected $language; 38 protected $language;
39 protected $date; 39 protected $date;
40 protected $body; 40 protected $body;
41 protected $success = false; 41 protected $success = false;
42 protected $nextPageUrl; 42 protected $nextPageUrl;
43 public $allowedParsers = array('libxml', 'html5lib'); 43 public $allowedParsers = array('libxml', 'html5lib');
44 public $fingerprints = array(); 44 public $fingerprints = array();
45 public $readability; 45 public $readability;
46 public $debug = false; 46 public $debug = false;
47 public $debugVerbose = false; 47 public $debugVerbose = false;
48 48
49 function __construct($path, $fallback=null) { 49 function __construct($path, $fallback=null) {
50 SiteConfig::set_config_path($path, $fallback); 50 SiteConfig::set_config_path($path, $fallback);
51 } 51 }
52 52
53 protected function debug($msg) { 53 protected function debug($msg) {
54 if ($this->debug) { 54 if ($this->debug) {
55 $mem = round(memory_get_usage()/1024, 2); 55 $mem = round(memory_get_usage()/1024, 2);
56 $memPeak = round(memory_get_peak_usage()/1024, 2); 56 $memPeak = round(memory_get_peak_usage()/1024, 2);
57 echo '* ',$msg; 57 echo '* ',$msg;
58 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; 58 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
59 echo "\n"; 59 echo "\n";
60 ob_flush(); 60 ob_flush();
61 flush(); 61 flush();
62 } 62 }
63 } 63 }
64 64
65 public function reset() { 65 public function reset() {
66 $this->html = null; 66 $this->html = null;
67 $this->readability = null; 67 $this->readability = null;
68 $this->config = null; 68 $this->config = null;
69 $this->title = null; 69 $this->title = null;
70 $this->body = null; 70 $this->body = null;
71 $this->author = array(); 71 $this->author = array();
72 $this->language = null; 72 $this->language = null;
73 $this->date = null; 73 $this->date = null;
74 $this->nextPageUrl = null; 74 $this->nextPageUrl = null;
75 $this->success = false; 75 $this->success = false;
76 } 76 }
77 77
78 public function findHostUsingFingerprints($html) { 78 public function findHostUsingFingerprints($html) {
79 $this->debug('Checking fingerprints...'); 79 $this->debug('Checking fingerprints...');
80 $head = substr($html, 0, 8000); 80 $head = substr($html, 0, 8000);
81 foreach ($this->fingerprints as $_fp => $_fphost) { 81 foreach ($this->fingerprints as $_fp => $_fphost) {
82 $lookin = 'html'; 82 $lookin = 'html';
83 if (is_array($_fphost)) { 83 if (is_array($_fphost)) {
84 if (isset($_fphost['head']) && $_fphost['head']) { 84 if (isset($_fphost['head']) && $_fphost['head']) {
85 $lookin = 'head'; 85 $lookin = 'head';
86 } 86 }
87 $_fphost = $_fphost['hostname']; 87 $_fphost = $_fphost['hostname'];
88 } 88 }
89 if (strpos($$lookin, $_fp) !== false) { 89 if (strpos($$lookin, $_fp) !== false) {
90 $this->debug("Found match: $_fphost"); 90 $this->debug("Found match: $_fphost");
91 return $_fphost; 91 return $_fphost;
92 } 92 }
93 } 93 }
94 $this->debug('No fingerprint matches'); 94 $this->debug('No fingerprint matches');
95 return false; 95 return false;
96 } 96 }
97 97
98 // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) 98 // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
99 public function buildSiteConfig($url, $html='', $add_to_cache=true) { 99 public function buildSiteConfig($url, $html='', $add_to_cache=true) {
100 // extract host name 100 // extract host name
101 $host = @parse_url($url, PHP_URL_HOST); 101 $host = @parse_url($url, PHP_URL_HOST);
102 $host = strtolower($host); 102 $host = strtolower($host);
103 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); 103 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
104 // is merged version already cached? 104 // is merged version already cached?
105 if (SiteConfig::is_cached("$host.merged")) { 105 if (SiteConfig::is_cached("$host.merged")) {
106 $this->debug("Returning cached and merged site config for $host"); 106 $this->debug("Returning cached and merged site config for $host");
107 return SiteConfig::build("$host.merged"); 107 return SiteConfig::build("$host.merged");
108 } 108 }
109 // let's build from site_config/custom/ and standard/ 109 // let's build from site_config/custom/ and standard/
110 $config = SiteConfig::build($host); 110 $config = SiteConfig::build($host);
111 if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) { 111 if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
112 SiteConfig::add_to_cache($host, $config); 112 SiteConfig::add_to_cache($host, $config);
113 } 113 }
114 // if no match, use defaults 114 // if no match, use defaults
115 if (!$config) $config = new SiteConfig(); 115 if (!$config) $config = new SiteConfig();
116 // load fingerprint config? 116 // load fingerprint config?
117 if ($config->autodetect_on_failure()) { 117 if ($config->autodetect_on_failure()) {
118 // check HTML for fingerprints 118 // check HTML for fingerprints
119 if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) { 119 if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
120 if ($config_fingerprint = SiteConfig::build($_fphost)) { 120 if ($config_fingerprint = SiteConfig::build($_fphost)) {
121 $this->debug("Appending site config settings from $_fphost (fingerprint match)"); 121 $this->debug("Appending site config settings from $_fphost (fingerprint match)");
122 $config->append($config_fingerprint); 122 $config->append($config_fingerprint);
123 if ($add_to_cache && !SiteConfig::is_cached($_fphost)) { 123 if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
124 //$config_fingerprint->cache_in_apc = true; 124 //$config_fingerprint->cache_in_apc = true;
125 SiteConfig::add_to_cache($_fphost, $config_fingerprint); 125 SiteConfig::add_to_cache($_fphost, $config_fingerprint);
126 } 126 }
127 } 127 }
128 } 128 }
129 } 129 }
130 // load global config? 130 // load global config?
131 if ($config->autodetect_on_failure()) { 131 if ($config->autodetect_on_failure()) {
132 if ($config_global = SiteConfig::build('global', true)) { 132 if ($config_global = SiteConfig::build('global', true)) {
133 $this->debug('Appending site config settings from global.txt'); 133 $this->debug('Appending site config settings from global.txt');
134 $config->append($config_global); 134 $config->append($config_global);
135 if ($add_to_cache && !SiteConfig::is_cached('global')) { 135 if ($add_to_cache && !SiteConfig::is_cached('global')) {
136 //$config_global->cache_in_apc = true; 136 //$config_global->cache_in_apc = true;
137 SiteConfig::add_to_cache('global', $config_global); 137 SiteConfig::add_to_cache('global', $config_global);
138 } 138 }
139 } 139 }
140 } 140 }
141 // store copy of merged config 141 // store copy of merged config
142 if ($add_to_cache) { 142 if ($add_to_cache) {
143 // do not store in APC if wildcard match 143 // do not store in APC if wildcard match
144 $use_apc = ($host == $config->cache_key); 144 $use_apc = ($host == $config->cache_key);
145 $config->cache_key = null; 145 $config->cache_key = null;
146 SiteConfig::add_to_cache("$host.merged", $config, $use_apc); 146 SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
147 } 147 }
148 return $config; 148 return $config;
149 } 149 }
150 150
151 // returns true on success, false on failure 151 // returns true on success, false on failure
152 // $smart_tidy indicates that if tidy is used and no results are produced, we will 152 // $smart_tidy indicates that if tidy is used and no results are produced, we will
153 // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time 153 // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
154 // but it has problems of its own which we try to avoid with this option. 154 // but it has problems of its own which we try to avoid with this option.
155 public function process($html, $url, $smart_tidy=true) { 155 public function process($html, $url, $smart_tidy=true) {
156 $this->reset(); 156 $this->reset();
157 $this->config = $this->buildSiteConfig($url, $html); 157 $this->config = $this->buildSiteConfig($url, $html);
158 158
159 // do string replacements 159 // do string replacements
160 if (!empty($this->config->find_string)) { 160 if (!empty($this->config->find_string)) {
161 if (count($this->config->find_string) == count($this->config->replace_string)) { 161 if (count($this->config->find_string) == count($this->config->replace_string)) {
162 $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); 162 $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
163 $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); 163 $this->debug("Strings replaced: $_count (find_string and/or replace_string)");
164 } else { 164 } else {
165 $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); 165 $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
166 } 166 }
167 unset($_count); 167 unset($_count);
168 } 168 }
169 169
170 // use tidy (if it exists)? 170 // use tidy (if it exists)?
171 // This fixes problems with some sites which would otherwise 171 // This fixes problems with some sites which would otherwise
172 // trouble DOMDocument's HTML parsing. (Although sometimes it 172 // trouble DOMDocument's HTML parsing. (Although sometimes it
173 // makes matters worse, which is why you can override it in site config files.) 173 // makes matters worse, which is why you can override it in site config files.)
174 $tidied = false; 174 $tidied = false;
175 if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { 175 if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
176 $this->debug('Using Tidy'); 176 $this->debug('Using Tidy');
177 $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); 177 $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
178 if (tidy_clean_repair($tidy)) { 178 if (tidy_clean_repair($tidy)) {
179 $original_html = $html; 179 $original_html = $html;
180 $tidied = true; 180 $tidied = true;
181 $html = $tidy->value; 181 $html = $tidy->value;
182 } 182 }
183 unset($tidy); 183 unset($tidy);
184 } 184 }
185 185
186 // load and parse html 186 // load and parse html
187 $_parser = $this->config->parser(); 187 $_parser = $this->config->parser();
188 if (!in_array($_parser, $this->allowedParsers)) { 188 if (!in_array($_parser, $this->allowedParsers)) {
189 $this->debug("HTML parser $_parser not listed, using libxml instead"); 189 $this->debug("HTML parser $_parser not listed, using libxml instead");
190 $_parser = 'libxml'; 190 $_parser = 'libxml';
191 } 191 }
192 $this->debug("Attempting to parse HTML with $_parser"); 192 $this->debug("Attempting to parse HTML with $_parser");
193 $this->readability = new Readability($html, $url, $_parser); 193 $this->readability = new Readability($html, $url, $_parser);
194 194
195 // we use xpath to find elements in the given HTML document 195 // we use xpath to find elements in the given HTML document
196 // see http://en.wikipedia.org/wiki/XPath_1.0 196 // see http://en.wikipedia.org/wiki/XPath_1.0
197 $xpath = new DOMXPath($this->readability->dom); 197 $xpath = new DOMXPath($this->readability->dom);
198 198
199 // try to get next page link 199 // try to get next page link
200 foreach ($this->config->next_page_link as $pattern) { 200 foreach ($this->config->next_page_link as $pattern) {
201 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 201 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
202 if (is_string($elems)) { 202 if (is_string($elems)) {
203 $this->nextPageUrl = trim($elems); 203 $this->nextPageUrl = trim($elems);
204 break; 204 break;
205 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 205 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
206 foreach ($elems as $item) { 206 foreach ($elems as $item) {
207 if ($item instanceof DOMElement && $item->hasAttribute('href')) { 207 if ($item instanceof DOMElement && $item->hasAttribute('href')) {
208 $this->nextPageUrl = $item->getAttribute('href'); 208 $this->nextPageUrl = $item->getAttribute('href');
209 break 2; 209 break 2;
210 } elseif ($item instanceof DOMAttr && $item->value) { 210 } elseif ($item instanceof DOMAttr && $item->value) {
211 $this->nextPageUrl = $item->value; 211 $this->nextPageUrl = $item->value;
212 break 2; 212 break 2;
213 } 213 }
214 } 214 }
215 } 215 }
216 } 216 }
217 217
218 // try to get title 218 // try to get title
219 foreach ($this->config->title as $pattern) { 219 foreach ($this->config->title as $pattern) {
220 // $this->debug("Trying $pattern"); 220 // $this->debug("Trying $pattern");
221 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 221 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
222 if (is_string($elems)) { 222 if (is_string($elems)) {
223 $this->title = trim($elems); 223 $this->title = trim($elems);
224 $this->debug('Title expression evaluated as string: '.$this->title); 224 $this->debug('Title expression evaluated as string: '.$this->title);
225 $this->debug("...XPath match: $pattern"); 225 $this->debug("...XPath match: $pattern");
226 break; 226 break;
227 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 227 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
228 $this->title = $elems->item(0)->textContent; 228 $this->title = $elems->item(0)->textContent;
229 $this->debug('Title matched: '.$this->title); 229 $this->debug('Title matched: '.$this->title);
230 $this->debug("...XPath match: $pattern"); 230 $this->debug("...XPath match: $pattern");
231 // remove title from document 231 // remove title from document
232 try { 232 try {
233 $elems->item(0)->parentNode->removeChild($elems->item(0)); 233 @$elems->item(0)->parentNode->removeChild($elems->item(0));
234 } catch (DOMException $e) { 234 } catch (DOMException $e) {
235 // do nothing 235 // do nothing
236 } 236 }
237 break; 237 break;
238 } 238 }
239 } 239 }
240 240
241 // try to get author (if it hasn't already been set) 241 // try to get author (if it hasn't already been set)
242 if (empty($this->author)) { 242 if (empty($this->author)) {
243 foreach ($this->config->author as $pattern) { 243 foreach ($this->config->author as $pattern) {
244 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 244 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
245 if (is_string($elems)) { 245 if (is_string($elems)) {
246 if (trim($elems) != '') { 246 if (trim($elems) != '') {
247 $this->author[] = trim($elems); 247 $this->author[] = trim($elems);
248 $this->debug('Author expression evaluated as string: '.trim($elems)); 248 $this->debug('Author expression evaluated as string: '.trim($elems));
249 $this->debug("...XPath match: $pattern"); 249 $this->debug("...XPath match: $pattern");
250 break; 250 break;
251 } 251 }
252 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 252 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
253 foreach ($elems as $elem) { 253 foreach ($elems as $elem) {
254 if (!isset($elem->parentNode)) continue; 254 if (!isset($elem->parentNode)) continue;
255 $this->author[] = trim($elem->textContent); 255 $this->author[] = trim($elem->textContent);
256 $this->debug('Author matched: '.trim($elem->textContent)); 256 $this->debug('Author matched: '.trim($elem->textContent));
257 } 257 }
258 if (!empty($this->author)) { 258 if (!empty($this->author)) {
259 $this->debug("...XPath match: $pattern"); 259 $this->debug("...XPath match: $pattern");
260 break; 260 break;
261 } 261 }
262 } 262 }
263 } 263 }
264 } 264 }
265 265
266 // try to get language 266 // try to get language
267 $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); 267 $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
268 foreach ($_lang_xpath as $pattern) { 268 foreach ($_lang_xpath as $pattern) {
269 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 269 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
270 if (is_string($elems)) { 270 if (is_string($elems)) {
271 if (trim($elems) != '') { 271 if (trim($elems) != '') {
272 $this->language = trim($elems); 272 $this->language = trim($elems);
273 $this->debug('Language matched: '.$this->language); 273 $this->debug('Language matched: '.$this->language);
274 break; 274 break;
275 } 275 }
276 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 276 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
277 foreach ($elems as $elem) { 277 foreach ($elems as $elem) {
278 if (!isset($elem->parentNode)) continue; 278 if (!isset($elem->parentNode)) continue;
279 $this->language = trim($elem->textContent); 279 $this->language = trim($elem->textContent);
280 $this->debug('Language matched: '.$this->language); 280 $this->debug('Language matched: '.$this->language);
281 } 281 }
282 if ($this->language) break; 282 if ($this->language) break;
283 } 283 }
284 } 284 }
285 285
286 // try to get date 286 // try to get date
287 foreach ($this->config->date as $pattern) { 287 foreach ($this->config->date as $pattern) {
288 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 288 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
289 if (is_string($elems)) { 289 if (is_string($elems)) {
290 $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); 290 $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));
291 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 291 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
292 $this->date = $elems->item(0)->textContent; 292 $this->date = $elems->item(0)->textContent;
293 $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); 293 $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));
294 // remove date from document 294 // remove date from document
295 // $elems->item(0)->parentNode->removeChild($elems->item(0)); 295 // $elems->item(0)->parentNode->removeChild($elems->item(0));
296 } 296 }
297 if (!$this->date) { 297 if (!$this->date) {
298 $this->date = null; 298 $this->date = null;
299 } else { 299 } else {
300 $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); 300 $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));
301 $this->debug("...XPath match: $pattern"); 301 $this->debug("...XPath match: $pattern");
302 break; 302 break;
303 } 303 }
304 } 304 }
305 305
306 // strip elements (using xpath expressions) 306 // strip elements (using xpath expressions)
307 foreach ($this->config->strip as $pattern) { 307 foreach ($this->config->strip as $pattern) {
308 $elems = @$xpath->query($pattern, $this->readability->dom); 308 $elems = @$xpath->query($pattern, $this->readability->dom);
309 // check for matches 309 // check for matches
310 if ($elems && $elems->length > 0) { 310 if ($elems && $elems->length > 0) {
311 $this->debug('Stripping '.$elems->length.' elements (strip)'); 311 $this->debug('Stripping '.$elems->length.' elements (strip)');
312 for ($i=$elems->length-1; $i >= 0; $i--) { 312 for ($i=$elems->length-1; $i >= 0; $i--) {
313 $elems->item($i)->parentNode->removeChild($elems->item($i)); 313 $elems->item($i)->parentNode->removeChild($elems->item($i));
314 } 314 }
315 } 315 }
316 } 316 }
317 317
318 // strip elements (using id and class attribute values) 318 // strip elements (using id and class attribute values)
319 foreach ($this->config->strip_id_or_class as $string) { 319 foreach ($this->config->strip_id_or_class as $string) {
320 $string = strtr($string, array("'"=>'', '"'=>'')); 320 $string = strtr($string, array("'"=>'', '"'=>''));
321 $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); 321 $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
322 // check for matches 322 // check for matches
323 if ($elems && $elems->length > 0) { 323 if ($elems && $elems->length > 0) {
324 $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); 324 $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
325 for ($i=$elems->length-1; $i >= 0; $i--) { 325 for ($i=$elems->length-1; $i >= 0; $i--) {
326 $elems->item($i)->parentNode->removeChild($elems->item($i)); 326 $elems->item($i)->parentNode->removeChild($elems->item($i));
327 } 327 }
328 } 328 }
329 } 329 }
330 330
331 // strip images (using src attribute values) 331 // strip images (using src attribute values)
332 foreach ($this->config->strip_image_src as $string) { 332 foreach ($this->config->strip_image_src as $string) {
333 $string = strtr($string, array("'"=>'', '"'=>'')); 333 $string = strtr($string, array("'"=>'', '"'=>''));
334 $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); 334 $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
335 // check for matches 335 // check for matches
336 if ($elems && $elems->length > 0) { 336 if ($elems && $elems->length > 0) {
337 $this->debug('Stripping '.$elems->length.' image elements'); 337 $this->debug('Stripping '.$elems->length.' image elements');
338 for ($i=$elems->length-1; $i >= 0; $i--) { 338 for ($i=$elems->length-1; $i >= 0; $i--) {
339 $elems->item($i)->parentNode->removeChild($elems->item($i)); 339 $elems->item($i)->parentNode->removeChild($elems->item($i));
340 } 340 }
341 } 341 }
342 } 342 }
343 // strip elements using Readability.com and Instapaper.com ignore class names 343 // strip elements using Readability.com and Instapaper.com ignore class names
344 // .entry-unrelated and .instapaper_ignore 344 // .entry-unrelated and .instapaper_ignore
345 // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines 345 // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
346 // and http://blog.instapaper.com/post/730281947 346 // and http://blog.instapaper.com/post/730281947
347 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); 347 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
348 // check for matches 348 // check for matches
349 if ($elems && $elems->length > 0) { 349 if ($elems && $elems->length > 0) {
350 $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); 350 $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');
351 for ($i=$elems->length-1; $i >= 0; $i--) { 351 for ($i=$elems->length-1; $i >= 0; $i--) {
352 $elems->item($i)->parentNode->removeChild($elems->item($i)); 352 $elems->item($i)->parentNode->removeChild($elems->item($i));
353 } 353 }
354 } 354 }
355 355
356 // strip elements that contain style="display: none;" 356 // strip elements that contain style="display: none;"
357 $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); 357 $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
358 // check for matches 358 // check for matches
359 if ($elems && $elems->length > 0) { 359 if ($elems && $elems->length > 0) {
360 $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); 360 $this->debug('Stripping '.$elems->length.' elements with inline display:none style');
361 for ($i=$elems->length-1; $i >= 0; $i--) { 361 for ($i=$elems->length-1; $i >= 0; $i--) {
362 $elems->item($i)->parentNode->removeChild($elems->item($i)); 362 $elems->item($i)->parentNode->removeChild($elems->item($i));
363 } 363 }
364 } 364 }
365 365
366 // try to get body 366 // try to get body
367 foreach ($this->config->body as $pattern) { 367 foreach ($this->config->body as $pattern) {
368 $elems = @$xpath->query($pattern, $this->readability->dom); 368 $elems = @$xpath->query($pattern, $this->readability->dom);
369 // check for matches 369 // check for matches
370 if ($elems && $elems->length > 0) { 370 if ($elems && $elems->length > 0) {
371 $this->debug('Body matched'); 371 $this->debug('Body matched');
372 $this->debug("...XPath match: $pattern"); 372 $this->debug("...XPath match: $pattern");
373 if ($elems->length == 1) { 373 if ($elems->length == 1) {
374 $this->body = $elems->item(0); 374 $this->body = $elems->item(0);
375 // prune (clean up elements that may not be content) 375 // prune (clean up elements that may not be content)
376 if ($this->config->prune()) { 376 if ($this->config->prune()) {
377 $this->debug('...pruning content'); 377 $this->debug('...pruning content');
378 $this->readability->prepArticle($this->body); 378 $this->readability->prepArticle($this->body);
379 } 379 }
380 break; 380 break;
381 } else { 381 } else {
382 $this->body = $this->readability->dom->createElement('div'); 382 $this->body = $this->readability->dom->createElement('div');
383 $this->debug($elems->length.' body elems found'); 383 $this->debug($elems->length.' body elems found');
384 foreach ($elems as $elem) { 384 foreach ($elems as $elem) {
385 if (!isset($elem->parentNode)) continue; 385 if (!isset($elem->parentNode)) continue;
386 $isDescendant = false; 386 $isDescendant = false;
387 foreach ($this->body->childNodes as $parent) { 387 foreach ($this->body->childNodes as $parent) {
388 if ($this->isDescendant($parent, $elem)) { 388 if ($this->isDescendant($parent, $elem)) {
389 $isDescendant = true; 389 $isDescendant = true;
390 break; 390 break;
391 } 391 }
392 } 392 }
393 if ($isDescendant) { 393 if ($isDescendant) {
394 $this->debug('...element is child of another body element, skipping.'); 394 $this->debug('...element is child of another body element, skipping.');
395 } else { 395 } else {
396 // prune (clean up elements that may not be content) 396 // prune (clean up elements that may not be content)
397 if ($this->config->prune()) { 397 if ($this->config->prune()) {
398 $this->debug('Pruning content'); 398 $this->debug('Pruning content');
399 $this->readability->prepArticle($elem); 399 $this->readability->prepArticle($elem);
400 } 400 }
401 $this->debug('...element added to body'); 401 $this->debug('...element added to body');
402 $this->body->appendChild($elem); 402 $this->body->appendChild($elem);
403 } 403 }
404 } 404 }
405 if ($this->body->hasChildNodes()) break; 405 if ($this->body->hasChildNodes()) break;
406 } 406 }
407 } 407 }
408 } 408 }
409 409
410 // auto detect? 410 // auto detect?
411 $detect_title = $detect_body = $detect_author = $detect_date = false; 411 $detect_title = $detect_body = $detect_author = $detect_date = false;
412 // detect title? 412 // detect title?
413 if (!isset($this->title)) { 413 if (!isset($this->title)) {
414 if (empty($this->config->title) || $this->config->autodetect_on_failure()) { 414 if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
415 $detect_title = true; 415 $detect_title = true;
416 } 416 }
417 } 417 }
418 // detect body? 418 // detect body?
419 if (!isset($this->body)) { 419 if (!isset($this->body)) {
420 if (empty($this->config->body) || $this->config->autodetect_on_failure()) { 420 if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
421 $detect_body = true; 421 $detect_body = true;
422 } 422 }
423 } 423 }
424 // detect author? 424 // detect author?
425 if (empty($this->author)) { 425 if (empty($this->author)) {
426 if (empty($this->config->author) || $this->config->autodetect_on_failure()) { 426 if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
427 $detect_author = true; 427 $detect_author = true;
428 } 428 }
429 } 429 }
430 // detect date? 430 // detect date?
431 if (!isset($this->date)) { 431 if (!isset($this->date)) {
432 if (empty($this->config->date) || $this->config->autodetect_on_failure()) { 432 if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
433 $detect_date = true; 433 $detect_date = true;
434 } 434 }
435 } 435 }
436 436
437 // check for hNews 437 // check for hNews
438 if ($detect_title || $detect_body) { 438 if ($detect_title || $detect_body) {
439 // check for hentry 439 // check for hentry
440 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); 440 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
441 if ($elems && $elems->length > 0) { 441 if ($elems && $elems->length > 0) {
442 $this->debug('hNews: found hentry'); 442 $this->debug('hNews: found hentry');
443 $hentry = $elems->item(0); 443 $hentry = $elems->item(0);
444 444
445 if ($detect_title) { 445 if ($detect_title) {
446 // check for entry-title 446 // check for entry-title
447 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); 447 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
448 if ($elems && $elems->length > 0) { 448 if ($elems && $elems->length > 0) {
449 $this->title = $elems->item(0)->textContent; 449 $this->title = $elems->item(0)->textContent;
450 $this->debug('hNews: found entry-title: '.$this->title); 450 $this->debug('hNews: found entry-title: '.$this->title);
451 // remove title from document 451 // remove title from document
452 $elems->item(0)->parentNode->removeChild($elems->item(0)); 452 $elems->item(0)->parentNode->removeChild($elems->item(0));
453 $detect_title = false; 453 $detect_title = false;
454 } 454 }
455 } 455 }
456 456
457 if ($detect_date) { 457 if ($detect_date) {
458 // check for time element with pubdate attribute 458 // check for time element with pubdate attribute
459 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); 459 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
460 if ($elems && $elems->length > 0) { 460 if ($elems && $elems->length > 0) {
461 $this->date = strtotime(trim($elems->item(0)->textContent)); 461 $this->date = strtotime(trim($elems->item(0)->textContent));
462 // remove date from document 462 // remove date from document
463 //$elems->item(0)->parentNode->removeChild($elems->item(0)); 463 //$elems->item(0)->parentNode->removeChild($elems->item(0));
464 if ($this->date) { 464 if ($this->date) {
465 $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); 465 $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
466 $detect_date = false; 466 $detect_date = false;
467 } else { 467 } else {
468 $this->date = null; 468 $this->date = null;
469 } 469 }
470 } 470 }
471 } 471 }
472 472
473 if ($detect_author) { 473 if ($detect_author) {
474 // check for time element with pubdate attribute 474 // check for time element with pubdate attribute
475 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); 475 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
476 if ($elems && $elems->length > 0) { 476 if ($elems && $elems->length > 0) {
477 $author = $elems->item(0); 477 $author = $elems->item(0);
478 $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); 478 $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
479 if ($fn && $fn->length > 0) { 479 if ($fn && $fn->length > 0) {
480 foreach ($fn as $_fn) { 480 foreach ($fn as $_fn) {
481 if (trim($_fn->textContent) != '') { 481 if (trim($_fn->textContent) != '') {
482 $this->author[] = trim($_fn->textContent); 482 $this->author[] = trim($_fn->textContent);
483 $this->debug('hNews: found author: '.trim($_fn->textContent)); 483 $this->debug('hNews: found author: '.trim($_fn->textContent));
484 } 484 }
485 } 485 }
486 } else { 486 } else {
487 if (trim($author->textContent) != '') { 487 if (trim($author->textContent) != '') {
488 $this->author[] = trim($author->textContent); 488 $this->author[] = trim($author->textContent);
489 $this->debug('hNews: found author: '.trim($author->textContent)); 489 $this->debug('hNews: found author: '.trim($author->textContent));
490 } 490 }
491 } 491 }
492 $detect_author = empty($this->author); 492 $detect_author = empty($this->author);
493 } 493 }
494 } 494 }
495 495
496 // check for entry-content. 496 // check for entry-content.
497 // according to hAtom spec, if there are multiple elements marked entry-content, 497 // according to hAtom spec, if there are multiple elements marked entry-content,
498 // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content 498 // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
499 if ($detect_body) { 499 if ($detect_body) {
500 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); 500 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
501 if ($elems && $elems->length > 0) { 501 if ($elems && $elems->length > 0) {
502 $this->debug('hNews: found entry-content'); 502 $this->debug('hNews: found entry-content');
503 if ($elems->length == 1) { 503 if ($elems->length == 1) {
504 // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) 504 // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
505 $e = $elems->item(0); 505 $e = $elems->item(0);
506 if (($e->tagName == 'img') || (trim($e->textContent) != '')) { 506 if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
507 $this->body = $elems->item(0); 507 $this->body = $elems->item(0);
508 // prune (clean up elements that may not be content) 508 // prune (clean up elements that may not be content)
509 if ($this->config->prune()) { 509 if ($this->config->prune()) {
510 $this->debug('Pruning content'); 510 $this->debug('Pruning content');
511 $this->readability->prepArticle($this->body); 511 $this->readability->prepArticle($this->body);
512 } 512 }
513 $detect_body = false; 513 $detect_body = false;
514 } else { 514 } else {
515 $this->debug('hNews: skipping entry-content - appears not to contain content'); 515 $this->debug('hNews: skipping entry-content - appears not to contain content');
516 } 516 }
517 unset($e); 517 unset($e);
518 } else { 518 } else {
519 $this->body = $this->readability->dom->createElement('div'); 519 $this->body = $this->readability->dom->createElement('div');
520 $this->debug($elems->length.' entry-content elems found'); 520 $this->debug($elems->length.' entry-content elems found');
521 foreach ($elems as $elem) { 521 foreach ($elems as $elem) {
522 if (!isset($elem->parentNode)) continue; 522 if (!isset($elem->parentNode)) continue;
523 $isDescendant = false; 523 $isDescendant = false;
524 foreach ($this->body->childNodes as $parent) { 524 foreach ($this->body->childNodes as $parent) {
525 if ($this->isDescendant($parent, $elem)) { 525 if ($this->isDescendant($parent, $elem)) {
526 $isDescendant = true; 526 $isDescendant = true;
527 break; 527 break;
528 } 528 }
529 } 529 }
530 if ($isDescendant) { 530 if ($isDescendant) {
531 $this->debug('Element is child of another body element, skipping.'); 531 $this->debug('Element is child of another body element, skipping.');
532 } else { 532 } else {
533 // prune (clean up elements that may not be content) 533 // prune (clean up elements that may not be content)
534 if ($this->config->prune()) { 534 if ($this->config->prune()) {
535 $this->debug('Pruning content'); 535 $this->debug('Pruning content');
536 $this->readability->prepArticle($elem); 536 $this->readability->prepArticle($elem);
537 } 537 }
538 $this->debug('Element added to body'); 538 $this->debug('Element added to body');
539 $this->body->appendChild($elem); 539 $this->body->appendChild($elem);
540 } 540 }
541 } 541 }
542 $detect_body = false; 542 $detect_body = false;
543 } 543 }
544 } 544 }
545 } 545 }
546 } 546 }
547 } 547 }
548 548
549 // check for elements marked with instapaper_title 549 // check for elements marked with instapaper_title
550 if ($detect_title) { 550 if ($detect_title) {
551 // check for instapaper_title 551 // check for instapaper_title
552 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); 552 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
553 if ($elems && $elems->length > 0) { 553 if ($elems && $elems->length > 0) {
554 $this->title = $elems->item(0)->textContent; 554 $this->title = $elems->item(0)->textContent;
555 $this->debug('Title found (.instapaper_title): '.$this->title); 555 $this->debug('Title found (.instapaper_title): '.$this->title);
556 // remove title from document 556 // remove title from document
557 $elems->item(0)->parentNode->removeChild($elems->item(0)); 557 $elems->item(0)->parentNode->removeChild($elems->item(0));
558 $detect_title = false; 558 $detect_title = false;
559 } 559 }
560 } 560 }
561 // check for elements marked with instapaper_body 561 // check for elements marked with instapaper_body
562 if ($detect_body) { 562 if ($detect_body) {
563 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); 563 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
564 if ($elems && $elems->length > 0) { 564 if ($elems && $elems->length > 0) {
565 $this->debug('body found (.instapaper_body)'); 565 $this->debug('body found (.instapaper_body)');
566 $this->body = $elems->item(0); 566 $this->body = $elems->item(0);
567 // prune (clean up elements that may not be content) 567 // prune (clean up elements that may not be content)
568 if ($this->config->prune()) { 568 if ($this->config->prune()) {
569 $this->debug('Pruning content'); 569 $this->debug('Pruning content');
570 $this->readability->prepArticle($this->body); 570 $this->readability->prepArticle($this->body);
571 } 571 }
572 $detect_body = false; 572 $detect_body = false;
573 } 573 }
574 } 574 }
575 575
576 // Find author in rel="author" marked element 576 // Find author in rel="author" marked element
577 // We only use this if there's exactly one. 577 // We only use this if there's exactly one.
578 // If there's more than one, it could indicate more than 578 // If there's more than one, it could indicate more than
579 // one author, but it could also indicate that we're processing 579 // one author, but it could also indicate that we're processing
580 // a page listing different articles with different authors. 580 // a page listing different articles with different authors.
581 if ($detect_author) { 581 if ($detect_author) {
582 $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); 582 $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
583 if ($elems && $elems->length == 1) { 583 if ($elems && $elems->length == 1) {
584 $author = trim($elems->item(0)->textContent); 584 $author = trim($elems->item(0)->textContent);
585 if ($author != '') { 585 if ($author != '') {
586 $this->debug("Author found (rel=\"author\"): $author"); 586 $this->debug("Author found (rel=\"author\"): $author");
587 $this->author[] = $author; 587 $this->author[] = $author;
588 $detect_author = false; 588 $detect_author = false;
589 } 589 }
590 } 590 }
591 } 591 }
592 592
593 // Find date in pubdate marked time element 593 // Find date in pubdate marked time element
594 // For the same reason given above, we only use this 594 // For the same reason given above, we only use this
595 // if there's exactly one element. 595 // if there's exactly one element.
596 if ($detect_date) { 596 if ($detect_date) {
597 $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); 597 $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
598 if ($elems && $elems->length == 1) { 598 if ($elems && $elems->length == 1) {
599 $this->date = strtotime(trim($elems->item(0)->textContent)); 599 $this->date = strtotime(trim($elems->item(0)->textContent));
600 // remove date from document 600 // remove date from document
601 //$elems->item(0)->parentNode->removeChild($elems->item(0)); 601 //$elems->item(0)->parentNode->removeChild($elems->item(0));
602 if ($this->date) { 602 if ($this->date) {
603 $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); 603 $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
604 $detect_date = false; 604 $detect_date = false;
605 } else { 605 } else {
606 $this->date = null; 606 $this->date = null;
607 } 607 }
608 } 608 }
609 } 609 }
610 610
611 // still missing title or body, so we detect using Readability 611 // still missing title or body, so we detect using Readability
612 if ($detect_title || $detect_body) { 612 if ($detect_title || $detect_body) {
613 $this->debug('Using Readability'); 613 $this->debug('Using Readability');
614 // clone body if we're only using Readability for title (otherwise it may interfere with body element) 614 // clone body if we're only using Readability for title (otherwise it may interfere with body element)
615 if (isset($this->body)) $this->body = $this->body->cloneNode(true); 615 if (isset($this->body)) $this->body = $this->body->cloneNode(true);
616 $success = $this->readability->init(); 616 $success = $this->readability->init();
617 } 617 }
618 if ($detect_title) { 618 if ($detect_title) {
619 $this->debug('Detecting title'); 619 $this->debug('Detecting title');
620 $this->title = $this->readability->getTitle()->textContent; 620 $this->title = $this->readability->getTitle()->textContent;
621 } 621 }
622 if ($detect_body && $success) { 622 if ($detect_body && $success) {
623 $this->debug('Detecting body'); 623 $this->debug('Detecting body');
624 $this->body = $this->readability->getContent(); 624 $this->body = $this->readability->getContent();
625 if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { 625 if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
626 $this->body = $this->body->firstChild; 626 $this->body = $this->body->firstChild;
627 } 627 }
628 // prune (clean up elements that may not be content) 628 // prune (clean up elements that may not be content)
629 if ($this->config->prune()) { 629 if ($this->config->prune()) {
630 $this->debug('Pruning content'); 630 $this->debug('Pruning content');
631 $this->readability->prepArticle($this->body); 631 $this->readability->prepArticle($this->body);
632 } 632 }
633 } 633 }
634 if (isset($this->body)) { 634 if (isset($this->body)) {
635 // remove scripts 635 // remove scripts
636 $this->readability->removeScripts($this->body); 636 $this->readability->removeScripts($this->body);
637 // remove any h1-h6 elements that appear as first thing in the body 637 // remove any h1-h6 elements that appear as first thing in the body
638 // and which match our title 638 // and which match our title
639 if (isset($this->title) && ($this->title != '')) { 639 if (isset($this->title) && ($this->title != '')) {
640 $firstChild = $this->body->firstChild; 640 $firstChild = $this->body->firstChild;
641 while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { 641 while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
642 $firstChild = $firstChild->nextSibling; 642 $firstChild = $firstChild->nextSibling;
643 } 643 }
644 if (($firstChild->nodeType === XML_ELEMENT_NODE) 644 if (($firstChild->nodeType === XML_ELEMENT_NODE)
645 && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) 645 && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
646 && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { 646 && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {
647 $this->body->removeChild($firstChild); 647 $this->body->removeChild($firstChild);
648 } 648 }
649 } 649 }
650 // prevent self-closing iframes 650 // prevent self-closing iframes
651 $elems = $this->body->getElementsByTagName('iframe'); 651 $elems = $this->body->getElementsByTagName('iframe');
652 for ($i = $elems->length-1; $i >= 0; $i--) { 652 for ($i = $elems->length-1; $i >= 0; $i--) {
653 $e = $elems->item($i); 653 $e = $elems->item($i);
654 if (!$e->hasChildNodes()) { 654 if (!$e->hasChildNodes()) {
655 $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); 655 $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
656 } 656 }
657 } 657 }
658 // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ 658 // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
659 // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src 659 // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
660 // inside the data-lazy-src attribute. It also places the original image inside a noscript element 660 // inside the data-lazy-src attribute. It also places the original image inside a noscript element
661 // next to the amended one. 661 // next to the amended one.
662 $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); 662 $elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
663 for ($i = $elems->length-1; $i >= 0; $i--) { 663 for ($i = $elems->length-1; $i >= 0; $i--) {
664 $e = $elems->item($i); 664 $e = $elems->item($i);
665 // let's see if we can grab image from noscript 665 // let's see if we can grab image from noscript
666 if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { 666 if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
667 $_new_elem = $e->ownerDocument->createDocumentFragment(); 667 $_new_elem = $e->ownerDocument->createDocumentFragment();
668 @$_new_elem->appendXML($e->nextSibling->innerHTML); 668 @$_new_elem->appendXML($e->nextSibling->innerHTML);
669 $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); 669 $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
670 $e->parentNode->removeChild($e); 670 $e->parentNode->removeChild($e);
671 } else { 671 } else {
672 // Use data-lazy-src as src value 672 // Use data-lazy-src as src value
673 $e->setAttribute('src', $e->getAttribute('data-lazy-src')); 673 $e->setAttribute('src', $e->getAttribute('data-lazy-src'));
674 $e->removeAttribute('data-lazy-src'); 674 $e->removeAttribute('data-lazy-src');
675 } 675 }
676 } 676 }
677 677
678 $this->success = true; 678 $this->success = true;
679 } 679 }
680 680
681 // if we've had no success and we've used tidy, there's a chance 681 // if we've had no success and we've used tidy, there's a chance
682 // that tidy has messed up. So let's try again without tidy... 682 // that tidy has messed up. So let's try again without tidy...
683 if (!$this->success && $tidied && $smart_tidy) { 683 if (!$this->success && $tidied && $smart_tidy) {
684 $this->debug('Trying again without tidy'); 684 $this->debug('Trying again without tidy');
685 $this->process($original_html, $url, false); 685 $this->process($original_html, $url, false);
686 } 686 }
687 687
688 return $this->success; 688 return $this->success;
689 } 689 }
690 690
691 private function isDescendant(DOMElement $parent, DOMElement $child) { 691 private function isDescendant(DOMElement $parent, DOMElement $child) {
692 $node = $child->parentNode; 692 $node = $child->parentNode;
693 while ($node != null) { 693 while ($node != null) {
694 if ($node->isSameNode($parent)) return true; 694 if ($node->isSameNode($parent)) return true;
695 $node = $node->parentNode; 695 $node = $node->parentNode;
696 } 696 }
697 return false; 697 return false;
698 } 698 }
699 699
700 public function getContent() { 700 public function getContent() {
701 return $this->body; 701 return $this->body;
702 } 702 }
703 703
704 public function getTitle() { 704 public function getTitle() {
705 return $this->title; 705 return $this->title;
706 } 706 }
707 707
708 public function getAuthors() { 708 public function getAuthors() {
709 return $this->author; 709 return $this->author;
710 } 710 }
711 711
712 public function getLanguage() { 712 public function getLanguage() {
713 return $this->language; 713 return $this->language;
714 } 714 }
715 715
716 public function getDate() { 716 public function getDate() {
717 return $this->date; 717 return $this->date;
718 } 718 }
719 719
720 public function getSiteConfig() { 720 public function getSiteConfig() {
721 return $this->config; 721 return $this->config;
722 } 722 }
723 723
724 public function getNextPageUrl() { 724 public function getNextPageUrl() {
725 return $this->nextPageUrl; 725 return $this->nextPageUrl;
726 } 726 }
727} 727} \ No newline at end of file
728?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
index c5e300d7..1f6a7603 100644
--- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php
+++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
@@ -1,338 +1,343 @@
1<?php 1<?php
2/** 2/**
3 * Site Config 3 * Site Config
4 * 4 *
5 * Each instance of this class should hold extraction patterns and other directives 5 * Each instance of this class should hold extraction patterns and other directives
6 * for a website. See ContentExtractor class to see how it's used. 6 * for a website. See ContentExtractor class to see how it's used.
7 * 7 *
8 * @version 0.7 8 * @version 0.8
9 * @date 2012-08-27 9 * @date 2013-04-16
10 * @author Keyvan Minoukadeh 10 * @author Keyvan Minoukadeh
11 * @copyright 2012 Keyvan Minoukadeh 11 * @copyright 2013 Keyvan Minoukadeh
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
13 */ 13 */
14 14
15class SiteConfig 15class SiteConfig
16{ 16{
17 // Use first matching element as title (0 or more xpath expressions) 17 // Use first matching element as title (0 or more xpath expressions)
18 public $title = array(); 18 public $title = array();
19 19
20 // Use first matching element as body (0 or more xpath expressions) 20 // Use first matching element as body (0 or more xpath expressions)
21 public $body = array(); 21 public $body = array();
22 22
23 // Use first matching element as author (0 or more xpath expressions) 23 // Use first matching element as author (0 or more xpath expressions)
24 public $author = array(); 24 public $author = array();
25 25
26 // Use first matching element as date (0 or more xpath expressions) 26 // Use first matching element as date (0 or more xpath expressions)
27 public $date = array(); 27 public $date = array();
28 28
29 // Strip elements matching these xpath expressions (0 or more) 29 // Strip elements matching these xpath expressions (0 or more)
30 public $strip = array(); 30 public $strip = array();
31 31
32 // Strip elements which contain these strings (0 or more) in the id or class attribute 32 // Strip elements which contain these strings (0 or more) in the id or class attribute
33 public $strip_id_or_class = array(); 33 public $strip_id_or_class = array();
34 34
35 // Strip images which contain these strings (0 or more) in the src attribute 35 // Strip images which contain these strings (0 or more) in the src attribute
36 public $strip_image_src = array(); 36 public $strip_image_src = array();
37 37
38 // Additional HTTP headers to send 38 // Additional HTTP headers to send
39 // NOT YET USED 39 // NOT YET USED
40 public $http_header = array(); 40 public $http_header = array();
41 41
42 // Process HTML with tidy before creating DOM (bool or null if undeclared) 42 // Process HTML with tidy before creating DOM (bool or null if undeclared)
43 public $tidy = null; 43 public $tidy = null;
44 44
45 protected $default_tidy = true; // used if undeclared 45 protected $default_tidy = true; // used if undeclared
46 46
47 // Autodetect title/body if xpath expressions fail to produce results. 47 // Autodetect title/body if xpath expressions fail to produce results.
48 // Note that this applies to title and body separately, ie. 48 // Note that this applies to title and body separately, ie.
49 // * if we get a body match but no title match, this option will determine whether we autodetect title 49 // * if we get a body match but no title match, this option will determine whether we autodetect title
50 // * if neither match, this determines whether we autodetect title and body. 50 // * if neither match, this determines whether we autodetect title and body.
51 // Also note that this only applies when there is at least one xpath expression in title or body, ie. 51 // Also note that this only applies when there is at least one xpath expression in title or body, ie.
52 // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) 52 // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
53 // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. 53 // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
54 // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). 54 // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
55 // bool or null if undeclared 55 // bool or null if undeclared
56 public $autodetect_on_failure = null; 56 public $autodetect_on_failure = null;
57 protected $default_autodetect_on_failure = true; // used if undeclared 57 protected $default_autodetect_on_failure = true; // used if undeclared
58 58
59 // Clean up content block - attempt to remove elements that appear to be superfluous 59 // Clean up content block - attempt to remove elements that appear to be superfluous
60 // bool or null if undeclared 60 // bool or null if undeclared
61 public $prune = null; 61 public $prune = null;
62 protected $default_prune = true; // used if undeclared 62 protected $default_prune = true; // used if undeclared
63 63
64 // Test URL - if present, can be used to test the config above 64 // Test URL - if present, can be used to test the config above
65 public $test_url = array(); 65 public $test_url = array();
66 66
67 // Single-page link - should identify a link element or URL pointing to the page holding the entire article 67 // Single-page link - should identify a link element or URL pointing to the page holding the entire article
68 // This is useful for sites which split their articles across multiple pages. Links to such pages tend to 68 // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
69 // display the first page with links to the other pages at the bottom. Often there is also a link to a page 69 // display the first page with links to the other pages at the bottom. Often there is also a link to a page
70 // which displays the entire article on one page (e.g. 'print view'). 70 // which displays the entire article on one page (e.g. 'print view').
71 // This should be an XPath expression identifying the link to that page. If present and we find a match, 71 // This should be an XPath expression identifying the link to that page. If present and we find a match,
72 // we will retrieve that page and the rest of the options in this config will be applied to the new page. 72 // we will retrieve that page and the rest of the options in this config will be applied to the new page.
73 public $single_page_link = array(); 73 public $single_page_link = array();
74 74
75 public $next_page_link = array(); 75 public $next_page_link = array();
76 76
77 // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed 77 // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
78 public $single_page_link_in_feed = array(); 78 public $single_page_link_in_feed = array();
79 79
80 // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') 80 // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
81 // string or null if undeclared 81 // string or null if undeclared
82 public $parser = null; 82 public $parser = null;
83 protected $default_parser = 'libxml'; // used if undeclared 83 protected $default_parser = 'libxml'; // used if undeclared
84 84
85 // Strings to search for in HTML before processing begins (used with $replace_string) 85 // Strings to search for in HTML before processing begins (used with $replace_string)
86 public $find_string = array(); 86 public $find_string = array();
87 // Strings to replace those found in $find_string before HTML processing begins 87 // Strings to replace those found in $find_string before HTML processing begins
88 public $replace_string = array(); 88 public $replace_string = array();
89 89
90 // the options below cannot be set in the config files which this class represents 90 // the options below cannot be set in the config files which this class represents
91 91
92 //public $cache_in_apc = false; // used to decide if we should cache in apc or not 92 //public $cache_in_apc = false; // used to decide if we should cache in apc or not
93 public $cache_key = null; 93 public $cache_key = null;
94 public static $debug = false; 94 public static $debug = false;
95 protected static $apc = false; 95 protected static $apc = false;
96 protected static $config_path; 96 protected static $config_path;
97 protected static $config_path_fallback; 97 protected static $config_path_fallback;
98 protected static $config_cache = array(); 98 protected static $config_cache = array();
99 const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; 99 const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
100 100
101 protected static function debug($msg) { 101 protected static function debug($msg) {
102 if (self::$debug) { 102 if (self::$debug) {
103 //$mem = round(memory_get_usage()/1024, 2); 103 //$mem = round(memory_get_usage()/1024, 2);
104 //$memPeak = round(memory_get_peak_usage()/1024, 2); 104 //$memPeak = round(memory_get_peak_usage()/1024, 2);
105 echo '* ',$msg; 105 echo '* ',$msg;
106 //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; 106 //echo ' - mem used: ',$mem," (peak: $memPeak)\n";
107 echo "\n"; 107 echo "\n";
108 ob_flush(); 108 ob_flush();
109 flush(); 109 flush();
110 } 110 }
111 } 111 }
112 112
113 // enable APC caching of certain site config files? 113 // enable APC caching of certain site config files?
114 // If enabled the following site config files will be 114 // If enabled the following site config files will be
115 // cached in APC cache (when requested for first time): 115 // cached in APC cache (when requested for first time):
116 // * anything in site_config/custom/ and its corresponding file in site_config/standard/ 116 // * anything in site_config/custom/ and its corresponding file in site_config/standard/
117 // * the site config files associated with HTML fingerprints 117 // * the site config files associated with HTML fingerprints
118 // * the global site config file 118 // * the global site config file
119 // returns true if enabled, false otherwise 119 // returns true if enabled, false otherwise
120 public static function use_apc($apc=true) { 120 public static function use_apc($apc=true) {
121 if (!function_exists('apc_add')) { 121 if (!function_exists('apc_add')) {
122 if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); 122 if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
123 return false; 123 return false;
124 } 124 }
125 self::$apc = $apc; 125 self::$apc = $apc;
126 return $apc; 126 return $apc;
127 } 127 }
128 128
129 // return bool or null 129 // return bool or null
130 public function tidy($use_default=true) { 130 public function tidy($use_default=true) {
131 if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; 131 if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
132 return $this->tidy; 132 return $this->tidy;
133 } 133 }
134 134
135 // return bool or null 135 // return bool or null
136 public function prune($use_default=true) { 136 public function prune($use_default=true) {
137 if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; 137 if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;
138 return $this->prune; 138 return $this->prune;
139 } 139 }
140 140
141 // return string or null 141 // return string or null
142 public function parser($use_default=true) { 142 public function parser($use_default=true) {
143 if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; 143 if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;
144 return $this->parser; 144 return $this->parser;
145 } 145 }
146 146
147 // return bool or null 147 // return bool or null
148 public function autodetect_on_failure($use_default=true) { 148 public function autodetect_on_failure($use_default=true) {
149 if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; 149 if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;
150 return $this->autodetect_on_failure; 150 return $this->autodetect_on_failure;
151 } 151 }
152 152
153 public static function set_config_path($path, $fallback=null) { 153 public static function set_config_path($path, $fallback=null) {
154 self::$config_path = $path; 154 self::$config_path = $path;
155 self::$config_path_fallback = $fallback; 155 self::$config_path_fallback = $fallback;
156 } 156 }
157 157
158 public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { 158 public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
159 $key = strtolower($key); 159 $key = strtolower($key);
160 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); 160 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
161 if ($config->cache_key) $key = $config->cache_key; 161 if ($config->cache_key) $key = $config->cache_key;
162 self::$config_cache[$key] = $config; 162 self::$config_cache[$key] = $config;
163 if (self::$apc && $use_apc) { 163 if (self::$apc && $use_apc) {
164 self::debug("Adding site config to APC cache with key sc.$key"); 164 self::debug("Adding site config to APC cache with key sc.$key");
165 apc_add("sc.$key", $config); 165 apc_add("sc.$key", $config);
166 } 166 }
167 self::debug("Cached site config with key $key"); 167 self::debug("Cached site config with key $key");
168 } 168 }
169 169
170 public static function is_cached($key) { 170 public static function is_cached($key) {
171 $key = strtolower($key); 171 $key = strtolower($key);
172 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); 172 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
173 if (array_key_exists($key, self::$config_cache)) { 173 if (array_key_exists($key, self::$config_cache)) {
174 return true; 174 return true;
175 } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { 175 } elseif (self::$apc && (bool)apc_fetch("sc.$key")) {
176 return true; 176 return true;
177 } 177 }
178 return false; 178 return false;
179 } 179 }
180 180
181 public function append(SiteConfig $newconfig) { 181 public function append(SiteConfig $newconfig) {
182 // check for commands where we accept multiple statements (no test_url) 182 // check for commands where we accept multiple statements (no test_url)
183 foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { 183 foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
184 // append array elements for this config variable from $newconfig to this config 184 // append array elements for this config variable from $newconfig to this config
185 //$this->$var = $this->$var + $newconfig->$var; 185 //$this->$var = $this->$var + $newconfig->$var;
186 $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); 186 $this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
187 } 187 }
188 // check for single statement commands 188 // check for single statement commands
189 // we do not overwrite existing non null values 189 // we do not overwrite existing non null values
190 foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { 190 foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
191 if ($this->$var === null) $this->$var = $newconfig->$var; 191 if ($this->$var === null) $this->$var = $newconfig->$var;
192 } 192 }
193 } 193 // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
194 194 foreach (array('find_string', 'replace_string') as $var) {
195 // returns SiteConfig instance if an appropriate one is found, false otherwise 195 // append array elements for this config variable from $newconfig to this config
196 // if $exact_host_match is true, we will not look for wildcard config matches 196 //$this->$var = $this->$var + $newconfig->$var;
197 // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists 197 $this->$var = array_merge($this->$var, $newconfig->$var);
198 public static function build($host, $exact_host_match=false) { 198 }
199 $host = strtolower($host); 199 }
200 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); 200
201 if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; 201 // returns SiteConfig instance if an appropriate one is found, false otherwise
202 // check for site configuration 202 // if $exact_host_match is true, we will not look for wildcard config matches
203 $try = array($host); 203 // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
204 // should we look for wildcard matches 204 public static function build($host, $exact_host_match=false) {
205 if (!$exact_host_match) { 205 $host = strtolower($host);
206 $split = explode('.', $host); 206 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
207 if (count($split) > 1) { 207 if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
208 array_shift($split); 208 // check for site configuration
209 $try[] = '.'.implode('.', $split); 209 $try = array($host);
210 } 210 // should we look for wildcard matches
211 } 211 if (!$exact_host_match) {
212 212 $split = explode('.', $host);
213 // look for site config file in primary folder 213 if (count($split) > 1) {
214 self::debug(". looking for site config for $host in primary folder"); 214 array_shift($split);
215 foreach ($try as $h) { 215 $try[] = '.'.implode('.', $split);
216 if (array_key_exists($h, self::$config_cache)) { 216 }
217 self::debug("... site config for $h already loaded in this request"); 217 }
218 return self::$config_cache[$h]; 218
219 } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { 219 // look for site config file in primary folder
220 self::debug("... site config for $h in APC cache"); 220 self::debug(". looking for site config for $host in primary folder");
221 return $sconfig; 221 foreach ($try as $h) {
222 } elseif (file_exists(self::$config_path."/$h.txt")) { 222 if (array_key_exists($h, self::$config_cache)) {
223 self::debug("... found site config ($h.txt)"); 223 self::debug("... site config for $h already loaded in this request");
224 $file_primary = self::$config_path."/$h.txt"; 224 return self::$config_cache[$h];
225 $matched_name = $h; 225 } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
226 break; 226 self::debug("... site config for $h in APC cache");
227 } 227 return $sconfig;
228 } 228 } elseif (file_exists(self::$config_path."/$h.txt")) {
229 229 self::debug("... found site config ($h.txt)");
230 // if we found site config, process it 230 $file_primary = self::$config_path."/$h.txt";
231 if (isset($file_primary)) { 231 $matched_name = $h;
232 $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 232 break;
233 if (!$config_lines || !is_array($config_lines)) return false; 233 }
234 $config = self::build_from_array($config_lines); 234 }
235 // if APC caching is available and enabled, mark this for cache 235
236 //$config->cache_in_apc = true; 236 // if we found site config, process it
237 $config->cache_key = $matched_name; 237 if (isset($file_primary)) {
238 238 $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
239 // if autodetec on failure is off (on by default) we do not need to look 239 if (!$config_lines || !is_array($config_lines)) return false;
240 // in secondary folder 240 $config = self::build_from_array($config_lines);
241 if (!$config->autodetect_on_failure()) { 241 // if APC caching is available and enabled, mark this for cache
242 self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); 242 //$config->cache_in_apc = true;
243 return $config; 243 $config->cache_key = $matched_name;
244 } 244
245 } 245 // if autodetec on failure is off (on by default) we do not need to look
246 246 // in secondary folder
247 // look for site config file in secondary folder 247 if (!$config->autodetect_on_failure()) {
248 if (isset(self::$config_path_fallback)) { 248 self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
249 self::debug(". looking for site config for $host in secondary folder"); 249 return $config;
250 foreach ($try as $h) { 250 }
251 if (file_exists(self::$config_path_fallback."/$h.txt")) { 251 }
252 self::debug("... found site config in secondary folder ($h.txt)"); 252
253 $file_secondary = self::$config_path_fallback."/$h.txt"; 253 // look for site config file in secondary folder
254 $matched_name = $h; 254 if (isset(self::$config_path_fallback)) {
255 break; 255 self::debug(". looking for site config for $host in secondary folder");
256 } 256 foreach ($try as $h) {
257 } 257 if (file_exists(self::$config_path_fallback."/$h.txt")) {
258 if (!isset($file_secondary)) { 258 self::debug("... found site config in secondary folder ($h.txt)");
259 self::debug("... no site config match in secondary folder"); 259 $file_secondary = self::$config_path_fallback."/$h.txt";
260 } 260 $matched_name = $h;
261 } 261 break;
262 262 }
263 // return false if no config file found 263 }
264 if (!isset($file_primary) && !isset($file_secondary)) { 264 if (!isset($file_secondary)) {
265 self::debug("... no site config match for $host"); 265 self::debug("... no site config match in secondary folder");
266 return false; 266 }
267 } 267 }
268 268
269 // return primary config if secondary not found 269 // return false if no config file found
270 if (!isset($file_secondary) && isset($config)) { 270 if (!isset($file_primary) && !isset($file_secondary)) {
271 return $config; 271 self::debug("... no site config match for $host");
272 } 272 return false;
273 273 }
274 // process secondary config file 274
275 $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 275 // return primary config if secondary not found
276 if (!$config_lines || !is_array($config_lines)) { 276 if (!isset($file_secondary) && isset($config)) {
277 // failed to process secondary 277 return $config;
278 if (isset($config)) { 278 }
279 // return primary config 279
280 return $config; 280 // process secondary config file
281 } else { 281 $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
282 return false; 282 if (!$config_lines || !is_array($config_lines)) {
283 } 283 // failed to process secondary
284 } 284 if (isset($config)) {
285 285 // return primary config
286 // merge with primary and return 286 return $config;
287 if (isset($config)) { 287 } else {
288 self::debug('. merging config files'); 288 return false;
289 $config->append(self::build_from_array($config_lines)); 289 }
290 return $config; 290 }
291 } else { 291
292 // return just secondary 292 // merge with primary and return
293 $config = self::build_from_array($config_lines); 293 if (isset($config)) {
294 // if APC caching is available and enabled, mark this for cache 294 self::debug('. merging config files');
295 //$config->cache_in_apc = true; 295 $config->append(self::build_from_array($config_lines));
296 $config->cache_key = $matched_name; 296 return $config;
297 return $config; 297 } else {
298 } 298 // return just secondary
299 } 299 $config = self::build_from_array($config_lines);
300 300 // if APC caching is available and enabled, mark this for cache
301 public static function build_from_array(array $lines) { 301 //$config->cache_in_apc = true;
302 $config = new SiteConfig(); 302 $config->cache_key = $matched_name;
303 foreach ($lines as $line) { 303 return $config;
304 $line = trim($line); 304 }
305 305 }
306 // skip comments, empty lines 306
307 if ($line == '' || $line[0] == '#') continue; 307 public static function build_from_array(array $lines) {
308 308 $config = new SiteConfig();
309 // get command 309 foreach ($lines as $line) {
310 $command = explode(':', $line, 2); 310 $line = trim($line);
311 // if there's no colon ':', skip this line 311
312 if (count($command) != 2) continue; 312 // skip comments, empty lines
313 $val = trim($command[1]); 313 if ($line == '' || $line[0] == '#') continue;
314 $command = trim($command[0]); 314
315 if ($command == '' || $val == '') continue; 315 // get command
316 316 $command = explode(':', $line, 2);
317 // check for commands where we accept multiple statements 317 // if there's no colon ':', skip this line
318 if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { 318 if (count($command) != 2) continue;
319 array_push($config->$command, $val); 319 $val = trim($command[1]);
320 // check for single statement commands that evaluate to true or false 320 $command = trim($command[0]);
321 } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { 321 if ($command == '' || $val == '') continue;
322 $config->$command = ($val == 'yes'); 322
323 // check for single statement commands stored as strings 323 // check for commands where we accept multiple statements
324 } elseif (in_array($command, array('parser'))) { 324 if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
325 $config->$command = $val; 325 array_push($config->$command, $val);
326 // check for replace_string(find): replace 326 // check for single statement commands that evaluate to true or false
327 } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { 327 } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
328 if (in_array($match[1], array('replace_string'))) { 328 $config->$command = ($val == 'yes');
329 $command = $match[1]; 329 // check for single statement commands stored as strings
330 array_push($config->find_string, $match[2]); 330 } elseif (in_array($command, array('parser'))) {
331 array_push($config->$command, $val); 331 $config->$command = $val;
332 } 332 // check for replace_string(find): replace
333 } 333 } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
334 } 334 if (in_array($match[1], array('replace_string'))) {
335 return $config; 335 $command = $match[1];
336 } 336 array_push($config->find_string, $match[2]);
337} 337 array_push($config->$command, $val);
338?> \ No newline at end of file 338 }
339 }
340 }
341 return $config;
342 }
343} \ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php
index 54a56f22..40786598 100644..100755
--- a/inc/3rdparty/libraries/feedwriter/FeedItem.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php
@@ -1,7 +1,7 @@
1<?php 1<?php
2 /** 2 /**
3 * Univarsel Feed Writer 3 * Univarsel Feed Writer
4 * 4 *
5 * FeedItem class - Used as feed element in FeedWriter class 5 * FeedItem class - Used as feed element in FeedWriter class
6 * 6 *
7 * @package UnivarselFeedWriter 7 * @package UnivarselFeedWriter
@@ -12,20 +12,20 @@
12 { 12 {
13 private $elements = array(); //Collection of feed elements 13 private $elements = array(); //Collection of feed elements
14 private $version; 14 private $version;
15 15
16 /** 16 /**
17 * Constructor 17 * Constructor
18 * 18 *
19 * @param contant (RSS1/RSS2/ATOM) RSS2 is default. 19 * @param contant (RSS1/RSS2/ATOM) RSS2 is default.
20 */ 20 */
21 function __construct($version = RSS2) 21 function __construct($version = RSS2)
22 { 22 {
23 $this->version = $version; 23 $this->version = $version;
24 } 24 }
25 25
26 /** 26 /**
27 * Set element (overwrites existing elements with $elementName) 27 * Set element (overwrites existing elements with $elementName)
28 * 28 *
29 * @access public 29 * @access public
30 * @param srting The tag name of an element 30 * @param srting The tag name of an element
31 * @param srting The content of tag 31 * @param srting The content of tag
@@ -38,11 +38,11 @@
38 unset($this->elements[$elementName]); 38 unset($this->elements[$elementName]);
39 } 39 }
40 $this->addElement($elementName, $content, $attributes); 40 $this->addElement($elementName, $content, $attributes);
41 } 41 }
42 42
43 /** 43 /**
44 * Add an element to elements array 44 * Add an element to elements array
45 * 45 *
46 * @access public 46 * @access public
47 * @param srting The tag name of an element 47 * @param srting The tag name of an element
48 * @param srting The content of tag 48 * @param srting The content of tag
@@ -61,11 +61,11 @@
61 $this->elements[$elementName][$i]['content'] = $content; 61 $this->elements[$elementName][$i]['content'] = $content;
62 $this->elements[$elementName][$i]['attributes'] = $attributes; 62 $this->elements[$elementName][$i]['attributes'] = $attributes;
63 } 63 }
64 64
65 /** 65 /**
66 * Set multiple feed elements from an array. 66 * Set multiple feed elements from an array.
67 * Elements which have attributes cannot be added by this method 67 * Elements which have attributes cannot be added by this method
68 * 68 *
69 * @access public 69 * @access public
70 * @param array array of elements in 'tagName' => 'tagContent' format. 70 * @param array array of elements in 'tagName' => 'tagContent' format.
71 * @return void 71 * @return void
@@ -73,15 +73,15 @@
73 public function addElementArray($elementArray) 73 public function addElementArray($elementArray)
74 { 74 {
75 if(! is_array($elementArray)) return; 75 if(! is_array($elementArray)) return;
76 foreach ($elementArray as $elementName => $content) 76 foreach ($elementArray as $elementName => $content)
77 { 77 {
78 $this->addElement($elementName, $content); 78 $this->addElement($elementName, $content);
79 } 79 }
80 } 80 }
81 81
82 /** 82 /**
83 * Return the collection of elements in this feed item 83 * Return the collection of elements in this feed item
84 * 84 *
85 * @access public 85 * @access public
86 * @return array 86 * @return array
87 */ 87 */
@@ -89,68 +89,74 @@
89 { 89 {
90 return $this->elements; 90 return $this->elements;
91 } 91 }
92 92
93 // Wrapper functions ------------------------------------------------------ 93 // Wrapper functions ------------------------------------------------------
94 94
95 /** 95 /**
96 * Set the 'dscription' element of feed item 96 * Set the 'dscription' element of feed item
97 * 97 *
98 * @access public 98 * @access public
99 * @param string The content of 'description' element 99 * @param string The content of 'description' element
100 * @return void 100 * @return void
101 */ 101 */
102 public function setDescription($description) 102 public function setDescription($description)
103 { 103 {
104 $this->setElement('description', $description); 104 $tag = ($this->version == ATOM)? 'summary' : 'description';
105 $this->setElement($tag, $description);
105 } 106 }
106 107
107 /** 108 /**
108 * @desc Set the 'title' element of feed item 109 * @desc Set the 'title' element of feed item
109 * @access public 110 * @access public
110 * @param string The content of 'title' element 111 * @param string The content of 'title' element
111 * @return void 112 * @return void
112 */ 113 */
113 public function setTitle($title) 114 public function setTitle($title)
114 { 115 {
115 $this->setElement('title', $title); 116 $this->setElement('title', $title);
116 } 117 }
117 118
118 /** 119 /**
119 * Set the 'date' element of feed item 120 * Set the 'date' element of feed item
120 * 121 *
121 * @access public 122 * @access public
122 * @param string The content of 'date' element 123 * @param string The content of 'date' element
123 * @return void 124 * @return void
124 */ 125 */
125 public function setDate($date) 126 public function setDate($date)
126 { 127 {
127 if(! is_numeric($date)) 128 if(! is_numeric($date))
128 { 129 {
129 $date = strtotime($date); 130 $date = strtotime($date);
130 } 131 }
131 132
132 if($this->version == RSS2) 133 if($this->version == ATOM)
134 {
135 $tag = 'updated';
136 $value = date(DATE_ATOM, $date);
137 }
138 elseif($this->version == RSS2)
133 { 139 {
134 $tag = 'pubDate'; 140 $tag = 'pubDate';
135 $value = date(DATE_RSS, $date); 141 $value = date(DATE_RSS, $date);
136 } 142 }
137 else 143 else
138 { 144 {
139 $tag = 'dc:date'; 145 $tag = 'dc:date';
140 $value = date("Y-m-d", $date); 146 $value = date("Y-m-d", $date);
141 } 147 }
142 148
143 $this->setElement($tag, $value); 149 $this->setElement($tag, $value);
144 } 150 }
145 151
146 /** 152 /**
147 * Set the 'link' element of feed item 153 * Set the 'link' element of feed item
148 * 154 *
149 * @access public 155 * @access public
150 * @param string The content of 'link' element 156 * @param string The content of 'link' element
151 * @return void 157 * @return void
152 */ 158 */
153 public function setLink($link) 159 public function setLink($link)
154 { 160 {
155 if($this->version == RSS2 || $this->version == RSS1) 161 if($this->version == RSS2 || $this->version == RSS1)
156 { 162 {
@@ -161,27 +167,27 @@
161 { 167 {
162 $this->setElement('link','',array('href'=>$link)); 168 $this->setElement('link','',array('href'=>$link));
163 $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); 169 $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:'));
164 } 170 }
165 171
166 } 172 }
167 173
168 /** 174 /**
169 * Set the 'source' element of feed item 175 * Set the 'source' element of feed item
170 * 176 *
171 * @access public 177 * @access public
172 * @param string The content of 'source' element 178 * @param string The content of 'source' element
173 * @return void 179 * @return void
174 */ 180 */
175 public function setSource($link) 181 public function setSource($link)
176 { 182 {
177 $attributes = array('url'=>$link); 183 $attributes = array('url'=>$link);
178 $this->setElement('source', "wallabag",$attributes); 184 $this->setElement('source', "wallabag",$attributes);
179 } 185 }
180 186
181 /** 187 /**
182 * Set the 'encloser' element of feed item 188 * Set the 'encloser' element of feed item
183 * For RSS 2.0 only 189 * For RSS 2.0 only
184 * 190 *
185 * @access public 191 * @access public
186 * @param string The url attribute of encloser tag 192 * @param string The url attribute of encloser tag
187 * @param string The length attribute of encloser tag 193 * @param string The length attribute of encloser tag
@@ -193,6 +199,6 @@
193 $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); 199 $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type);
194 $this->setElement('enclosure','',$attributes); 200 $this->setElement('enclosure','',$attributes);
195 } 201 }
196 202
197 } // end of class FeedItem 203 } // end of class FeedItem
198?> \ No newline at end of file 204?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
index d708e99b..9446cddf 100755
--- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
@@ -2,6 +2,7 @@
2define('RSS2', 1, true); 2define('RSS2', 1, true);
3define('JSON', 2, true); 3define('JSON', 2, true);
4define('JSONP', 3, true); 4define('JSONP', 3, true);
5define('ATOM', 4, true);
5 6
6 /** 7 /**
7 * Univarsel Feed Writer class 8 * Univarsel Feed Writer class
@@ -101,11 +102,11 @@ define('JSONP', 3, true);
101 header('Content-type: application/javascript; charset=UTF-8'); 102 header('Content-type: application/javascript; charset=UTF-8');
102 } 103 }
103 } 104 }
104 105
105 if ($this->version == JSON || $this->version == JSONP) { 106 if ($this->version == JSON || $this->version == JSONP) {
106 $this->json = new stdClass(); 107 $this->json = new stdClass();
107 } 108 }
108 109
109 110
110 $this->printHead(); 111 $this->printHead();
111 $this->printChannels(); 112 $this->printChannels();
@@ -116,6 +117,11 @@ define('JSONP', 3, true);
116 } 117 }
117 } 118 }
118 119
120 public function &getItems()
121 {
122 return $this->items;
123 }
124
119 /** 125 /**
120 * Create a new FeedItem. 126 * Create a new FeedItem.
121 * 127 *
@@ -199,7 +205,8 @@ define('JSONP', 3, true);
199 */ 205 */
200 public function setDescription($description) 206 public function setDescription($description)
201 { 207 {
202 $this->setChannelElement('description', $description); 208 $tag = ($this->version == ATOM)? 'subtitle' : 'description';
209 $this->setChannelElement($tag, $description);
203 } 210 }
204 211
205 /** 212 /**
@@ -244,7 +251,7 @@ define('JSONP', 3, true);
244 { 251 {
245 $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; 252 $out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
246 if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; 253 if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
247 $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; 254 $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
248 echo $out; 255 echo $out;
249 } 256 }
250 elseif ($this->version == JSON || $this->version == JSONP) 257 elseif ($this->version == JSON || $this->version == JSONP)
diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php
index 2f5244f9..c4a48b21 100644
--- a/inc/3rdparty/libraries/html5/TreeBuilder.php
+++ b/inc/3rdparty/libraries/html5/TreeBuilder.php
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
134 134
135 // Namespaces for foreign content 135 // Namespaces for foreign content
136 const NS_HTML = null; // to prevent DOM from requiring NS on everything 136 const NS_HTML = null; // to prevent DOM from requiring NS on everything
137 const NS_XHTML = 'http://www.w3.org/1999/xhtml';
137 const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; 138 const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
138 const NS_SVG = 'http://www.w3.org/2000/svg'; 139 const NS_SVG = 'http://www.w3.org/2000/svg';
139 const NS_XLINK = 'http://www.w3.org/1999/xlink'; 140 const NS_XLINK = 'http://www.w3.org/1999/xlink';
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
3157 } 3158 }
3158 3159
3159 private function insertElement($token, $append = true) { 3160 private function insertElement($token, $append = true) {
3160 $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); 3161 //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
3162 $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
3163 $el = $this->dom->createElementNS($namespaceURI, $token['name']);
3161 3164
3162 if (!empty($token['attr'])) { 3165 if (!empty($token['attr'])) {
3163 foreach($token['attr'] as $attr) { 3166 foreach($token['attr'] as $attr) {
3164 if(!$el->hasAttribute($attr['name'])) { 3167
3168 // mike@macgirvin.com 2011-11-17, check attribute name for
3169 // validity (ignoring extenders and combiners) as illegal chars in names
3170 // causes everything to abort
3171
3172 $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']);
3173 if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
3165 $el->setAttribute($attr['name'], $attr['value']); 3174 $el->setAttribute($attr['name'], $attr['value']);
3166 } 3175 }
3167 } 3176 }
diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
index 83e94f14..e4d5f495 100644
--- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
+++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
@@ -1,404 +1,403 @@
1<?php 1<?php
2/** 2/**
3 * Cookie Jar 3 * Cookie Jar
4 * 4 *
5 * PHP class for handling cookies, as defined by the Netscape spec: 5 * PHP class for handling cookies, as defined by the Netscape spec:
6 * <http://curl.haxx.se/rfc/cookie_spec.html> 6 * <http://curl.haxx.se/rfc/cookie_spec.html>
7 * 7 *
8 * This class should be used to handle cookies (storing cookies from HTTP response messages, and 8 * This class should be used to handle cookies (storing cookies from HTTP response messages, and
9 * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org 9 * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org
10 * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ 10 * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
11 * 11 *
12 * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ 12 * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
13 * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. 13 * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
14 * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. 14 * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
15 * 15 *
16 * @version 0.5 16 * @version 0.5
17 * @date 2011-03-15 17 * @date 2011-03-15
18 * @see http://php.net/HttpRequestPool 18 * @see http://php.net/HttpRequestPool
19 * @author Keyvan Minoukadeh 19 * @author Keyvan Minoukadeh
20 * @copyright 2011 Keyvan Minoukadeh 20 * @copyright 2011 Keyvan Minoukadeh
21 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 21 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
22 */ 22 */
23 23
24class CookieJar 24class CookieJar
25{ 25{
26 /** 26 /**
27 * Cookies - array containing all cookies. 27 * Cookies - array containing all cookies.
28 * 28 *
29 * <pre> 29 * <pre>
30 * Cookies are stored like this: 30 * Cookies are stored like this:
31 * [domain][path][name] = array 31 * [domain][path][name] = array
32 * where array is: 32 * where array is:
33 * 0 => value, 1 => secure, 2 => expires 33 * 0 => value, 1 => secure, 2 => expires
34 * </pre> 34 * </pre>
35 * @var array 35 * @var array
36 * @access private 36 * @access private
37 */ 37 */
38 public $cookies = array(); 38 public $cookies = array();
39 public $debug = false; 39 public $debug = false;
40 40
41 /** 41 /**
42 * Constructor 42 * Constructor
43 */ 43 */
44 function __construct() { 44 function __construct() {
45 } 45 }
46 46
47 protected function debug($msg, $file=null, $line=null) { 47 protected function debug($msg, $file=null, $line=null) {
48 if ($this->debug) { 48 if ($this->debug) {
49 $mem = round(memory_get_usage()/1024, 2); 49 $mem = round(memory_get_usage()/1024, 2);
50 $memPeak = round(memory_get_peak_usage()/1024, 2); 50 $memPeak = round(memory_get_peak_usage()/1024, 2);
51 echo '* ',$msg; 51 echo '* ',$msg;
52 if (isset($file, $line)) echo " ($file line $line)"; 52 if (isset($file, $line)) echo " ($file line $line)";
53 echo ' - mem used: ',$mem," (peak: $memPeak)\n"; 53 echo ' - mem used: ',$mem," (peak: $memPeak)\n";
54 ob_flush(); 54 ob_flush();
55 flush(); 55 flush();
56 } 56 }
57 } 57 }
58 58
59 /** 59 /**
60 * Get matching cookies 60 * Get matching cookies
61 * 61 *
62 * Only use this method if you cannot use add_cookie_header(), for example, if you want to use 62 * Only use this method if you cannot use add_cookie_header(), for example, if you want to use
63 * this cookie jar class without using the request class. 63 * this cookie jar class without using the request class.
64 * 64 *
65 * @param array $param associative array containing 'domain', 'path', 'secure' keys 65 * @param array $param associative array containing 'domain', 'path', 'secure' keys
66 * @return string 66 * @return string
67 * @see add_cookie_header() 67 * @see add_cookie_header()
68 */ 68 */
69 public function getMatchingCookies($url) 69 public function getMatchingCookies($url)
70 { 70 {
71 if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { 71 if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) {
72 $param['domain'] = $parts['host']; 72 $param['domain'] = $parts['host'];
73 $param['path'] = $parts['path']; 73 $param['path'] = $parts['path'];
74 $param['secure'] = (strtolower($parts['scheme']) == 'https'); 74 $param['secure'] = (strtolower($parts['scheme']) == 'https');
75 unset($parts); 75 unset($parts);
76 } else { 76 } else {
77 return false; 77 return false;
78 } 78 }
79 // RFC 2965 notes: 79 // RFC 2965 notes:
80 // If multiple cookies satisfy the criteria above, they are ordered in 80 // If multiple cookies satisfy the criteria above, they are ordered in
81 // the Cookie header such that those with more specific Path attributes 81 // the Cookie header such that those with more specific Path attributes
82 // precede those with less specific. Ordering with respect to other 82 // precede those with less specific. Ordering with respect to other
83 // attributes (e.g., Domain) is unspecified. 83 // attributes (e.g., Domain) is unspecified.
84 $domain = $param['domain']; 84 $domain = $param['domain'];
85 if (strpos($domain, '.') === false) $domain .= '.local'; 85 if (strpos($domain, '.') === false) $domain .= '.local';
86 $request_path = $param['path']; 86 $request_path = $param['path'];
87 if ($request_path == '') $request_path = '/'; 87 if ($request_path == '') $request_path = '/';
88 $request_secure = $param['secure']; 88 $request_secure = $param['secure'];
89 $now = time(); 89 $now = time();
90 $matched_cookies = array(); 90 $matched_cookies = array();
91 // domain - find matching domains 91 // domain - find matching domains
92 $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); 92 $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
93 while (strpos($domain, '.') !== false) { 93 while (strpos($domain, '.') !== false) {
94 if (isset($this->cookies[$domain])) { 94 if (isset($this->cookies[$domain])) {
95 $this->debug(' domain match found: '.$domain); 95 $this->debug(' domain match found: '.$domain);
96 $cookies =& $this->cookies[$domain]; 96 $cookies =& $this->cookies[$domain];
97 } else { 97 } else {
98 $domain = $this->_reduce_domain($domain); 98 $domain = $this->_reduce_domain($domain);
99 continue; 99 continue;
100 } 100 }
101 // paths - find matching paths starting from most specific 101 // paths - find matching paths starting from most specific
102 $this->debug(' - Finding matching paths for '.$request_path); 102 $this->debug(' - Finding matching paths for '.$request_path);
103 $paths = array_keys($cookies); 103 $paths = array_keys($cookies);
104 usort($paths, array($this, '_cmp_length')); 104 usort($paths, array($this, '_cmp_length'));
105 foreach ($paths as $path) { 105 foreach ($paths as $path) {
106 // continue to next cookie if request path does not path-match cookie path 106 // continue to next cookie if request path does not path-match cookie path
107 if (!$this->_path_match($request_path, $path)) continue; 107 if (!$this->_path_match($request_path, $path)) continue;
108 // loop through cookie names 108 // loop through cookie names
109 $this->debug(' path match found: '.$path); 109 $this->debug(' path match found: '.$path);
110 foreach ($cookies[$path] as $name => $values) { 110 foreach ($cookies[$path] as $name => $values) {
111 // if this cookie is secure but request isn't, continue to next cookie 111 // if this cookie is secure but request isn't, continue to next cookie
112 if ($values[1] && !$request_secure) continue; 112 if ($values[1] && !$request_secure) continue;
113 // if cookie is not a session cookie and has expired, continue to next cookie 113 // if cookie is not a session cookie and has expired, continue to next cookie
114 if (is_int($values[2]) && ($values[2] < $now)) continue; 114 if (is_int($values[2]) && ($values[2] < $now)) continue;
115 // cookie matches request 115 // cookie matches request
116 $this->debug(' cookie match: '.$name.'='.$values[0]); 116 $this->debug(' cookie match: '.$name.'='.$values[0]);
117 $matched_cookies[] = $name.'='.$values[0]; 117 $matched_cookies[] = $name.'='.$values[0];
118 } 118 }
119 } 119 }
120 $domain = $this->_reduce_domain($domain); 120 $domain = $this->_reduce_domain($domain);
121 } 121 }
122 // return cookies 122 // return cookies
123 return implode('; ', $matched_cookies); 123 return implode('; ', $matched_cookies);
124 } 124 }
125 125
126 /** 126 /**
127 * Parse Set-Cookie values. 127 * Parse Set-Cookie values.
128 * 128 *
129 * Only use this method if you cannot use extract_cookies(), for example, if you want to use 129 * Only use this method if you cannot use extract_cookies(), for example, if you want to use
130 * this cookie jar class without using the response class. 130 * this cookie jar class without using the response class.
131 * 131 *
132 * @param array $set_cookies array holding 1 or more "Set-Cookie" header values 132 * @param array $set_cookies array holding 1 or more "Set-Cookie" header values
133 * @param array $param associative array containing 'host', 'path' keys 133 * @param array $param associative array containing 'host', 'path' keys
134 * @return void 134 * @return void
135 * @see extract_cookies() 135 * @see extract_cookies()
136 */ 136 */
137 public function storeCookies($url, $set_cookies) 137 public function storeCookies($url, $set_cookies)
138 { 138 {
139 if (count($set_cookies) == 0) return; 139 if (count($set_cookies) == 0) return;
140 $param = @parse_url($url); 140 $param = @parse_url($url);
141 if (!is_array($param) || !isset($param['host'])) return; 141 if (!is_array($param) || !isset($param['host'])) return;
142 $request_host = $param['host']; 142 $request_host = $param['host'];
143 if (strpos($request_host, '.') === false) $request_host .= '.local'; 143 if (strpos($request_host, '.') === false) $request_host .= '.local';
144 $request_path = @$param['path']; 144 $request_path = @$param['path'];
145 if ($request_path == '') $request_path = '/'; 145 if ($request_path == '') $request_path = '/';
146 // 146 //
147 // loop through set-cookie headers 147 // loop through set-cookie headers
148 // 148 //
149 foreach ($set_cookies as $set_cookie) { 149 foreach ($set_cookies as $set_cookie) {
150 $this->debug('Parsing: '.$set_cookie); 150 $this->debug('Parsing: '.$set_cookie);
151 // temporary cookie store (before adding to jar) 151 // temporary cookie store (before adding to jar)
152 $tmp_cookie = array(); 152 $tmp_cookie = array();
153 $param = explode(';', $set_cookie); 153 $param = explode(';', $set_cookie);
154 // loop through params 154 // loop through params
155 for ($x=0; $x<count($param); $x++) { 155 for ($x=0; $x<count($param); $x++) {
156 $key_val = explode('=', $param[$x], 2); 156 $key_val = explode('=', $param[$x], 2);
157 if (count($key_val) != 2) { 157 if (count($key_val) != 2) {
158 // if the first param isn't a name=value pair, continue to the next set-cookie 158 // if the first param isn't a name=value pair, continue to the next set-cookie
159 // header 159 // header
160 if ($x == 0) continue 2; 160 if ($x == 0) continue 2;
161 // check for secure flag 161 // check for secure flag
162 if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true; 162 if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
163 // continue to next param 163 // continue to next param
164 continue; 164 continue;
165 } 165 }
166 list($key, $val) = array_map('trim', $key_val); 166 list($key, $val) = array_map('trim', $key_val);
167 // first name=value pair is the cookie name and value 167 // first name=value pair is the cookie name and value
168 // the name and value are stored under 'name' and 'value' to avoid conflicts 168 // the name and value are stored under 'name' and 'value' to avoid conflicts
169 // with later parameters. 169 // with later parameters.
170 if ($x == 0) { 170 if ($x == 0) {
171 $tmp_cookie = array('name'=>$key, 'value'=>$val); 171 $tmp_cookie = array('name'=>$key, 'value'=>$val);
172 continue; 172 continue;
173 } 173 }
174 $key = strtolower($key); 174 $key = strtolower($key);
175 if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { 175 if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
176 $tmp_cookie[$key] = $val; 176 $tmp_cookie[$key] = $val;
177 } 177 }
178 } 178 }
179 // 179 //
180 // set cookie 180 // set cookie
181 // 181 //
182 // check domain 182 // check domain
183 if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && 183 if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
184 ($tmp_cookie['domain'] != ".$request_host")) { 184 ($tmp_cookie['domain'] != ".$request_host")) {
185 $domain = $tmp_cookie['domain']; 185 $domain = $tmp_cookie['domain'];
186 if ((strpos($domain, '.') === false) && ($domain != 'local')) { 186 if ((strpos($domain, '.') === false) && ($domain != 'local')) {
187 $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); 187 $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
188 continue; 188 continue;
189 } 189 }
190 if (preg_match('/\.[0-9]+$/', $domain)) { 190 if (preg_match('/\.[0-9]+$/', $domain)) {
191 $this->debug(' - domain "'.$domain.'" appears to be an ip address'); 191 $this->debug(' - domain "'.$domain.'" appears to be an ip address');
192 continue; 192 continue;
193 } 193 }
194 if (substr($domain, 0, 1) != '.') $domain = ".$domain"; 194 if (substr($domain, 0, 1) != '.') $domain = ".$domain";
195 if (!$this->_domain_match($request_host, $domain)) { 195 if (!$this->_domain_match($request_host, $domain)) {
196 $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); 196 $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
197 continue; 197 continue;
198 } 198 }
199 } else { 199 } else {
200 // if domain is not specified in the set-cookie header, domain will default to 200 // if domain is not specified in the set-cookie header, domain will default to
201 // the request host 201 // the request host
202 $domain = $request_host; 202 $domain = $request_host;
203 } 203 }
204 // check path 204 // check path
205 if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { 205 if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
206 $path = urldecode($tmp_cookie['path']); 206 $path = urldecode($tmp_cookie['path']);
207 if (!$this->_path_match($request_path, $path)) { 207 if (!$this->_path_match($request_path, $path)) {
208 $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); 208 $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
209 continue; 209 continue;
210 } 210 }
211 } else { 211 } else {
212 $path = $request_path; 212 $path = $request_path;
213 $path = substr($path, 0, strrpos($path, '/')); 213 $path = substr($path, 0, strrpos($path, '/'));
214 if ($path == '') $path = '/'; 214 if ($path == '') $path = '/';
215 } 215 }
216 // check if secure 216 // check if secure
217 $secure = (isset($tmp_cookie['secure'])) ? true : false; 217 $secure = (isset($tmp_cookie['secure'])) ? true : false;
218 // check expiry 218 // check expiry
219 if (isset($tmp_cookie['expires'])) { 219 if (isset($tmp_cookie['expires'])) {
220 if (($expires = strtotime($tmp_cookie['expires'])) < 0) { 220 if (($expires = strtotime($tmp_cookie['expires'])) < 0) {
221 $expires = null; 221 $expires = null;
222 } 222 }
223 } else { 223 } else {
224 $expires = null; 224 $expires = null;
225 } 225 }
226 // set cookie 226 // set cookie
227 $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); 227 $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
228 } 228 }
229 } 229 }
230 230
231 // return array of set-cookie values extracted from HTTP response headers (string $h) 231 // return array of set-cookie values extracted from HTTP response headers (string $h)
232 public function extractCookies($h) { 232 public function extractCookies($h) {
233 $x = 0; 233 $x = 0;
234 $lines = 0; 234 $lines = 0;
235 $headers = array(); 235 $headers = array();
236 $last_match = false; 236 $last_match = false;
237 $h = explode("\n", $h); 237 $h = explode("\n", $h);
238 foreach ($h as $line) { 238 foreach ($h as $line) {
239 $line = rtrim($line); 239 $line = rtrim($line);
240 $lines++; 240 $lines++;
241 241
242 $trimmed_line = trim($line); 242 $trimmed_line = trim($line);
243 if (isset($line_last)) { 243 if (isset($line_last)) {
244 // check if we have \r\n\r\n (indicating the end of headers) 244 // check if we have \r\n\r\n (indicating the end of headers)
245 // some servers will not use CRLF (\r\n), so we make CR (\r) optional. 245 // some servers will not use CRLF (\r\n), so we make CR (\r) optional.
246 // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { 246 // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) {
247 // break; 247 // break;
248 // } 248 // }
249 // As an alternative, we can check if the current trimmed line is empty 249 // As an alternative, we can check if the current trimmed line is empty
250 if ($trimmed_line == '') { 250 if ($trimmed_line == '') {
251 break; 251 break;
252 } 252 }
253 253
254 // check for continuation line... 254 // check for continuation line...
255 // RFC 2616 Section 2.2 "Basic Rules": 255 // RFC 2616 Section 2.2 "Basic Rules":
256 // HTTP/1.1 header field values can be folded onto multiple lines if the 256 // HTTP/1.1 header field values can be folded onto multiple lines if the
257 // continuation line begins with a space or horizontal tab. All linear 257 // continuation line begins with a space or horizontal tab. All linear
258 // white space, including folding, has the same semantics as SP. A 258 // white space, including folding, has the same semantics as SP. A
259 // recipient MAY replace any linear white space with a single SP before 259 // recipient MAY replace any linear white space with a single SP before
260 // interpreting the field value or forwarding the message downstream. 260 // interpreting the field value or forwarding the message downstream.
261 if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { 261 if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) {
262 // append to previous header value 262 // append to previous header value
263 $headers[$x-1] .= ' '.rtrim($match[1]); 263 $headers[$x-1] .= ' '.rtrim($match[1]);
264 continue; 264 continue;
265 } 265 }
266 } 266 }
267 $line_last = $line; 267 $line_last = $line;
268 268
269 // split header name and value 269 // split header name and value
270 if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { 270 if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {
271 $headers[$x++] = rtrim($match[1]); 271 $headers[$x++] = rtrim($match[1]);
272 $last_match = true; 272 $last_match = true;
273 } else { 273 } else {
274 $last_match = false; 274 $last_match = false;
275 } 275 }
276 } 276 }
277 return $headers; 277 return $headers;
278 } 278 }
279 279
280 /** 280 /**
281 * Set Cookie 281 * Set Cookie
282 * @param string $domain 282 * @param string $domain
283 * @param string $path 283 * @param string $path
284 * @param string $name cookie name 284 * @param string $name cookie name
285 * @param string $value cookie value 285 * @param string $value cookie value
286 * @param bool $secure 286 * @param bool $secure
287 * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) 287 * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
288 * @return void 288 * @return void
289 */ 289 */
290 function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) 290 function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
291 { 291 {
292 if ($domain == '') return; 292 if ($domain == '') return;
293 if ($path == '') return; 293 if ($path == '') return;
294 if ($name == '') return; 294 if ($name == '') return;
295 // check if cookie needs to go 295 // check if cookie needs to go
296 if (isset($expires) && ($expires <= 0)) { 296 if (isset($expires) && ($expires <= 0)) {
297 if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); 297 if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
298 return; 298 return;
299 } 299 }
300 if ($value == '') return; 300 if ($value == '') return;
301 $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); 301 $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
302 return; 302 return;
303 } 303 }
304 304
305 /** 305 /**
306 * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. 306 * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.
307 * @param string $domain 307 * @param string $domain
308 * @param string $path 308 * @param string $path
309 * @param string $name 309 * @param string $name
310 * @return void 310 * @return void
311 */ 311 */
312 function clear($domain=null, $path=null, $name=null) 312 function clear($domain=null, $path=null, $name=null)
313 { 313 {
314 if (!isset($domain)) { 314 if (!isset($domain)) {
315 $this->cookies = array(); 315 $this->cookies = array();
316 } elseif (!isset($path)) { 316 } elseif (!isset($path)) {
317 if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); 317 if (isset($this->cookies[$domain])) unset($this->cookies[$domain]);
318 } elseif (!isset($name)) { 318 } elseif (!isset($name)) {
319 if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); 319 if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]);
320 } elseif (isset($name)) { 320 } elseif (isset($name)) {
321 if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); 321 if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
322 } 322 }
323 } 323 }
324 324
325 /** 325 /**
326 * Compare string length - used for sorting 326 * Compare string length - used for sorting
327 * @access private 327 * @access private
328 * @return int 328 * @return int
329 */ 329 */
330 function _cmp_length($a, $b) 330 function _cmp_length($a, $b)
331 { 331 {
332 $la = strlen($a); $lb = strlen($b); 332 $la = strlen($a); $lb = strlen($b);
333 if ($la == $lb) return 0; 333 if ($la == $lb) return 0;
334 return ($la > $lb) ? -1 : 1; 334 return ($la > $lb) ? -1 : 1;
335 } 335 }
336 336
337 /** 337 /**
338 * Reduce domain 338 * Reduce domain
339 * @param string $domain 339 * @param string $domain
340 * @return string 340 * @return string
341 * @access private 341 * @access private
342 */ 342 */
343 function _reduce_domain($domain) 343 function _reduce_domain($domain)
344 { 344 {
345 if ($domain == '') return ''; 345 if ($domain == '') return '';
346 if (substr($domain, 0, 1) == '.') return substr($domain, 1); 346 if (substr($domain, 0, 1) == '.') return substr($domain, 1);
347 return substr($domain, strpos($domain, '.')); 347 return substr($domain, strpos($domain, '.'));
348 } 348 }
349 349
350 /** 350 /**
351 * Path match - check if path1 path-matches path2 351 * Path match - check if path1 path-matches path2
352 * 352 *
353 * From RFC 2965: 353 * From RFC 2965:
354 * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2 354 * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
355 * if P2 is a prefix of P1 (including the case where P1 and P2 string- 355 * if P2 is a prefix of P1 (including the case where P1 and P2 string-
356 * compare equal). Thus, the string /tec/waldo path-matches /tec.</i> 356 * compare equal). Thus, the string /tec/waldo path-matches /tec.</i>
357 * @param string $path1 357 * @param string $path1
358 * @param string $path2 358 * @param string $path2
359 * @return bool 359 * @return bool
360 * @access private 360 * @access private
361 */ 361 */
362 function _path_match($path1, $path2) 362 function _path_match($path1, $path2)
363 { 363 {
364 return (substr($path1, 0, strlen($path2)) == $path2); 364 return (substr($path1, 0, strlen($path2)) == $path2);
365 } 365 }
366 366
367 /** 367 /**
368 * Domain match - check if domain1 domain-matches domain2 368 * Domain match - check if domain1 domain-matches domain2
369 * 369 *
370 * A few extracts from RFC 2965: 370 * A few extracts from RFC 2965:
371 * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com 371 * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
372 * would be rejected, because H is y.x and contains a dot. 372 * would be rejected, because H is y.x and contains a dot.
373 * 373 *
374 * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com 374 * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
375 * would be accepted. 375 * would be accepted.
376 * 376 *
377 * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be 377 * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be
378 * rejected, because there is no embedded dot. 378 * rejected, because there is no embedded dot.
379 * 379 *
380 * - A Set-Cookie2 from request-host example for Domain=.local will 380 * - A Set-Cookie2 from request-host example for Domain=.local will
381 * be accepted, because the effective host name for the request- 381 * be accepted, because the effective host name for the request-
382 * host is example.local, and example.local domain-matches .local. 382 * host is example.local, and example.local domain-matches .local.
383 * 383 *
384 * I'm ignoring the first point for now (must check to see how other browsers handle 384 * I'm ignoring the first point for now (must check to see how other browsers handle
385 * this rule for Set-Cookie headers) 385 * this rule for Set-Cookie headers)
386 * 386 *
387 * @param string $domain1 387 * @param string $domain1
388 * @param string $domain2 388 * @param string $domain2
389 * @return bool 389 * @return bool
390 * @access private 390 * @access private
391 */ 391 */
392 function _domain_match($domain1, $domain2) 392 function _domain_match($domain1, $domain2)
393 { 393 {
394 $domain1 = strtolower($domain1); 394 $domain1 = strtolower($domain1);
395 $domain2 = strtolower($domain2); 395 $domain2 = strtolower($domain2);
396 while (strpos($domain1, '.') !== false) { 396 while (strpos($domain1, '.') !== false) {
397 if ($domain1 == $domain2) return true; 397 if ($domain1 == $domain2) return true;
398 $domain1 = $this->_reduce_domain($domain1); 398 $domain1 = $this->_reduce_domain($domain1);
399 continue; 399 continue;
400 } 400 }
401 return false; 401 return false;
402 } 402 }
403} 403} \ No newline at end of file
404?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
index e4f1b3b3..963f0c05 100644
--- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -1,779 +1,810 @@
1<?php 1<?php
2/** 2/**
3 * Humble HTTP Agent 3 * Humble HTTP Agent
4 * 4 *
5 * This class is designed to take advantage of parallel HTTP requests 5 * This class is designed to take advantage of parallel HTTP requests
6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions. 6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
7 * For environments which do not have these options, it reverts to standard sequential 7 * For environments which do not have these options, it reverts to standard sequential
8 * requests (using file_get_contents()) 8 * requests (using file_get_contents())
9 * 9 *
10 * @version 1.1 10 * @version 1.4
11 * @date 2012-08-20 11 * @date 2013-05-10
12 * @see http://php.net/HttpRequestPool 12 * @see http://php.net/HttpRequestPool
13 * @author Keyvan Minoukadeh 13 * @author Keyvan Minoukadeh
14 * @copyright 2011-2012 Keyvan Minoukadeh 14 * @copyright 2011-2013 Keyvan Minoukadeh
15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
16 */ 16 */
17 17
18class HumbleHttpAgent 18class HumbleHttpAgent
19{ 19{
20 const METHOD_REQUEST_POOL = 1; 20 const METHOD_REQUEST_POOL = 1;
21 const METHOD_CURL_MULTI = 2; 21 const METHOD_CURL_MULTI = 2;
22 const METHOD_FILE_GET_CONTENTS = 4; 22 const METHOD_FILE_GET_CONTENTS = 4;
23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; 23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; 24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
25 const UA_PHP = 'PHP/5.2'; 25 const UA_PHP = 'PHP/5.4';
26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; 26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
27 27
28 protected $requests = array(); 28 protected $requests = array();
29 protected $redirectQueue = array(); 29 protected $redirectQueue = array();
30 protected $requestOptions; 30 protected $requestOptions;
31 protected $maxParallelRequests = 5; 31 protected $maxParallelRequests = 5;
32 protected $cache = null; //TODO 32 protected $cache = null; //TODO
33 protected $httpContext; 33 protected $httpContext;
34 protected $minimiseMemoryUse = false; //TODO 34 protected $minimiseMemoryUse = false; //TODO
35 protected $method; 35 protected $method;
36 protected $cookieJar; 36 protected $cookieJar;
37 public $debug = false; 37 public $debug = false;
38 public $debugVerbose = false; 38 public $debugVerbose = false;
39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html 39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
40 public $maxRedirects = 5; 40 public $maxRedirects = 5;
41 public $userAgentMap = array(); 41 public $userAgentMap = array();
42 public $rewriteUrls = array(); 42 public $rewriteUrls = array();
43 public $userAgentDefault; 43 public $userAgentDefault;
44 public $referer; 44 public $referer;
45 //public $userAgent = 'Mozilla/5.0'; 45 //public $userAgent = 'Mozilla/5.0';
46 46
47 // Prevent certain file/mime types 47 // Prevent certain file/mime types
48 // HTTP responses which match these content types will 48 // HTTP responses which match these content types will
49 // be returned without body. 49 // be returned without body.
50 public $headerOnlyTypes = array(); 50 public $headerOnlyTypes = array();
51 // URLs ending with one of these extensions will 51 // URLs ending with one of these extensions will
52 // prompt Humble HTTP Agent to send a HEAD request first 52 // prompt Humble HTTP Agent to send a HEAD request first
53 // to see if returned content type matches $headerOnlyTypes. 53 // to see if returned content type matches $headerOnlyTypes.
54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); 54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
55 // AJAX triggers to search for. 55 // AJAX triggers to search for.
56 // for AJAX sites, e.g. Blogger with its dynamic views templates. 56 // for AJAX sites, e.g. Blogger with its dynamic views templates.
57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); 57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
58 58
59 //TODO: set max file size 59 //TODO: set max file size
60 //TODO: normalise headers 60 //TODO: normalise headers
61 61
62 function __construct($requestOptions=null, $method=null) { 62 function __construct($requestOptions=null, $method=null) {
63 $this->userAgentDefault = self::UA_BROWSER; 63 $this->userAgentDefault = self::UA_BROWSER;
64 $this->referer = self::REF_GOOGLE; 64 $this->referer = self::REF_GOOGLE;
65 // set the request method 65 // set the request method
66 if (in_array($method, array(1,2,4))) { 66 if (in_array($method, array(1,2,4))) {
67 $this->method = $method; 67 $this->method = $method;
68 } else { 68 } else {
69 if (class_exists('HttpRequestPool')) { 69 if (class_exists('HttpRequestPool')) {
70 $this->method = self::METHOD_REQUEST_POOL; 70 $this->method = self::METHOD_REQUEST_POOL;
71 } elseif (function_exists('curl_multi_init')) { 71 } elseif (function_exists('curl_multi_init')) {
72 $this->method = self::METHOD_CURL_MULTI; 72 $this->method = self::METHOD_CURL_MULTI;
73 } else { 73 } else {
74 $this->method = self::METHOD_FILE_GET_CONTENTS; 74 $this->method = self::METHOD_FILE_GET_CONTENTS;
75 } 75 }
76 } 76 }
77 if ($this->method == self::METHOD_CURL_MULTI) { 77 if ($this->method == self::METHOD_CURL_MULTI) {
78 require_once(dirname(__FILE__).'/RollingCurl.php'); 78 require_once(dirname(__FILE__).'/RollingCurl.php');
79 } 79 }
80 // create cookie jar 80 // create cookie jar
81 $this->cookieJar = new CookieJar(); 81 $this->cookieJar = new CookieJar();
82 // set request options (redirect must be 0) 82 // set request options (redirect must be 0)
83 $this->requestOptions = array( 83 $this->requestOptions = array(
84 'timeout' => 15, 84 'timeout' => 15,
85 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web 85 'connecttimeout' => 15,
86 // TODO: test onprogress? 86 'dns_cache_timeout' => 300,
87 ); 87 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
88 if (is_array($requestOptions)) { 88 // TODO: test onprogress?
89 $this->requestOptions = array_merge($this->requestOptions, $requestOptions); 89 );
90 } 90 if (is_array($requestOptions)) {
91 $this->httpContext = array( 91 $this->requestOptions = array_merge($this->requestOptions, $requestOptions);
92 'http' => array( 92 }
93 'ignore_errors' => true, 93 $this->httpContext = array(
94 'timeout' => $this->requestOptions['timeout'], 94 'http' => array(
95 'max_redirects' => $this->requestOptions['redirect'], 95 'ignore_errors' => true,
96 'header' => "Accept: */*\r\n" 96 'timeout' => $this->requestOptions['timeout'],
97 ) 97 'max_redirects' => $this->requestOptions['redirect'],
98 ); 98 'header' => "Accept: */*\r\n"
99 } 99 )
100 100 );
101 protected function debug($msg) { 101 }
102 if ($this->debug) { 102
103 $mem = round(memory_get_usage()/1024, 2); 103 protected function debug($msg) {
104 $memPeak = round(memory_get_peak_usage()/1024, 2); 104 if ($this->debug) {
105 echo '* ',$msg; 105 $mem = round(memory_get_usage()/1024, 2);
106 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; 106 $memPeak = round(memory_get_peak_usage()/1024, 2);
107 echo "\n"; 107 echo '* ',$msg;
108 ob_flush(); 108 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
109 flush(); 109 echo "\n";
110 } 110 ob_flush();
111 } 111 flush();
112 112 }
113 protected function getUserAgent($url, $asArray=false) { 113 }
114 $host = @parse_url($url, PHP_URL_HOST); 114
115 if (strtolower(substr($host, 0, 4)) == 'www.') { 115 protected function getUserAgent($url, $asArray=false) {
116 $host = substr($host, 4); 116 $host = @parse_url($url, PHP_URL_HOST);
117 } 117 if (strtolower(substr($host, 0, 4)) == 'www.') {
118 if ($host) { 118 $host = substr($host, 4);
119 $try = array($host); 119 }
120 $split = explode('.', $host); 120 if ($host) {
121 if (count($split) > 1) { 121 $try = array($host);
122 array_shift($split); 122 $split = explode('.', $host);
123 $try[] = '.'.implode('.', $split); 123 if (count($split) > 1) {
124 } 124 array_shift($split);
125 foreach ($try as $h) { 125 $try[] = '.'.implode('.', $split);
126 if (isset($this->userAgentMap[$h])) { 126 }
127 $ua = $this->userAgentMap[$h]; 127 foreach ($try as $h) {
128 break; 128 if (isset($this->userAgentMap[$h])) {
129 } 129 $ua = $this->userAgentMap[$h];
130 } 130 break;
131 } 131 }
132 if (!isset($ua)) $ua = $this->userAgentDefault; 132 }
133 if ($asArray) { 133 }
134 return array('User-Agent' => $ua); 134 if (!isset($ua)) $ua = $this->userAgentDefault;
135 } else { 135 if ($asArray) {
136 return 'User-Agent: '.$ua; 136 return array('User-Agent' => $ua);
137 } 137 } else {
138 } 138 return 'User-Agent: '.$ua;
139 139 }
140 public function rewriteHashbangFragment($url) { 140 }
141 // return $url if there's no '#!' 141
142 if (strpos($url, '#!') === false) return $url; 142 public function rewriteHashbangFragment($url) {
143 // split $url and rewrite 143 // return $url if there's no '#!'
144 // TODO: is SimplePie_IRI included? 144 if (strpos($url, '#!') === false) return $url;
145 $iri = new SimplePie_IRI($url); 145 // split $url and rewrite
146 $fragment = substr($iri->fragment, 1); // strip '!' 146 // TODO: is SimplePie_IRI included?
147 $iri->fragment = null; 147 $iri = new SimplePie_IRI($url);
148 if (isset($iri->query)) { 148 $fragment = substr($iri->fragment, 1); // strip '!'
149 parse_str($iri->query, $query); 149 $iri->fragment = null;
150 } else { 150 if (isset($iri->query)) {
151 $query = array(); 151 parse_str($iri->query, $query);
152 } 152 } else {
153 $query['_escaped_fragment_'] = (string)$fragment; 153 $query = array();
154 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites 154 }
155 return $iri->get_iri(); 155 $query['_escaped_fragment_'] = (string)$fragment;
156 } 156 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
157 157 return $iri->get_iri();
158 public function getUglyURL($url, $html) { 158 }
159 if ($html == '') return false; 159
160 $found = false; 160 public function getRedirectURLfromHTML($url, $html) {
161 foreach ($this->ajaxTriggers as $string) { 161 $redirect_url = $this->getMetaRefreshURL($url, $html);
162 if (stripos($html, $string)) { 162 if (!$redirect_url) {
163 $found = true; 163 $redirect_url = $this->getUglyURL($url, $html);
164 break; 164 }
165 } 165 return $redirect_url;
166 } 166 }
167 if (!$found) return false; 167
168 $iri = new SimplePie_IRI($url); 168 public function getMetaRefreshURL($url, $html) {
169 if (isset($iri->query)) { 169 if ($html == '') return false;
170 parse_str($iri->query, $query); 170 // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
171 } else { 171 if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) {
172 $query = array(); 172 return false;
173 } 173 }
174 $query['_escaped_fragment_'] = ''; 174 $redirect_url = $match[1];
175 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites 175 if (preg_match('!^https?://!i', $redirect_url)) {
176 return $iri->get_iri(); 176 // already absolute
177 } 177 $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
178 178 return $redirect_url;
179 public function removeFragment($url) { 179 }
180 $pos = strpos($url, '#'); 180 // absolutize redirect URL
181 if ($pos === false) { 181 $base = new SimplePie_IRI($url);
182 return $url; 182 // remove '//' in URL path (causes URLs not to resolve properly)
183 } else { 183 if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
184 return substr($url, 0, $pos); 184 if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
185 } 185 $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
186 } 186 return $absolute;
187 187 }
188 public function rewriteUrls($url) { 188 return false;
189 foreach ($this->rewriteUrls as $find => $action) { 189 }
190 if (strpos($url, $find) !== false) { 190
191 if (is_array($action)) { 191 public function getUglyURL($url, $html) {
192 return strtr($url, $action); 192 if ($html == '') return false;
193 } 193 $found = false;
194 } 194 foreach ($this->ajaxTriggers as $string) {
195 } 195 if (stripos($html, $string)) {
196 return $url; 196 $found = true;
197 } 197 break;
198 198 }
199 public function enableDebug($bool=true) { 199 }
200 $this->debug = (bool)$bool; 200 if (!$found) return false;
201 } 201 $iri = new SimplePie_IRI($url);
202 202 if (isset($iri->query)) {
203 public function minimiseMemoryUse($bool = true) { 203 parse_str($iri->query, $query);
204 $this->minimiseMemoryUse = $bool; 204 } else {
205 } 205 $query = array();
206 206 }
207 public function setMaxParallelRequests($max) { 207 $query['_escaped_fragment_'] = '';
208 $this->maxParallelRequests = $max; 208 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
209 } 209 $ugly_url = $iri->get_iri();
210 210 $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
211 public function validateUrl($url) { 211 return $ugly_url;
212 $url = filter_var($url, FILTER_SANITIZE_URL); 212 }
213 $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); 213
214 // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) 214 public function removeFragment($url) {
215 if ($test === false) { 215 $pos = strpos($url, '#');
216 $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); 216 if ($pos === false) {
217 } 217 return $url;
218 if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { 218 } else {
219 return $url; 219 return substr($url, 0, $pos);
220 } else { 220 }
221 return false; 221 }
222 } 222
223 } 223 public function rewriteUrls($url) {
224 224 foreach ($this->rewriteUrls as $find => $action) {
225 public function fetchAll(array $urls) { 225 if (strpos($url, $find) !== false) {
226 $this->fetchAllOnce($urls, $isRedirect=false); 226 if (is_array($action)) {
227 $redirects = 0; 227 return strtr($url, $action);
228 while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { 228 }
229 $this->debug("Following redirects #$redirects..."); 229 }
230 $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); 230 }
231 } 231 return $url;
232 } 232 }
233 233
234 // fetch all URLs without following redirects 234 public function enableDebug($bool=true) {
235 public function fetchAllOnce(array $urls, $isRedirect=false) { 235 $this->debug = (bool)$bool;
236 if (!$isRedirect) $urls = array_unique($urls); 236 }
237 if (empty($urls)) return; 237
238 238 public function minimiseMemoryUse($bool = true) {
239 ////////////////////////////////////////////////////// 239 $this->minimiseMemoryUse = $bool;
240 // parallel (HttpRequestPool) 240 }
241 if ($this->method == self::METHOD_REQUEST_POOL) { 241
242 $this->debug('Starting parallel fetch (HttpRequestPool)'); 242 public function setMaxParallelRequests($max) {
243 try { 243 $this->maxParallelRequests = $max;
244 while (count($urls) > 0) { 244 }
245 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); 245
246 $subset = array_splice($urls, 0, $this->maxParallelRequests); 246 public function validateUrl($url) {
247 $pool = new HttpRequestPool(); 247 $url = filter_var($url, FILTER_SANITIZE_URL);
248 foreach ($subset as $orig => $url) { 248 $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
249 if (!$isRedirect) $orig = $url; 249 // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
250 unset($this->redirectQueue[$orig]); 250 if ($test === false) {
251 $this->debug("...$url"); 251 $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
252 if (!$isRedirect && isset($this->requests[$url])) { 252 }
253 $this->debug("......in memory"); 253 if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
254 /* 254 return $url;
255 } elseif ($this->isCached($url)) { 255 } else {
256 $this->debug("......is cached"); 256 return false;
257 if (!$this->minimiseMemoryUse) { 257 }
258 $this->requests[$url] = $this->getCached($url); 258 }
259 } 259
260 */ 260 public function fetchAll(array $urls) {
261 } else { 261 $this->fetchAllOnce($urls, $isRedirect=false);
262 $this->debug("......adding to pool"); 262 $redirects = 0;
263 $req_url = $this->rewriteUrls($url); 263 while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
264 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 264 $this->debug("Following redirects #$redirects...");
265 $req_url = $this->removeFragment($req_url); 265 $this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
266 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { 266 }
267 $_meth = HttpRequest::METH_HEAD; 267 }
268 } else { 268
269 $_meth = HttpRequest::METH_GET; 269 // fetch all URLs without following redirects
270 unset($this->requests[$orig]['wrongGuess']); 270 public function fetchAllOnce(array $urls, $isRedirect=false) {
271 } 271 if (!$isRedirect) $urls = array_unique($urls);
272 $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); 272 if (empty($urls)) return;
273 // send cookies, if we have any 273
274 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 274 //////////////////////////////////////////////////////
275 $this->debug("......sending cookies: $cookies"); 275 // parallel (HttpRequestPool)
276 $httpRequest->addHeaders(array('Cookie' => $cookies)); 276 if ($this->method == self::METHOD_REQUEST_POOL) {
277 } 277 $this->debug('Starting parallel fetch (HttpRequestPool)');
278 //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); 278 try {
279 $httpRequest->addHeaders($this->getUserAgent($req_url, true)); 279 while (count($urls) > 0) {
280 // add referer for picky sites 280 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
281 $httpRequest->addheaders(array('Referer' => $this->referer)); 281 $subset = array_splice($urls, 0, $this->maxParallelRequests);
282 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); 282 $pool = new HttpRequestPool();
283 $this->requests[$orig]['original_url'] = $orig; 283 foreach ($subset as $orig => $url) {
284 $pool->attach($httpRequest); 284 if (!$isRedirect) $orig = $url;
285 } 285 unset($this->redirectQueue[$orig]);
286 } 286 $this->debug("...$url");
287 // did we get anything into the pool? 287 if (!$isRedirect && isset($this->requests[$url])) {
288 if (count($pool) > 0) { 288 $this->debug("......in memory");
289 $this->debug('Sending request...'); 289 /*
290 try { 290 } elseif ($this->isCached($url)) {
291 $pool->send(); 291 $this->debug("......is cached");
292 } catch (HttpRequestPoolException $e) { 292 if (!$this->minimiseMemoryUse) {
293 // do nothing 293 $this->requests[$url] = $this->getCached($url);
294 } 294 }
295 $this->debug('Received responses'); 295 */
296 foreach($subset as $orig => $url) { 296 } else {
297 if (!$isRedirect) $orig = $url; 297 $this->debug("......adding to pool");
298 $request = $this->requests[$orig]['httpRequest']; 298 $req_url = $this->rewriteUrls($url);
299 //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); 299 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
300 // getResponseHeader() doesn't return status line, so, for consistency... 300 $req_url = $this->removeFragment($req_url);
301 $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); 301 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
302 // check content type 302 $_meth = HttpRequest::METH_HEAD;
303 // TODO: use getResponseHeader('content-type') or getResponseInfo() 303 } else {
304 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 304 $_meth = HttpRequest::METH_GET;
305 $this->requests[$orig]['body'] = ''; 305 unset($this->requests[$orig]['wrongGuess']);
306 $_header_only_type = true; 306 }
307 $this->debug('Header only type returned'); 307 $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
308 } else { 308 // send cookies, if we have any
309 $this->requests[$orig]['body'] = $request->getResponseBody(); 309 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
310 $_header_only_type = false; 310 $this->debug("......sending cookies: $cookies");
311 } 311 $httpRequest->addHeaders(array('Cookie' => $cookies));
312 $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); 312 }
313 $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); 313 //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
314 // is redirect? 314 $httpRequest->addHeaders($this->getUserAgent($req_url, true));
315 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { 315 // add referer for picky sites
316 $redirectURL = $request->getResponseHeader('location'); 316 $httpRequest->addheaders(array('Referer' => $this->referer));
317 if (!preg_match('!^https?://!i', $redirectURL)) { 317 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
318 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 318 $this->requests[$orig]['original_url'] = $orig;
319 } 319 $pool->attach($httpRequest);
320 if ($this->validateURL($redirectURL)) { 320 }
321 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 321 }
322 // store any cookies 322 // did we get anything into the pool?
323 $cookies = $request->getResponseHeader('set-cookie'); 323 if (count($pool) > 0) {
324 if ($cookies && !is_array($cookies)) $cookies = array($cookies); 324 $this->debug('Sending request...');
325 if ($cookies) $this->cookieJar->storeCookies($url, $cookies); 325 try {
326 $this->redirectQueue[$orig] = $redirectURL; 326 $pool->send();
327 } else { 327 } catch (HttpRequestPoolException $e) {
328 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 328 // do nothing
329 } 329 }
330 } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { 330 $this->debug('Received responses');
331 // the response content-type did not match our 'header only' types, 331 foreach($subset as $orig => $url) {
332 // but we'd issues a HEAD request because we assumed it would. So 332 if (!$isRedirect) $orig = $url;
333 // let's queue a proper GET request for this item... 333 $request = $this->requests[$orig]['httpRequest'];
334 $this->debug('Wrong guess at content-type, queing GET request'); 334 //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
335 $this->requests[$orig]['wrongGuess'] = true; 335 // getResponseHeader() doesn't return status line, so, for consistency...
336 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; 336 $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
337 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 337 // check content type
338 // check for <meta name='fragment' content='!'/> 338 // TODO: use getResponseHeader('content-type') or getResponseInfo()
339 // for AJAX sites, e.g. Blogger with its dynamic views templates. 339 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
340 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 340 $this->requests[$orig]['body'] = '';
341 if (isset($this->requests[$orig]['body'])) { 341 $_header_only_type = true;
342 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 342 $this->debug('Header only type returned');
343 if ($redirectURL) { 343 } else {
344 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 344 $this->requests[$orig]['body'] = $request->getResponseBody();
345 $this->redirectQueue[$orig] = $redirectURL; 345 $_header_only_type = false;
346 } 346 }
347 } 347 $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
348 } 348 $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
349 //die($url.' -multi- '.$request->getResponseInfo('effective_url')); 349 // is redirect?
350 $pool->detach($request); 350 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
351 unset($this->requests[$orig]['httpRequest'], $request); 351 $redirectURL = $request->getResponseHeader('location');
352 /* 352 if (!preg_match('!^https?://!i', $redirectURL)) {
353 if ($this->minimiseMemoryUse) { 353 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
354 if ($this->cache($url)) { 354 }
355 unset($this->requests[$url]); 355 if ($this->validateURL($redirectURL)) {
356 } 356 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
357 } 357 // store any cookies
358 */ 358 $cookies = $request->getResponseHeader('set-cookie');
359 } 359 if ($cookies && !is_array($cookies)) $cookies = array($cookies);
360 } 360 if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
361 } 361 $this->redirectQueue[$orig] = $redirectURL;
362 } catch (HttpException $e) { 362 } else {
363 $this->debug($e); 363 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
364 return false; 364 }
365 } 365 } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
366 } 366 // the response content-type did not match our 'header only' types,
367 367 // but we'd issues a HEAD request because we assumed it would. So
368 ////////////////////////////////////////////////////////// 368 // let's queue a proper GET request for this item...
369 // parallel (curl_multi_*) 369 $this->debug('Wrong guess at content-type, queing GET request');
370 elseif ($this->method == self::METHOD_CURL_MULTI) { 370 $this->requests[$orig]['wrongGuess'] = true;
371 $this->debug('Starting parallel fetch (curl_multi_*)'); 371 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
372 while (count($urls) > 0) { 372 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
373 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); 373 // check for <meta name='fragment' content='!'/>
374 $subset = array_splice($urls, 0, $this->maxParallelRequests); 374 // for AJAX sites, e.g. Blogger with its dynamic views templates.
375 $pool = new RollingCurl(array($this, 'handleCurlResponse')); 375 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
376 $pool->window_size = count($subset); 376 if (isset($this->requests[$orig]['body'])) {
377 377 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
378 foreach ($subset as $orig => $url) { 378 if ($redirectURL) {
379 if (!$isRedirect) $orig = $url; 379 $this->redirectQueue[$orig] = $redirectURL;
380 unset($this->redirectQueue[$orig]); 380 }
381 $this->debug("...$url"); 381 }
382 if (!$isRedirect && isset($this->requests[$url])) { 382 }
383 $this->debug("......in memory"); 383 //die($url.' -multi- '.$request->getResponseInfo('effective_url'));
384 /* 384 $pool->detach($request);
385 } elseif ($this->isCached($url)) { 385 unset($this->requests[$orig]['httpRequest'], $request);
386 $this->debug("......is cached"); 386 /*
387 if (!$this->minimiseMemoryUse) { 387 if ($this->minimiseMemoryUse) {
388 $this->requests[$url] = $this->getCached($url); 388 if ($this->cache($url)) {
389 } 389 unset($this->requests[$url]);
390 */ 390 }
391 } else { 391 }
392 $this->debug("......adding to pool"); 392 */
393 $req_url = $this->rewriteUrls($url); 393 }
394 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 394 }
395 $req_url = $this->removeFragment($req_url); 395 }
396 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { 396 } catch (HttpException $e) {
397 $_meth = 'HEAD'; 397 $this->debug($e);
398 } else { 398 return false;
399 $_meth = 'GET'; 399 }
400 unset($this->requests[$orig]['wrongGuess']); 400 }
401 } 401
402 $headers = array(); 402 //////////////////////////////////////////////////////////
403 //$headers[] = 'User-Agent: '.$this->userAgent; 403 // parallel (curl_multi_*)
404 $headers[] = $this->getUserAgent($req_url); 404 elseif ($this->method == self::METHOD_CURL_MULTI) {
405 // add referer for picky sites 405 $this->debug('Starting parallel fetch (curl_multi_*)');
406 $headers[] = 'Referer: '.$this->referer; 406 while (count($urls) > 0) {
407 // send cookies, if we have any 407 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
408 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 408 $subset = array_splice($urls, 0, $this->maxParallelRequests);
409 $this->debug("......sending cookies: $cookies"); 409 $pool = new RollingCurl(array($this, 'handleCurlResponse'));
410 $headers[] = 'Cookie: '.$cookies; 410 $pool->window_size = count($subset);
411 } 411
412 $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( 412 foreach ($subset as $orig => $url) {
413 CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], 413 if (!$isRedirect) $orig = $url;
414 CURLOPT_TIMEOUT => $this->requestOptions['timeout'] 414 unset($this->redirectQueue[$orig]);
415 )); 415 $this->debug("...$url");
416 $httpRequest->set_original_url($orig); 416 if (!$isRedirect && isset($this->requests[$url])) {
417 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); 417 $this->debug("......in memory");
418 $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? 418 /*
419 $pool->add($httpRequest); 419 } elseif ($this->isCached($url)) {
420 } 420 $this->debug("......is cached");
421 } 421 if (!$this->minimiseMemoryUse) {
422 // did we get anything into the pool? 422 $this->requests[$url] = $this->getCached($url);
423 if (count($pool) > 0) { 423 }
424 $this->debug('Sending request...'); 424 */
425 $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] 425 } else {
426 $this->debug('Received responses'); 426 $this->debug("......adding to pool");
427 foreach($subset as $orig => $url) { 427 $req_url = $this->rewriteUrls($url);
428 if (!$isRedirect) $orig = $url; 428 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
429 // $this->requests[$orig]['headers'] 429 $req_url = $this->removeFragment($req_url);
430 // $this->requests[$orig]['body'] 430 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
431 // $this->requests[$orig]['effective_url'] 431 $_meth = 'HEAD';
432 // check content type 432 } else {
433 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 433 $_meth = 'GET';
434 $this->requests[$orig]['body'] = ''; 434 unset($this->requests[$orig]['wrongGuess']);
435 $_header_only_type = true; 435 }
436 $this->debug('Header only type returned'); 436 $headers = array();
437 } else { 437 //$headers[] = 'User-Agent: '.$this->userAgent;
438 $_header_only_type = false; 438 $headers[] = $this->getUserAgent($req_url);
439 } 439 // add referer for picky sites
440 $status_code = $this->requests[$orig]['status_code']; 440 $headers[] = 'Referer: '.$this->referer;
441 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { 441 // send cookies, if we have any
442 $redirectURL = $this->requests[$orig]['location']; 442 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
443 if (!preg_match('!^https?://!i', $redirectURL)) { 443 $this->debug("......sending cookies: $cookies");
444 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 444 $headers[] = 'Cookie: '.$cookies;
445 } 445 }
446 if ($this->validateURL($redirectURL)) { 446 $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
447 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 447 CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
448 // store any cookies 448 CURLOPT_TIMEOUT => $this->requestOptions['timeout']
449 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); 449 ));
450 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); 450 $httpRequest->set_original_url($orig);
451 $this->redirectQueue[$orig] = $redirectURL; 451 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
452 } else { 452 $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
453 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 453 $pool->add($httpRequest);
454 } 454 }
455 } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { 455 }
456 // the response content-type did not match our 'header only' types, 456 // did we get anything into the pool?
457 // but we'd issues a HEAD request because we assumed it would. So 457 if (count($pool) > 0) {
458 // let's queue a proper GET request for this item... 458 $this->debug('Sending request...');
459 $this->debug('Wrong guess at content-type, queing GET request'); 459 $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
460 $this->requests[$orig]['wrongGuess'] = true; 460 $this->debug('Received responses');
461 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; 461 foreach($subset as $orig => $url) {
462 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 462 if (!$isRedirect) $orig = $url;
463 // check for <meta name='fragment' content='!'/> 463 // $this->requests[$orig]['headers']
464 // for AJAX sites, e.g. Blogger with its dynamic views templates. 464 // $this->requests[$orig]['body']
465 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 465 // $this->requests[$orig]['effective_url']
466 if (isset($this->requests[$orig]['body'])) { 466 // check content type
467 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 467 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
468 if ($redirectURL) { 468 $this->requests[$orig]['body'] = '';
469 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 469 $_header_only_type = true;
470 $this->redirectQueue[$orig] = $redirectURL; 470 $this->debug('Header only type returned');
471 } 471 } else {
472 } 472 $_header_only_type = false;
473 } 473 }
474 // die($url.' -multi- '.$request->getResponseInfo('effective_url')); 474 $status_code = $this->requests[$orig]['status_code'];
475 unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); 475 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
476 } 476 $redirectURL = $this->requests[$orig]['location'];
477 } 477 if (!preg_match('!^https?://!i', $redirectURL)) {
478 } 478 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
479 } 479 }
480 480 if ($this->validateURL($redirectURL)) {
481 ////////////////////////////////////////////////////// 481 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
482 // sequential (file_get_contents) 482 // store any cookies
483 else { 483 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
484 $this->debug('Starting sequential fetch (file_get_contents)'); 484 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
485 $this->debug('Processing set of '.count($urls)); 485 $this->redirectQueue[$orig] = $redirectURL;
486 foreach ($urls as $orig => $url) { 486 } else {
487 if (!$isRedirect) $orig = $url; 487 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
488 unset($this->redirectQueue[$orig]); 488 }
489 $this->debug("...$url"); 489 } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
490 if (!$isRedirect && isset($this->requests[$url])) { 490 // the response content-type did not match our 'header only' types,
491 $this->debug("......in memory"); 491 // but we'd issues a HEAD request because we assumed it would. So
492 /* 492 // let's queue a proper GET request for this item...
493 } elseif ($this->isCached($url)) { 493 $this->debug('Wrong guess at content-type, queing GET request');
494 $this->debug("......is cached"); 494 $this->requests[$orig]['wrongGuess'] = true;
495 if (!$this->minimiseMemoryUse) { 495 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
496 $this->requests[$url] = $this->getCached($url); 496 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
497 } 497 // check for <meta name='fragment' content='!'/>
498 */ 498 // for AJAX sites, e.g. Blogger with its dynamic views templates.
499 } else { 499 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
500 $this->debug("Sending request for $url"); 500 if (isset($this->requests[$orig]['body'])) {
501 $this->requests[$orig]['original_url'] = $orig; 501 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
502 $req_url = $this->rewriteUrls($url); 502 if ($redirectURL) {
503 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 503 $this->redirectQueue[$orig] = $redirectURL;
504 $req_url = $this->removeFragment($req_url); 504 }
505 // send cookies, if we have any 505 }
506 $httpContext = $this->httpContext; 506 }
507 $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; 507 // die($url.' -multi- '.$request->getResponseInfo('effective_url'));
508 // add referer for picky sites 508 unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
509 $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; 509 }
510 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 510 }
511 $this->debug("......sending cookies: $cookies"); 511 }
512 $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; 512 }
513 } 513
514 if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { 514 //////////////////////////////////////////////////////
515 $this->debug('Received response'); 515 // sequential (file_get_contents)
516 // get status code 516 else {
517 if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { 517 $this->debug('Starting sequential fetch (file_get_contents)');
518 $this->debug('Error: no status code found'); 518 $this->debug('Processing set of '.count($urls));
519 // TODO: handle error - no status code 519 foreach ($urls as $orig => $url) {
520 } else { 520 if (!$isRedirect) $orig = $url;
521 $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); 521 unset($this->redirectQueue[$orig]);
522 // check content type 522 $this->debug("...$url");
523 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 523 if (!$isRedirect && isset($this->requests[$url])) {
524 $this->requests[$orig]['body'] = ''; 524 $this->debug("......in memory");
525 } else { 525 /*
526 $this->requests[$orig]['body'] = $html; 526 } elseif ($this->isCached($url)) {
527 } 527 $this->debug("......is cached");
528 $this->requests[$orig]['effective_url'] = $req_url; 528 if (!$this->minimiseMemoryUse) {
529 $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; 529 $this->requests[$url] = $this->getCached($url);
530 unset($match); 530 }
531 // handle redirect 531 */
532 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { 532 } else {
533 $this->requests[$orig]['location'] = trim($match[1]); 533 $this->debug("Sending request for $url");
534 } 534 $this->requests[$orig]['original_url'] = $orig;
535 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { 535 $req_url = $this->rewriteUrls($url);
536 $redirectURL = $this->requests[$orig]['location']; 536 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
537 if (!preg_match('!^https?://!i', $redirectURL)) { 537 $req_url = $this->removeFragment($req_url);
538 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 538 // send cookies, if we have any
539 } 539 $httpContext = $this->httpContext;
540 if ($this->validateURL($redirectURL)) { 540 $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
541 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 541 // add referer for picky sites
542 // store any cookies 542 $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
543 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); 543 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
544 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); 544 $this->debug("......sending cookies: $cookies");
545 $this->redirectQueue[$orig] = $redirectURL; 545 $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
546 } else { 546 }
547 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 547 if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
548 } 548 $this->debug('Received response');
549 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 549 // get status code
550 // check for <meta name='fragment' content='!'/> 550 if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
551 // for AJAX sites, e.g. Blogger with its dynamic views templates. 551 $this->debug('Error: no status code found');
552 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 552 // TODO: handle error - no status code
553 if (isset($this->requests[$orig]['body'])) { 553 } else {
554 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 554 $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
555 if ($redirectURL) { 555 // check content type
556 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 556 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
557 $this->redirectQueue[$orig] = $redirectURL; 557 $this->requests[$orig]['body'] = '';
558 } 558 } else {
559 } 559 $this->requests[$orig]['body'] = $html;
560 } 560 }
561 } 561 $this->requests[$orig]['effective_url'] = $req_url;
562 } else { 562 $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
563 $this->debug('Error retrieving URL'); 563 unset($match);
564 //print_r($req_url); 564 // handle redirect
565 //print_r($http_response_header); 565 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
566 //print_r($html); 566 $this->requests[$orig]['location'] = trim($match[1]);
567 567 }
568 // TODO: handle error - failed to retrieve URL 568 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
569 } 569 $redirectURL = $this->requests[$orig]['location'];
570 } 570 if (!preg_match('!^https?://!i', $redirectURL)) {
571 } 571 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
572 } 572 }
573 } 573 if ($this->validateURL($redirectURL)) {
574 574 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
575 public function handleCurlResponse($response, $info, $request) { 575 // store any cookies
576 $orig = $request->url_original; 576 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
577 $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); 577 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
578 $this->requests[$orig]['body'] = substr($response, $info['header_size']); 578 $this->redirectQueue[$orig] = $redirectURL;
579 $this->requests[$orig]['method'] = $request->method; 579 } else {
580 $this->requests[$orig]['effective_url'] = $info['url']; 580 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
581 $this->requests[$orig]['status_code'] = (int)$info['http_code']; 581 }
582 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { 582 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
583 $this->requests[$orig]['location'] = trim($match[1]); 583 // check for <meta name='fragment' content='!'/>
584 } 584 // for AJAX sites, e.g. Blogger with its dynamic views templates.
585 } 585 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
586 586 if (isset($this->requests[$orig]['body'])) {
587 protected function headersToString(array $headers, $associative=true) { 587 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
588 if (!$associative) { 588 if ($redirectURL) {
589 return implode("\n", $headers); 589 $this->redirectQueue[$orig] = $redirectURL;
590 } else { 590 }
591 $str = ''; 591 }
592 foreach ($headers as $key => $val) { 592 }
593 if (is_array($val)) { 593 }
594 foreach ($val as $v) $str .= "$key: $v\n"; 594 } else {
595 } else { 595 $this->debug('Error retrieving URL');
596 $str .= "$key: $val\n"; 596 //print_r($req_url);
597 } 597 //print_r($http_response_header);
598 } 598 //print_r($html);
599 return rtrim($str); 599
600 } 600 // TODO: handle error - failed to retrieve URL
601 } 601 }
602 602 }
603 public function get($url, $remove=false, $gzdecode=true) { 603 }
604 $url = "$url"; 604 }
605 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { 605 }
606 $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); 606
607 $response = $this->requests[$url]; 607 public function handleCurlResponse($response, $info, $request) {
608 /* 608 $orig = $request->url_original;
609 } elseif ($this->isCached($url)) { 609 $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
610 $this->debug("URL already fetched - in disk cache ($url)"); 610 $this->requests[$orig]['body'] = substr($response, $info['header_size']);
611 $response = $this->getCached($url); 611 $this->requests[$orig]['method'] = $request->method;
612 $this->requests[$url] = $response; 612 $this->requests[$orig]['effective_url'] = $info['url'];
613 */ 613 $this->requests[$orig]['status_code'] = (int)$info['http_code'];
614 } else { 614 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
615 $this->debug("Fetching URL ($url)"); 615 $this->requests[$orig]['location'] = trim($match[1]);
616 $this->fetchAll(array($url)); 616 }
617 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { 617 }
618 $response = $this->requests[$url]; 618
619 } else { 619 protected function headersToString(array $headers, $associative=true) {
620 $this->debug("Request failed"); 620 if (!$associative) {
621 $response = false; 621 return implode("\n", $headers);
622 } 622 } else {
623 } 623 $str = '';
624 /* 624 foreach ($headers as $key => $val) {
625 if ($this->minimiseMemoryUse && $response) { 625 if (is_array($val)) {
626 $this->cache($url); 626 foreach ($val as $v) $str .= "$key: $v\n";
627 unset($this->requests[$url]); 627 } else {
628 } 628 $str .= "$key: $val\n";
629 */ 629 }
630 if ($remove && $response) unset($this->requests[$url]); 630 }
631 if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { 631 return rtrim($str);
632 if ($html = gzdecode($response['body'])) { 632 }
633 $response['body'] = $html; 633 }
634 } 634
635 } 635 public function get($url, $remove=false, $gzdecode=true) {
636 return $response; 636 $url = "$url";
637 } 637 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
638 638 $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
639 public function parallelSupport() { 639 $response = $this->requests[$url];
640 return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); 640 /*
641 } 641 } elseif ($this->isCached($url)) {
642 642 $this->debug("URL already fetched - in disk cache ($url)");
643 private function headerOnlyType($headers) { 643 $response = $this->getCached($url);
644 if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { 644 $this->requests[$url] = $response;
645 // look for full mime type (e.g. image/jpeg) or just type (e.g. image) 645 */
646 $match[1] = strtolower(trim($match[1])); 646 } else {
647 $match[2] = strtolower(trim($match[2])); 647 $this->debug("Fetching URL ($url)");
648 foreach (array($match[1], $match[2]) as $mime) { 648 $this->fetchAll(array($url));
649 if (in_array($mime, $this->headerOnlyTypes)) return true; 649 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
650 } 650 $response = $this->requests[$url];
651 } 651 } else {
652 return false; 652 $this->debug("Request failed");
653 } 653 $response = false;
654 654 }
655 private function possibleUnsupportedType($url) { 655 }
656 $path = @parse_url($url, PHP_URL_PATH); 656 /*
657 if ($path && strpos($path, '.') !== false) { 657 if ($this->minimiseMemoryUse && $response) {
658 $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); 658 $this->cache($url);
659 return in_array($ext, $this->headerOnlyClues); 659 unset($this->requests[$url]);
660 } 660 }
661 return false; 661 */
662 } 662 if ($remove && $response) unset($this->requests[$url]);
663} 663 if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {
664 664 if ($html = gzdecode($response['body'])) {
665// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 665 $response['body'] = $html;
666if (!function_exists('gzdecode')) { 666 }
667 function gzdecode($data,&$filename='',&$error='',$maxlength=null) 667 }
668 { 668 return $response;
669 $len = strlen($data); 669 }
670 if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { 670
671 $error = "Not in GZIP format."; 671 public function parallelSupport() {
672 return null; // Not GZIP format (See RFC 1952) 672 return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
673 } 673 }
674 $method = ord(substr($data,2,1)); // Compression method 674
675 $flags = ord(substr($data,3,1)); // Flags 675 private function headerOnlyType($headers) {
676 if ($flags & 31 != $flags) { 676 if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {
677 $error = "Reserved bits not allowed."; 677 // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
678 return null; 678 $match[1] = strtolower(trim($match[1]));
679 } 679 $match[2] = strtolower(trim($match[2]));
680 // NOTE: $mtime may be negative (PHP integer limitations) 680 foreach (array($match[1], $match[2]) as $mime) {
681 $mtime = unpack("V", substr($data,4,4)); 681 if (in_array($mime, $this->headerOnlyTypes)) return true;
682 $mtime = $mtime[1]; 682 }
683 $xfl = substr($data,8,1); 683 }
684 $os = substr($data,8,1); 684 return false;
685 $headerlen = 10; 685 }
686 $extralen = 0; 686
687 $extra = ""; 687 private function possibleUnsupportedType($url) {
688 if ($flags & 4) { 688 $path = @parse_url($url, PHP_URL_PATH);
689 // 2-byte length prefixed EXTRA data in header 689 if ($path && strpos($path, '.') !== false) {
690 if ($len - $headerlen - 2 < 8) { 690 $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
691 return false; // invalid 691 return in_array($ext, $this->headerOnlyClues);
692 } 692 }
693 $extralen = unpack("v",substr($data,8,2)); 693 return false;
694 $extralen = $extralen[1]; 694 }
695 if ($len - $headerlen - 2 - $extralen < 8) { 695}
696 return false; // invalid 696
697 } 697// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
698 $extra = substr($data,10,$extralen); 698if (!function_exists('gzdecode')) {
699 $headerlen += 2 + $extralen; 699 function gzdecode($data,&$filename='',&$error='',$maxlength=null)
700 } 700 {
701 $filenamelen = 0; 701 $len = strlen($data);
702 $filename = ""; 702 if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
703 if ($flags & 8) { 703 $error = "Not in GZIP format.";
704 // C-style string 704 return null; // Not GZIP format (See RFC 1952)
705 if ($len - $headerlen - 1 < 8) { 705 }
706 return false; // invalid 706 $method = ord(substr($data,2,1)); // Compression method
707 } 707 $flags = ord(substr($data,3,1)); // Flags
708 $filenamelen = strpos(substr($data,$headerlen),chr(0)); 708 if ($flags & 31 != $flags) {
709 if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { 709 $error = "Reserved bits not allowed.";
710 return false; // invalid 710 return null;
711 } 711 }
712 $filename = substr($data,$headerlen,$filenamelen); 712 // NOTE: $mtime may be negative (PHP integer limitations)
713 $headerlen += $filenamelen + 1; 713 $mtime = unpack("V", substr($data,4,4));
714 } 714 $mtime = $mtime[1];
715 $commentlen = 0; 715 $xfl = substr($data,8,1);
716 $comment = ""; 716 $os = substr($data,8,1);
717 if ($flags & 16) { 717 $headerlen = 10;
718 // C-style string COMMENT data in header 718 $extralen = 0;
719 if ($len - $headerlen - 1 < 8) { 719 $extra = "";
720 return false; // invalid 720 if ($flags & 4) {
721 } 721 // 2-byte length prefixed EXTRA data in header
722 $commentlen = strpos(substr($data,$headerlen),chr(0)); 722 if ($len - $headerlen - 2 < 8) {
723 if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { 723 return false; // invalid
724 return false; // Invalid header format 724 }
725 } 725 $extralen = unpack("v",substr($data,8,2));
726 $comment = substr($data,$headerlen,$commentlen); 726 $extralen = $extralen[1];
727 $headerlen += $commentlen + 1; 727 if ($len - $headerlen - 2 - $extralen < 8) {
728 } 728 return false; // invalid
729 $headercrc = ""; 729 }
730 if ($flags & 2) { 730 $extra = substr($data,10,$extralen);
731 // 2-bytes (lowest order) of CRC32 on header present 731 $headerlen += 2 + $extralen;
732 if ($len - $headerlen - 2 < 8) { 732 }
733 return false; // invalid 733 $filenamelen = 0;
734 } 734 $filename = "";
735 $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; 735 if ($flags & 8) {
736 $headercrc = unpack("v", substr($data,$headerlen,2)); 736 // C-style string
737 $headercrc = $headercrc[1]; 737 if ($len - $headerlen - 1 < 8) {
738 if ($headercrc != $calccrc) { 738 return false; // invalid
739 $error = "Header checksum failed."; 739 }
740 return false; // Bad header CRC 740 $filenamelen = strpos(substr($data,$headerlen),chr(0));
741 } 741 if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
742 $headerlen += 2; 742 return false; // invalid
743 } 743 }
744 // GZIP FOOTER 744 $filename = substr($data,$headerlen,$filenamelen);
745 $datacrc = unpack("V",substr($data,-8,4)); 745 $headerlen += $filenamelen + 1;
746 $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); 746 }
747 $isize = unpack("V",substr($data,-4)); 747 $commentlen = 0;
748 $isize = $isize[1]; 748 $comment = "";
749 // decompression: 749 if ($flags & 16) {
750 $bodylen = $len-$headerlen-8; 750 // C-style string COMMENT data in header
751 if ($bodylen < 1) { 751 if ($len - $headerlen - 1 < 8) {
752 // IMPLEMENTATION BUG! 752 return false; // invalid
753 return null; 753 }
754 } 754 $commentlen = strpos(substr($data,$headerlen),chr(0));
755 $body = substr($data,$headerlen,$bodylen); 755 if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
756 $data = ""; 756 return false; // Invalid header format
757 if ($bodylen > 0) { 757 }
758 switch ($method) { 758 $comment = substr($data,$headerlen,$commentlen);
759 case 8: 759 $headerlen += $commentlen + 1;
760 // Currently the only supported compression method: 760 }
761 $data = gzinflate($body,$maxlength); 761 $headercrc = "";
762 break; 762 if ($flags & 2) {
763 default: 763 // 2-bytes (lowest order) of CRC32 on header present
764 $error = "Unknown compression method."; 764 if ($len - $headerlen - 2 < 8) {
765 return false; 765 return false; // invalid
766 } 766 }
767 } // zero-byte body content is allowed 767 $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
768 // Verifiy CRC32 768 $headercrc = unpack("v", substr($data,$headerlen,2));
769 $crc = sprintf("%u",crc32($data)); 769 $headercrc = $headercrc[1];
770 $crcOK = $crc == $datacrc; 770 if ($headercrc != $calccrc) {
771 $lenOK = $isize == strlen($data); 771 $error = "Header checksum failed.";
772 if (!$lenOK || !$crcOK) { 772 return false; // Bad header CRC
773 $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); 773 }
774 return false; 774 $headerlen += 2;
775 } 775 }
776 return $data; 776 // GZIP FOOTER
777 } 777 $datacrc = unpack("V",substr($data,-8,4));
778} 778 $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
779?> \ No newline at end of file 779 $isize = unpack("V",substr($data,-4));
780 $isize = $isize[1];
781 // decompression:
782 $bodylen = $len-$headerlen-8;
783 if ($bodylen < 1) {
784 // IMPLEMENTATION BUG!
785 return null;
786 }
787 $body = substr($data,$headerlen,$bodylen);
788 $data = "";
789 if ($bodylen > 0) {
790 switch ($method) {
791 case 8:
792 // Currently the only supported compression method:
793 $data = gzinflate($body,$maxlength);
794 break;
795 default:
796 $error = "Unknown compression method.";
797 return false;
798 }
799 } // zero-byte body content is allowed
800 // Verifiy CRC32
801 $crc = sprintf("%u",crc32($data));
802 $crcOK = $crc == $datacrc;
803 $lenOK = $isize == strlen($data);
804 if (!$lenOK || !$crcOK) {
805 $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
806 return false;
807 }
808 return $data;
809 }
810} \ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
index ecd46d5f..c524a1ee 100644
--- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
@@ -1,79 +1,78 @@
1<?php 1<?php
2/** 2/**
3 * Humble HTTP Agent extension for SimplePie_File 3 * Humble HTTP Agent extension for SimplePie_File
4 * 4 *
5 * This class is designed to extend and override SimplePie_File 5 * This class is designed to extend and override SimplePie_File
6 * in order to prevent duplicate HTTP requests being sent out. 6 * in order to prevent duplicate HTTP requests being sent out.
7 * The idea is to initialise an instance of Humble HTTP Agent 7 * The idea is to initialise an instance of Humble HTTP Agent
8 * and attach it, to a static class variable, of this class. 8 * and attach it, to a static class variable, of this class.
9 * SimplePie will then automatically initialise this class 9 * SimplePie will then automatically initialise this class
10 * 10 *
11 * @date 2011-02-28 11 * @date 2011-02-28
12 */ 12 */
13 13
14class SimplePie_HumbleHttpAgent extends SimplePie_File 14class SimplePie_HumbleHttpAgent extends SimplePie_File
15{ 15{
16 protected static $agent; 16 protected static $agent;
17 var $url; 17 var $url;
18 var $useragent; 18 var $useragent;
19 var $success = true; 19 var $success = true;
20 var $headers = array(); 20 var $headers = array();
21 var $body; 21 var $body;
22 var $status_code; 22 var $status_code;
23 var $redirects = 0; 23 var $redirects = 0;
24 var $error; 24 var $error;
25 var $method = SIMPLEPIE_FILE_SOURCE_NONE; 25 var $method = SIMPLEPIE_FILE_SOURCE_NONE;
26 26
27 public static function set_agent(HumbleHttpAgent $agent) { 27 public static function set_agent(HumbleHttpAgent $agent) {
28 self::$agent = $agent; 28 self::$agent = $agent;
29 } 29 }
30 30
31 public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { 31 public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
32 if (class_exists('idna_convert')) 32 if (class_exists('idna_convert'))
33 { 33 {
34 $idn = new idna_convert(); 34 $idn = new idna_convert();
35 $parsed = SimplePie_Misc::parse_url($url); 35 $parsed = SimplePie_Misc::parse_url($url);
36 $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); 36 $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
37 } 37 }
38 $this->url = $url; 38 $this->url = $url;
39 $this->useragent = $useragent; 39 $this->useragent = $useragent;
40 if (preg_match('/^http(s)?:\/\//i', $url)) 40 if (preg_match('/^http(s)?:\/\//i', $url))
41 { 41 {
42 if (!is_array($headers)) 42 if (!is_array($headers))
43 { 43 {
44 $headers = array(); 44 $headers = array();
45 } 45 }
46 $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; 46 $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
47 $headers2 = array(); 47 $headers2 = array();
48 foreach ($headers as $key => $value) { 48 foreach ($headers as $key => $value) {
49 $headers2[] = "$key: $value"; 49 $headers2[] = "$key: $value";
50 } 50 }
51 //TODO: allow for HTTP headers 51 //TODO: allow for HTTP headers
52 // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); 52 // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);
53 53
54 $response = self::$agent->get($url); 54 $response = self::$agent->get($url);
55 55
56 if ($response === false || !isset($response['status_code'])) { 56 if ($response === false || !isset($response['status_code'])) {
57 $this->error = 'failed to fetch URL'; 57 $this->error = 'failed to fetch URL';
58 $this->success = false; 58 $this->success = false;
59 } else { 59 } else {
60 // The extra lines at the end are there to satisfy SimplePie's HTTP parser. 60 // The extra lines at the end are there to satisfy SimplePie's HTTP parser.
61 // The class expects a full HTTP message, whereas we're giving it only 61 // The class expects a full HTTP message, whereas we're giving it only
62 // headers - the new lines indicate the start of the body. 62 // headers - the new lines indicate the start of the body.
63 $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); 63 $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
64 if ($parser->parse()) { 64 if ($parser->parse()) {
65 $this->headers = $parser->headers; 65 $this->headers = $parser->headers;
66 //$this->body = $parser->body; 66 //$this->body = $parser->body;
67 $this->body = $response['body']; 67 $this->body = $response['body'];
68 $this->status_code = $parser->status_code; 68 $this->status_code = $parser->status_code;
69 } 69 }
70 } 70 }
71 } 71 }
72 else 72 else
73 { 73 {
74 $this->error = 'invalid URL'; 74 $this->error = 'invalid URL';
75 $this->success = false; 75 $this->success = false;
76 } 76 }
77 } 77 }
78} 78} \ No newline at end of file
79?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
index 09b11546..382d869c 100644
--- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
@@ -6,23 +6,24 @@
6 * Attempts to detect the language of a sample of text by correlating ranked 6 * Attempts to detect the language of a sample of text by correlating ranked
7 * 3-gram frequencies to a table of 3-gram frequencies of known languages. 7 * 3-gram frequencies to a table of 3-gram frequencies of known languages.
8 * 8 *
9 * Implements a version of a technique originally proposed by Cavnar & Trenkle 9 * Implements a version of a technique originally proposed by Cavnar & Trenkle
10 * (1994): "N-Gram-Based Text Categorization" 10 * (1994): "N-Gram-Based Text Categorization"
11 * 11 *
12 * PHP versions 4 and 5 12 * PHP version 5
13 * 13 *
14 * @category Text 14 * @category Text
15 * @package Text_LanguageDetect 15 * @package Text_LanguageDetect
16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
17 * @copyright 2005-2006 Nicholas Pisarro 17 * @copyright 2005-2006 Nicholas Pisarro
18 * @license http://www.debian.org/misc/bsd.license BSD 18 * @license http://www.debian.org/misc/bsd.license BSD
19 * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ 19 * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
20 * @link http://pear.php.net/package/Text_LanguageDetect/ 20 * @link http://pear.php.net/package/Text_LanguageDetect/
21 * @link http://langdetect.blogspot.com/ 21 * @link http://langdetect.blogspot.com/
22 */ 22 */
23 23
24//require_once 'PEAR.php'; 24require_once 'LanguageDetect/Exception.php';
25require_once 'Parser.php'; 25require_once 'LanguageDetect/Parser.php';
26require_once 'LanguageDetect/ISO639.php';
26 27
27/** 28/**
28 * Language detection class 29 * Language detection class
@@ -41,9 +42,10 @@ require_once 'Parser.php';
41 * 42 *
42 * echo "Supported languages:\n"; 43 * echo "Supported languages:\n";
43 * 44 *
44 * $langs = $l->getLanguages(); 45 * try {
45 * if (PEAR::isError($langs)) { 46 * $langs = $l->getLanguages();
46 * die($langs->getMessage()); 47 * } catch (Text_LanguageDetect_Exception $e) {
48 * die($e->getMessage());
47 * } 49 * }
48 * 50 *
49 * sort($langs); 51 * sort($langs);
@@ -54,38 +56,38 @@ require_once 'Parser.php';
54 * } 56 * }
55 * </code> 57 * </code>
56 * 58 *
57 * @category Text 59 * @category Text
58 * @package Text_LanguageDetect 60 * @package Text_LanguageDetect
59 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 61 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
60 * @copyright 2005 Nicholas Pisarro 62 * @copyright 2005 Nicholas Pisarro
61 * @license http://www.debian.org/misc/bsd.license BSD 63 * @license http://www.debian.org/misc/bsd.license BSD
62 * @version Release: @package_version@ 64 * @version Release: @package_version@
63 * @todo allow users to generate their own language models 65 * @link http://pear.php.net/package/Text_LanguageDetect/
66 * @todo allow users to generate their own language models
64 */ 67 */
65
66class Text_LanguageDetect 68class Text_LanguageDetect
67{ 69{
68 /** 70 /**
69 * The filename that stores the trigram data for the detector 71 * The filename that stores the trigram data for the detector
70 * 72 *
71 * If this value starts with a slash (/) or a dot (.) the value of 73 * If this value starts with a slash (/) or a dot (.) the value of
72 * $this->_data_dir will be ignored 74 * $this->_data_dir will be ignored
73 * 75 *
74 * @var string 76 * @var string
75 * @access private 77 * @access private
76 */ 78 */
77 var $_db_filename = './lang.dat'; 79 var $_db_filename = 'lang.dat';
78 80
79 /** 81 /**
80 * The filename that stores the unicode block definitions 82 * The filename that stores the unicode block definitions
81 * 83 *
82 * If this value starts with a slash (/) or a dot (.) the value of 84 * If this value starts with a slash (/) or a dot (.) the value of
83 * $this->_data_dir will be ignored 85 * $this->_data_dir will be ignored
84 * 86 *
85 * @var string 87 * @var string
86 * @access private 88 * @access private
87 */ 89 */
88 var $_unicode_db_filename = './unicode_blocks.dat'; 90 var $_unicode_db_filename = 'unicode_blocks.dat';
89 91
90 /** 92 /**
91 * The data directory 93 * The data directory
@@ -99,11 +101,8 @@ class Text_LanguageDetect
99 101
100 /** 102 /**
101 * The trigram data for comparison 103 * The trigram data for comparison
102 *
103 * Will be loaded on start from $this->_db_filename
104 * 104 *
105 * May be set to a PEAR_Error object if there is an error during its 105 * Will be loaded on start from $this->_db_filename
106 * initialization
107 * 106 *
108 * @var array 107 * @var array
109 * @access private 108 * @access private
@@ -120,7 +119,7 @@ class Text_LanguageDetect
120 119
121 /** 120 /**
122 * The size of the trigram data arrays 121 * The size of the trigram data arrays
123 * 122 *
124 * @var int 123 * @var int
125 * @access private 124 * @access private
126 */ 125 */
@@ -140,7 +139,7 @@ class Text_LanguageDetect
140 139
141 /** 140 /**
142 * Whether or not to simulate perl's Language::Guess exactly 141 * Whether or not to simulate perl's Language::Guess exactly
143 * 142 *
144 * @access private 143 * @access private
145 * @var bool 144 * @var bool
146 * @see setPerlCompatible() 145 * @see setPerlCompatible()
@@ -165,18 +164,24 @@ class Text_LanguageDetect
165 var $_clusters; 164 var $_clusters;
166 165
167 /** 166 /**
167 * Which type of "language names" are accepted and returned:
168 *
169 * 0 - language name ("english")
170 * 2 - 2-letter ISO 639-1 code ("en")
171 * 3 - 3-letter ISO 639-2 code ("eng")
172 */
173 var $_name_mode = 0;
174
175 /**
168 * Constructor 176 * Constructor
169 * 177 *
170 * Will attempt to load the language database. If it fails, you will get 178 * Will attempt to load the language database. If it fails, you will get
171 * a PEAR_Error object returned when you try to use detect() 179 * an exception.
172 *
173 */ 180 */
174 function Text_LanguageDetect($db=null, $unicode_db=null) 181 function __construct()
175 { 182 {
176 if (isset($db)) $this->_db_filename = $db;
177 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
178
179 $data = $this->_readdb($this->_db_filename); 183 $data = $this->_readdb($this->_db_filename);
184 $this->_checkTrigram($data['trigram']);
180 $this->_lang_db = $data['trigram']; 185 $this->_lang_db = $data['trigram'];
181 186
182 if (isset($data['trigram-unicodemap'])) { 187 if (isset($data['trigram-unicodemap'])) {
@@ -186,29 +191,32 @@ class Text_LanguageDetect
186 // Not yet implemented: 191 // Not yet implemented:
187 if (isset($data['trigram-clusters'])) { 192 if (isset($data['trigram-clusters'])) {
188 $this->_clusters = $data['trigram-clusters']; 193 $this->_clusters = $data['trigram-clusters'];
189 } 194 }
190 } 195 }
191 196
192 /** 197 /**
193 * Returns the path to the location of the database 198 * Returns the path to the location of the database
194 * 199 *
195 * @access private 200 * @param string $fname File name to load
196 * @return string expected path to the language model database 201 *
202 * @return string expected path to the language model database
203 * @access private
197 */ 204 */
198 function _get_data_loc($fname) 205 function _get_data_loc($fname)
199 { 206 {
200 return $fname; 207 return dirname(__FILE__).'/'.$fname;
201 } 208 }
202 209
203 /** 210 /**
204 * Loads the language trigram database from filename 211 * Loads the language trigram database from filename
205 * 212 *
206 * Trigram datbase should be a serialize()'d array 213 * Trigram datbase should be a serialize()'d array
207 * 214 *
208 * @access private 215 * @param string $fname the filename where the data is stored
209 * @param string $fname the filename where the data is stored 216 *
210 * @return array the language model data 217 * @return array the language model data
211 * @throws PEAR_Error 218 * @throws Text_LanguageDetect_Exception
219 * @access private
212 */ 220 */
213 function _readdb($fname) 221 function _readdb($fname)
214 { 222 {
@@ -217,79 +225,74 @@ class Text_LanguageDetect
217 225
218 // input check 226 // input check
219 if (!file_exists($fname)) { 227 if (!file_exists($fname)) {
220 throw new Exception('Language database does not exist.'); 228 throw new Text_LanguageDetect_Exception(
229 'Language database does not exist: ' . $fname,
230 Text_LanguageDetect_Exception::DB_NOT_FOUND
231 );
221 } elseif (!is_readable($fname)) { 232 } elseif (!is_readable($fname)) {
222 throw new Exception('Language database is not readable.'); 233 throw new Text_LanguageDetect_Exception(
234 'Language database is not readable: ' . $fname,
235 Text_LanguageDetect_Exception::DB_NOT_READABLE
236 );
223 } 237 }
224 238
225 if (function_exists('file_get_contents')) { 239 return unserialize(file_get_contents($fname));
226 return unserialize(file_get_contents($fname));
227 } else {
228 // if you don't have file_get_contents(),
229 // then this is the next fastest way
230 ob_start();
231 readfile($fname);
232 $contents = ob_get_contents();
233 ob_end_clean();
234 return unserialize($contents);
235 }
236 } 240 }
237 241
238 242
239 /** 243 /**
240 * Checks if this object is ready to detect languages 244 * Checks if this object is ready to detect languages
241 * 245 *
242 * @access private 246 * @param array $trigram Trigram data from database
243 * @param mixed &$err error object to be returned by reference, if any 247 *
244 * @return bool true if no errors 248 * @return void
249 * @access private
245 */ 250 */
246 function _setup_ok(&$err) 251 function _checkTrigram($trigram)
247 { 252 {
248 if (!is_array($this->_lang_db)) { 253 if (!is_array($trigram)) {
249 if (ini_get('magic_quotes_runtime')) { 254 if (ini_get('magic_quotes_runtime')) {
250 throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); 255 throw new Text_LanguageDetect_Exception(
251 } else { 256 'Error loading database. Try turning magic_quotes_runtime off.',
252 throw new Exception('Language database is not an array.'); 257 Text_LanguageDetect_Exception::MAGIC_QUOTES
258 );
253 } 259 }
254 return false; 260 throw new Text_LanguageDetect_Exception(
255 261 'Language database is not an array.',
256 } elseif (empty($this->_lang_db)) { 262 Text_LanguageDetect_Exception::DB_NOT_ARRAY
257 throw new Exception('Language database has no elements.'); 263 );
258 return false; 264 } elseif (empty($trigram)) {
259 265 throw new Text_LanguageDetect_Exception(
260 } else { 266 'Language database has no elements.',
261 return true; 267 Text_LanguageDetect_Exception::DB_EMPTY
268 );
262 } 269 }
263 } 270 }
264 271
265 /** 272 /**
266 * Omits languages 273 * Omits languages
267 * 274 *
268 * Pass this function the name of or an array of names of 275 * Pass this function the name of or an array of names of
269 * languages that you don't want considered 276 * languages that you don't want considered
270 * 277 *
271 * If you're only expecting a limited set of languages, this can greatly 278 * If you're only expecting a limited set of languages, this can greatly
272 * speed up processing 279 * speed up processing
273 * 280 *
274 * @access public 281 * @param mixed $omit_list language name or array of names to omit
275 * @param mixed $omit_list language name or array of names to omit 282 * @param bool $include_only if true will include (rather than
276 * @param bool $include_only if true will include (rather than 283 * exclude) only those in the list
277 * exclude) only those in the list 284 *
278 * @return int number of languages successfully deleted 285 * @return int number of languages successfully deleted
279 * @throws PEAR_Error 286 * @throws Text_LanguageDetect_Exception
280 */ 287 */
281 function omitLanguages($omit_list, $include_only = false) 288 public function omitLanguages($omit_list, $include_only = false)
282 { 289 {
283
284 // setup check
285 if (!$this->_setup_ok($err)) {
286 return $err;
287 }
288
289 $deleted = 0; 290 $deleted = 0;
290 291
291 // deleting the given languages 292 $omit_list = $this->_convertFromNameMode($omit_list);
293
292 if (!$include_only) { 294 if (!$include_only) {
295 // deleting the given languages
293 if (!is_array($omit_list)) { 296 if (!is_array($omit_list)) {
294 $omit_list = strtolower($omit_list); // case desensitize 297 $omit_list = strtolower($omit_list); // case desensitize
295 if (isset($this->_lang_db[$omit_list])) { 298 if (isset($this->_lang_db[$omit_list])) {
@@ -301,12 +304,12 @@ class Text_LanguageDetect
301 if (isset($this->_lang_db[$omit_lang])) { 304 if (isset($this->_lang_db[$omit_lang])) {
302 unset($this->_lang_db[$omit_lang]); 305 unset($this->_lang_db[$omit_lang]);
303 $deleted++; 306 $deleted++;
304 } 307 }
305 } 308 }
306 } 309 }
307 310
308 // deleting all except the given languages
309 } else { 311 } else {
312 // deleting all except the given languages
310 if (!is_array($omit_list)) { 313 if (!is_array($omit_list)) {
311 $omit_list = array($omit_list); 314 $omit_list = array($omit_list);
312 } 315 }
@@ -327,7 +330,7 @@ class Text_LanguageDetect
327 // reset the cluster cache if the number of languages changes 330 // reset the cluster cache if the number of languages changes
328 // this will then have to be recalculated 331 // this will then have to be recalculated
329 if (isset($this->_clusters) && $deleted > 0) { 332 if (isset($this->_clusters) && $deleted > 0) {
330 unset($this->_clusters); 333 $this->_clusters = null;
331 } 334 }
332 335
333 return $deleted; 336 return $deleted;
@@ -339,49 +342,40 @@ class Text_LanguageDetect
339 * 342 *
340 * @access public 343 * @access public
341 * @return int the number of languages 344 * @return int the number of languages
342 * @throws PEAR_Error 345 * @throws Text_LanguageDetect_Exception
343 */ 346 */
344 function getLanguageCount() 347 function getLanguageCount()
345 { 348 {
346 if (!$this->_setup_ok($err)) { 349 return count($this->_lang_db);
347 return $err;
348 } else {
349 return count($this->_lang_db);
350 }
351 } 350 }
352 351
353 /** 352 /**
354 * Returns true if a given language exists 353 * Checks if the language with the given name exists in the database
355 * 354 *
356 * If passed an array of names, will return true only if all exist 355 * @param mixed $lang Language name or array of language names
357 * 356 *
358 * @access public 357 * @return bool true if language model exists
359 * @param mixed $lang language name or array of language names
360 * @return bool true if language model exists
361 * @throws PEAR_Error
362 */ 358 */
363 function languageExists($lang) 359 public function languageExists($lang)
364 { 360 {
365 if (!$this->_setup_ok($err)) { 361 $lang = $this->_convertFromNameMode($lang);
366 return $err;
367 } else {
368 // string
369 if (is_string($lang)) {
370 return isset($this->_lang_db[strtolower($lang)]);
371
372 // array
373 } elseif (is_array($lang)) {
374 foreach ($lang as $test_lang) {
375 if (!isset($this->_lang_db[strtolower($test_lang)])) {
376 return false;
377 }
378 }
379 return true;
380 362
381 // other (error) 363 if (is_string($lang)) {
382 } else { 364 return isset($this->_lang_db[strtolower($lang)]);
383 throw new Exception('Unknown type passed to languageExists()'); 365
366 } elseif (is_array($lang)) {
367 foreach ($lang as $test_lang) {
368 if (!isset($this->_lang_db[strtolower($test_lang)])) {
369 return false;
370 }
384 } 371 }
372 return true;
373
374 } else {
375 throw new Text_LanguageDetect_Exception(
376 'Unsupported parameter type passed to languageExists()',
377 Text_LanguageDetect_Exception::PARAM_TYPE
378 );
385 } 379 }
386 } 380 }
387 381
@@ -389,25 +383,24 @@ class Text_LanguageDetect
389 * Returns the list of detectable languages 383 * Returns the list of detectable languages
390 * 384 *
391 * @access public 385 * @access public
392 * @return array the names of the languages known to this object 386 * @return array the names of the languages known to this object<<<<<<<
393 * @throws PEAR_Error 387 * @throws Text_LanguageDetect_Exception
394 */ 388 */
395 function getLanguages() 389 function getLanguages()
396 { 390 {
397 if (!$this->_setup_ok($err)) { 391 return $this->_convertToNameMode(
398 return $err; 392 array_keys($this->_lang_db)
399 } else { 393 );
400 return array_keys($this->_lang_db);
401 }
402 } 394 }
403 395
404 /** 396 /**
405 * Make this object behave like Language::Guess 397 * Make this object behave like Language::Guess
406 * 398 *
407 * @access public 399 * @param bool $setting false to turn off perl compatibility
408 * @param bool $setting false to turn off perl compatibility 400 *
401 * @return void
409 */ 402 */
410 function setPerlCompatible($setting = true) 403 public function setPerlCompatible($setting = true)
411 { 404 {
412 if (is_bool($setting)) { // input check 405 if (is_bool($setting)) { // input check
413 $this->_perl_compatible = $setting; 406 $this->_perl_compatible = $setting;
@@ -422,6 +415,21 @@ class Text_LanguageDetect
422 } 415 }
423 416
424 /** 417 /**
418 * Sets the way how language names are accepted and returned.
419 *
420 * @param integer $name_mode One of the following modes:
421 * 0 - language name ("english")
422 * 2 - 2-letter ISO 639-1 code ("en")
423 * 3 - 3-letter ISO 639-2 code ("eng")
424 *
425 * @return void
426 */
427 function setNameMode($name_mode)
428 {
429 $this->_name_mode = $name_mode;
430 }
431
432 /**
425 * Whether to use unicode block ranges in detection 433 * Whether to use unicode block ranges in detection
426 * 434 *
427 * Should speed up most detections if turned on (detault is on). In some 435 * Should speed up most detections if turned on (detault is on). In some
@@ -429,10 +437,11 @@ class Text_LanguageDetect
429 * in languages that use latin scripts. In other cases it should speed up 437 * in languages that use latin scripts. In other cases it should speed up
430 * detection noticeably. 438 * detection noticeably.
431 * 439 *
432 * @access public 440 * @param bool $setting false to turn off
433 * @param bool $setting false to turn off 441 *
442 * @return void
434 */ 443 */
435 function useUnicodeBlocks($setting = true) 444 public function useUnicodeBlocks($setting = true)
436 { 445 {
437 if (is_bool($setting)) { 446 if (is_bool($setting)) {
438 $this->_use_unicode_narrowing = $setting; 447 $this->_use_unicode_narrowing = $setting;
@@ -442,15 +451,15 @@ class Text_LanguageDetect
442 /** 451 /**
443 * Converts a piece of text into trigrams 452 * Converts a piece of text into trigrams
444 * 453 *
445 * Superceded by the Text_LanguageDetect_Parser class 454 * @param string $text text to convert
446 * 455 *
447 * @access private 456 * @return array array of trigram frequencies
448 * @param string $text text to convert 457 * @access private
449 * @return array array of trigram frequencies 458 * @deprecated Superceded by the Text_LanguageDetect_Parser class
450 */ 459 */
451 function _trigram($text) 460 function _trigram($text)
452 { 461 {
453 $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); 462 $s = new Text_LanguageDetect_Parser($text);
454 $s->prepareTrigram(); 463 $s->prepareTrigram();
455 $s->prepareUnicode(false); 464 $s->prepareUnicode(false);
456 $s->setPadStart(!$this->_perl_compatible); 465 $s->setPadStart(!$this->_perl_compatible);
@@ -463,11 +472,12 @@ class Text_LanguageDetect
463 * 472 *
464 * Thresholds (cuts off) the list at $this->_threshold 473 * Thresholds (cuts off) the list at $this->_threshold
465 * 474 *
466 * @access protected 475 * @param array $arr array of trigram
467 * @param array $arr array of trgram 476 *
468 * @return array ranks of trigrams 477 * @return array ranks of trigrams
478 * @access protected
469 */ 479 */
470 function _arr_rank(&$arr) 480 function _arr_rank($arr)
471 { 481 {
472 482
473 // sorts alphabetically first as a standard way of breaking rank ties 483 // sorts alphabetically first as a standard way of breaking rank ties
@@ -494,14 +504,17 @@ class Text_LanguageDetect
494 504
495 /** 505 /**
496 * Sorts an array by value breaking ties alphabetically 506 * Sorts an array by value breaking ties alphabetically
497 * 507 *
498 * @access private 508 * @param array &$arr the array to sort
499 * @param array &$arr the array to sort 509 *
510 * @return void
511 * @access private
500 */ 512 */
501 function _bub_sort(&$arr) 513 function _bub_sort(&$arr)
502 { 514 {
503 // should do the same as this perl statement: 515 // should do the same as this perl statement:
504 // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } 516 // sort { $trigrams{$b} == $trigrams{$a}
517 // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
505 518
506 // needs to sort by both key and value at once 519 // needs to sort by both key and value at once
507 // using the key to break ties for the value 520 // using the key to break ties for the value
@@ -528,13 +541,14 @@ class Text_LanguageDetect
528 /** 541 /**
529 * Sort function used by bubble sort 542 * Sort function used by bubble sort
530 * 543 *
531 * Callback function for usort(). 544 * Callback function for usort().
532 * 545 *
533 * @access private 546 * @param array $a first param passed by usort()
534 * @param array first param passed by usort() 547 * @param array $b second param passed by usort()
535 * @param array second param passed by usort() 548 *
536 * @return int 1 if $a is greater, -1 if not 549 * @return int 1 if $a is greater, -1 if not
537 * @see _bub_sort() 550 * @see _bub_sort()
551 * @access private
538 */ 552 */
539 function _sort_func($a, $b) 553 function _sort_func($a, $b)
540 { 554 {
@@ -542,12 +556,12 @@ class Text_LanguageDetect
542 list($a_key, $a_value) = $a; 556 list($a_key, $a_value) = $a;
543 list($b_key, $b_value) = $b; 557 list($b_key, $b_value) = $b;
544 558
545 // if the values are the same, break ties using the key
546 if ($a_value == $b_value) { 559 if ($a_value == $b_value) {
560 // if the values are the same, break ties using the key
547 return strcmp($a_key, $b_key); 561 return strcmp($a_key, $b_key);
548 562
549 // if not, just sort normally
550 } else { 563 } else {
564 // if not, just sort normally
551 if ($a_value > $b_value) { 565 if ($a_value > $b_value) {
552 return -1; 566 return -1;
553 } else { 567 } else {
@@ -559,23 +573,24 @@ class Text_LanguageDetect
559 } 573 }
560 574
561 /** 575 /**
562 * Calculates a linear rank-order distance statistic between two sets of 576 * Calculates a linear rank-order distance statistic between two sets of
563 * ranked trigrams 577 * ranked trigrams
564 * 578 *
565 * Sums the differences in rank for each trigram. If the trigram does not 579 * Sums the differences in rank for each trigram. If the trigram does not
566 * appear in both, consider it a difference of $this->_threshold. 580 * appear in both, consider it a difference of $this->_threshold.
567 * 581 *
568 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite 582 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
569 * its simplicity it has been shown to be highly accurate for language 583 * its simplicity it has been shown to be highly accurate for language
570 * identification tasks. 584 * identification tasks.
571 * 585 *
572 * @access private 586 * @param array $arr1 the reference set of trigram ranks
573 * @param array $arr1 the reference set of trigram ranks 587 * @param array $arr2 the target set of trigram ranks
574 * @param array $arr2 the target set of trigram ranks 588 *
575 * @return int the sum of the differences between the ranks of 589 * @return int the sum of the differences between the ranks of
576 * the two trigram sets 590 * the two trigram sets
591 * @access private
577 */ 592 */
578 function _distance(&$arr1, &$arr2) 593 function _distance($arr1, $arr2)
579 { 594 {
580 $sumdist = 0; 595 $sumdist = 0;
581 596
@@ -598,14 +613,15 @@ class Text_LanguageDetect
598 613
599 /** 614 /**
600 * Normalizes the score returned by _distance() 615 * Normalizes the score returned by _distance()
601 * 616 *
602 * Different if perl compatible or not 617 * Different if perl compatible or not
603 * 618 *
604 * @access private 619 * @param int $score the score from _distance()
605 * @param int $score the score from _distance() 620 * @param int $base_count the number of trigrams being considered
606 * @param int $base_count the number of trigrams being considered 621 *
607 * @return float the normalized score 622 * @return float the normalized score
608 * @see _distance() 623 * @see _distance()
624 * @access private
609 */ 625 */
610 function _normalize_score($score, $base_count = null) 626 function _normalize_score($score, $base_count = null)
611 { 627 {
@@ -630,29 +646,24 @@ class Text_LanguageDetect
630 * 646 *
631 * If perl compatible, the score is 300-0, 0 being most similar. 647 * If perl compatible, the score is 300-0, 0 being most similar.
632 * Otherwise, it's 0-1 with 1 being most similar. 648 * Otherwise, it's 0-1 with 1 being most similar.
633 * 649 *
634 * The $sample text should be at least a few sentences in length; 650 * The $sample text should be at least a few sentences in length;
635 * should be ascii-7 or utf8 encoded, if another and the mbstring extension 651 * should be ascii-7 or utf8 encoded, if another and the mbstring extension
636 * is present it will try to detect and convert. However, experience has 652 * is present it will try to detect and convert. However, experience has
637 * shown that mb_detect_encoding() *does not work very well* with at least 653 * shown that mb_detect_encoding() *does not work very well* with at least
638 * some types of encoding. 654 * some types of encoding.
639 * 655 *
640 * @access public 656 * @param string $sample a sample of text to compare.
641 * @param string $sample a sample of text to compare. 657 * @param int $limit if specified, return an array of the most likely
642 * @param int $limit if specified, return an array of the most likely 658 * $limit languages and their scores.
643 * $limit languages and their scores. 659 *
644 * @return mixed sorted array of language scores, blank array if no 660 * @return mixed sorted array of language scores, blank array if no
645 * useable text was found, or PEAR_Error if error 661 * useable text was found
646 * with the object setup 662 * @see _distance()
647 * @see _distance() 663 * @throws Text_LanguageDetect_Exception
648 * @throws PEAR_Error
649 */ 664 */
650 function detect($sample, $limit = 0) 665 public function detect($sample, $limit = 0)
651 { 666 {
652 if (!$this->_setup_ok($err)) {
653 return $err;
654 }
655
656 // input check 667 // input check
657 if (!Text_LanguageDetect_Parser::validateString($sample)) { 668 if (!Text_LanguageDetect_Parser::validateString($sample)) {
658 return array(); 669 return array();
@@ -660,36 +671,27 @@ class Text_LanguageDetect
660 671
661 // check char encoding 672 // check char encoding
662 // (only if mbstring extension is compiled and PHP > 4.0.6) 673 // (only if mbstring extension is compiled and PHP > 4.0.6)
663 if (function_exists('mb_detect_encoding') 674 if (function_exists('mb_detect_encoding')
664 && function_exists('mb_convert_encoding')) { 675 && function_exists('mb_convert_encoding')
665 676 ) {
666 // mb_detect_encoding isn't very reliable, to say the least 677 // mb_detect_encoding isn't very reliable, to say the least
667 // detection should still work with a sufficient sample of ascii characters 678 // detection should still work with a sufficient sample
679 // of ascii characters
668 $encoding = mb_detect_encoding($sample); 680 $encoding = mb_detect_encoding($sample);
669 681
670 // mb_detect_encoding() will return FALSE if detection fails 682 // mb_detect_encoding() will return FALSE if detection fails
671 // don't attempt conversion if that's the case 683 // don't attempt conversion if that's the case
672 if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { 684 if ($encoding != 'ASCII' && $encoding != 'UTF-8'
673 685 && $encoding !== false
674 if (function_exists('mb_list_encodings')) { 686 ) {
675 687 // verify the encoding exists in mb_list_encodings
676 // verify the encoding exists in mb_list_encodings 688 if (in_array($encoding, mb_list_encodings())) {
677 if (in_array($encoding, mb_list_encodings())) { 689 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
678 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
679 }
680
681 // if the previous condition failed:
682 // somehow we detected an encoding that also we don't support
683
684 } else {
685 // php 4 doesnt have mb_list_encodings()
686 // so attempt with error suppression
687 $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);
688 } 690 }
689 } 691 }
690 } 692 }
691 693
692 $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); 694 $sample_obj = new Text_LanguageDetect_Parser($sample);
693 $sample_obj->prepareTrigram(); 695 $sample_obj->prepareTrigram();
694 if ($this->_use_unicode_narrowing) { 696 if ($this->_use_unicode_narrowing) {
695 $sample_obj->prepareUnicode(); 697 $sample_obj->prepareUnicode();
@@ -713,7 +715,10 @@ class Text_LanguageDetect
713 if (is_array($blocks)) { 715 if (is_array($blocks)) {
714 $present_blocks = array_keys($blocks); 716 $present_blocks = array_keys($blocks);
715 } else { 717 } else {
716 throw new Exception('Error during block detection'); 718 throw new Text_LanguageDetect_Exception(
719 'Error during block detection',
720 Text_LanguageDetect_Exception::BLOCK_DETECTION
721 );
717 } 722 }
718 723
719 $possible_langs = array(); 724 $possible_langs = array();
@@ -731,30 +736,30 @@ class Text_LanguageDetect
731 } 736 }
732 737
733 // could also try an intersect operation rather than a union 738 // could also try an intersect operation rather than a union
734 // in other words, choose languages whose trigrams contain 739 // in other words, choose languages whose trigrams contain
735 // ALL of the unicode blocks found in this sample 740 // ALL of the unicode blocks found in this sample
736 // would improve speed but would be completely thrown off by an 741 // would improve speed but would be completely thrown off by an
737 // unexpected character, like an umlaut appearing in english text 742 // unexpected character, like an umlaut appearing in english text
738 743
739 $possible_langs = array_intersect( 744 $possible_langs = array_intersect(
740 array_keys($this->_lang_db), 745 array_keys($this->_lang_db),
741 array_unique($possible_langs) 746 array_unique($possible_langs)
742 ); 747 );
743 748
744 // needs to intersect it with the keys of _lang_db in case 749 // needs to intersect it with the keys of _lang_db in case
745 // languages have been omitted 750 // languages have been omitted
746 751
747 // or just try 'em all
748 } else { 752 } else {
753 // or just try 'em all
749 $possible_langs = array_keys($this->_lang_db); 754 $possible_langs = array_keys($this->_lang_db);
750 } 755 }
751 756
752 757
753 foreach ($possible_langs as $lang) { 758 foreach ($possible_langs as $lang) {
754 $scores[$lang] = 759 $scores[$lang] = $this->_normalize_score(
755 $this->_normalize_score( 760 $this->_distance($this->_lang_db[$lang], $trigram_freqs),
756 $this->_distance($this->_lang_db[$lang], $trigram_freqs), 761 $trigram_count
757 $trigram_count); 762 );
758 } 763 }
759 764
760 unset($sample_obj); 765 unset($sample_obj);
@@ -772,7 +777,6 @@ class Text_LanguageDetect
772 $limited_scores = array(); 777 $limited_scores = array();
773 778
774 $i = 0; 779 $i = 0;
775
776 foreach ($scores as $key => $value) { 780 foreach ($scores as $key => $value) {
777 if ($i++ >= $limit) { 781 if ($i++ >= $limit) {
778 break; 782 break;
@@ -781,9 +785,9 @@ class Text_LanguageDetect
781 $limited_scores[$key] = $value; 785 $limited_scores[$key] = $value;
782 } 786 }
783 787
784 return $limited_scores; 788 return $this->_convertToNameMode($limited_scores, true);
785 } else { 789 } else {
786 return $scores; 790 return $this->_convertToNameMode($scores, true);
787 } 791 }
788 } 792 }
789 793
@@ -791,35 +795,33 @@ class Text_LanguageDetect
791 * Returns only the most similar language to the text sample 795 * Returns only the most similar language to the text sample
792 * 796 *
793 * Calls $this->detect() and returns only the top result 797 * Calls $this->detect() and returns only the top result
794 * 798 *
795 * @access public 799 * @param string $sample text to detect the language of
796 * @param string $sample text to detect the language of 800 *
797 * @return string the name of the most likely language 801 * @return string the name of the most likely language
798 * or null if no language is similar 802 * or null if no language is similar
799 * @see detect() 803 * @see detect()
800 * @throws PEAR_Error 804 * @throws Text_LanguageDetect_Exception
801 */ 805 */
802 function detectSimple($sample) 806 public function detectSimple($sample)
803 { 807 {
804 $scores = $this->detect($sample, 1); 808 $scores = $this->detect($sample, 1);
805 809
806 // if top language has the maximum possible score, 810 // if top language has the maximum possible score,
807 // then the top score will have been picked at random 811 // then the top score will have been picked at random
808 if ( !is_array($scores) 812 if (!is_array($scores) || empty($scores)
809 || empty($scores) 813 || current($scores) == $this->_max_score
810 || current($scores) == $this->_max_score) { 814 ) {
811
812 return null; 815 return null;
813
814 } else { 816 } else {
815 return ucfirst(key($scores)); 817 return key($scores);
816 } 818 }
817 } 819 }
818 820
819 /** 821 /**
820 * Returns an array containing the most similar language and a confidence 822 * Returns an array containing the most similar language and a confidence
821 * rating 823 * rating
822 * 824 *
823 * Confidence is a simple measure calculated from the similarity score 825 * Confidence is a simple measure calculated from the similarity score
824 * minus the similarity score from the next most similar language 826 * minus the similarity score from the next most similar language
825 * divided by the highest possible score. Languages that have closely 827 * divided by the highest possible score. Languages that have closely
@@ -827,46 +829,43 @@ class Text_LanguageDetect
827 * confidence scores. 829 * confidence scores.
828 * 830 *
829 * The similarity score answers the question "How likely is the text the 831 * The similarity score answers the question "How likely is the text the
830 * returned language regardless of the other languages considered?" The 832 * returned language regardless of the other languages considered?" The
831 * confidence score is one way of answering the question "how likely is the 833 * confidence score is one way of answering the question "how likely is the
832 * text the detected language relative to the rest of the language model 834 * text the detected language relative to the rest of the language model
833 * set?" 835 * set?"
834 * 836 *
835 * To see how similar languages are a priori, see languageSimilarity() 837 * To see how similar languages are a priori, see languageSimilarity()
836 * 838 *
837 * @access public 839 * @param string $sample text for which language will be detected
838 * @param string $sample text for which language will be detected 840 *
839 * @return array most similar language, score and confidence rating 841 * @return array most similar language, score and confidence rating
840 * or null if no language is similar 842 * or null if no language is similar
841 * @see detect() 843 * @see detect()
842 * @throws PEAR_Error 844 * @throws Text_LanguageDetect_Exception
843 */ 845 */
844 function detectConfidence($sample) 846 public function detectConfidence($sample)
845 { 847 {
846 $scores = $this->detect($sample, 2); 848 $scores = $this->detect($sample, 2);
847 849
848 // if most similar language has the max score, it 850 // if most similar language has the max score, it
849 // will have been picked at random 851 // will have been picked at random
850 if ( !is_array($scores) 852 if (!is_array($scores) || empty($scores)
851 || empty($scores) 853 || current($scores) == $this->_max_score
852 || current($scores) == $this->_max_score) { 854 ) {
853
854 return null; 855 return null;
855 } 856 }
856 857
857 $arr['language'] = ucfirst(key($scores)); 858 $arr['language'] = key($scores);
858 $arr['similarity'] = current($scores); 859 $arr['similarity'] = current($scores);
859 if (next($scores) !== false) { // if false then no next element 860 if (next($scores) !== false) { // if false then no next element
860 // the goal is to return a higher value if the distance between 861 // the goal is to return a higher value if the distance between
861 // the similarity of the first score and the second score is high 862 // the similarity of the first score and the second score is high
862 863
863 if ($this->_perl_compatible) { 864 if ($this->_perl_compatible) {
864 865 $arr['confidence'] = (current($scores) - $arr['similarity'])
865 $arr['confidence'] = 866 / $this->_max_score;
866 (current($scores) - $arr['similarity']) / $this->_max_score;
867 867
868 } else { 868 } else {
869
870 $arr['confidence'] = $arr['similarity'] - current($scores); 869 $arr['confidence'] = $arr['similarity'] - current($scores);
871 870
872 } 871 }
@@ -882,32 +881,26 @@ class Text_LanguageDetect
882 * Returns the distribution of unicode blocks in a given utf8 string 881 * Returns the distribution of unicode blocks in a given utf8 string
883 * 882 *
884 * For the block name of a single char, use unicodeBlockName() 883 * For the block name of a single char, use unicodeBlockName()
885 * 884 *
886 * @access public 885 * @param string $str input string. Must be ascii or utf8
887 * @param string $str input string. Must be ascii or utf8 886 * @param bool $skip_symbols if true, skip ascii digits, symbols and
888 * @param bool $skip_symbols if true, skip ascii digits, symbols and 887 * non-printing characters. Includes spaces,
889 * non-printing characters. Includes spaces, 888 * newlines and common punctutation characters.
890 * newlines and common punctutation characters. 889 *
891 * @return array 890 * @return array
892 * @throws PEAR_Error 891 * @throws Text_LanguageDetect_Exception
893 */ 892 */
894 function detectUnicodeBlocks($str, $skip_symbols) 893 public function detectUnicodeBlocks($str, $skip_symbols)
895 { 894 {
896 // input check 895 $skip_symbols = (bool)$skip_symbols;
897 if (!is_bool($skip_symbols)) { 896 $str = (string)$str;
898 throw new Exception('Second parameter must be boolean');
899 }
900
901 if (!is_string($str)) {
902 throw new Exception('First parameter was not a string');
903 }
904 897
905 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 898 $sample_obj = new Text_LanguageDetect_Parser($str);
906 $sample_obj->prepareUnicode(); 899 $sample_obj->prepareUnicode();
907 $sample_obj->prepareTrigram(false); 900 $sample_obj->prepareTrigram(false);
908 $sample_obj->setUnicodeSkipSymbols($skip_symbols); 901 $sample_obj->setUnicodeSkipSymbols($skip_symbols);
909 $sample_obj->analyze(); 902 $sample_obj->analyze();
910 $blocks =& $sample_obj->getUnicodeBlocks(); 903 $blocks = $sample_obj->getUnicodeBlocks();
911 unset($sample_obj); 904 unset($sample_obj);
912 return $blocks; 905 return $blocks;
913 } 906 }
@@ -915,38 +908,37 @@ class Text_LanguageDetect
915 /** 908 /**
916 * Returns the block name for a given unicode value 909 * Returns the block name for a given unicode value
917 * 910 *
918 * If passed a string, will assume it is being passed a UTF8-formatted 911 * If passed a string, will assume it is being passed a UTF8-formatted
919 * character and will automatically convert. Otherwise it will assume it 912 * character and will automatically convert. Otherwise it will assume it
920 * is being passed a numeric unicode value. 913 * is being passed a numeric unicode value.
921 * 914 *
922 * Make sure input is of the correct type! 915 * Make sure input is of the correct type!
923 * 916 *
924 * @access public
925 * @param mixed $unicode unicode value or utf8 char 917 * @param mixed $unicode unicode value or utf8 char
918 *
926 * @return mixed the block name string or false if not found 919 * @return mixed the block name string or false if not found
927 * @throws PEAR_Error 920 * @throws Text_LanguageDetect_Exception
928 */ 921 */
929 function unicodeBlockName($unicode) { 922 public function unicodeBlockName($unicode)
923 {
930 if (is_string($unicode)) { 924 if (is_string($unicode)) {
931 // assume it is being passed a utf8 char, so convert it 925 // assume it is being passed a utf8 char, so convert it
932 926 if (self::utf8strlen($unicode) > 1) {
933 // input check 927 throw new Text_LanguageDetect_Exception(
934 if ($this->utf8strlen($unicode) > 1) { 928 'Pass a single char only to this method',
935 throw new Exception('Pass this function only a single char'); 929 Text_LanguageDetect_Exception::PARAM_TYPE
930 );
936 } 931 }
937
938 $unicode = $this->_utf8char2unicode($unicode); 932 $unicode = $this->_utf8char2unicode($unicode);
939 933
940 if ($unicode == -1) {
941 throw new Exception('Malformatted char');
942 }
943
944 // input check
945 } elseif (!is_int($unicode)) { 934 } elseif (!is_int($unicode)) {
946 throw new Exception('Input must be of type string or int.'); 935 throw new Text_LanguageDetect_Exception(
936 'Input must be of type string or int.',
937 Text_LanguageDetect_Exception::PARAM_TYPE
938 );
947 } 939 }
948 940
949 $blocks =& $this->_read_unicode_block_db(); 941 $blocks = $this->_read_unicode_block_db();
950 942
951 $result = $this->_unicode_block_name($unicode, $blocks); 943 $result = $this->_unicode_block_name($unicode, $blocks);
952 944
@@ -964,14 +956,17 @@ class Text_LanguageDetect
964 * the public interface for this function, which does input checks which 956 * the public interface for this function, which does input checks which
965 * this function omits for speed. 957 * this function omits for speed.
966 * 958 *
967 * @access protected 959 * @param int $unicode the unicode value
968 * @param int $unicode the unicode value 960 * @param array $blocks the block database
969 * @param array &$blocks the block database 961 * @param int $block_count the number of defined blocks in the database
970 * @param int $block_count the number of defined blocks in the database 962 *
971 * @see unicodeBlockName() 963 * @return mixed Block name, -1 if it failed
964 * @see unicodeBlockName()
965 * @access protected
972 */ 966 */
973 function _unicode_block_name($unicode, &$blocks, $block_count = -1) { 967 function _unicode_block_name($unicode, $blocks, $block_count = -1)
974 // for a reference, see 968 {
969 // for a reference, see
975 // http://www.unicode.org/Public/UNIDATA/Blocks.txt 970 // http://www.unicode.org/Public/UNIDATA/Blocks.txt
976 971
977 // assume that ascii characters are the most common 972 // assume that ascii characters are the most common
@@ -994,35 +989,36 @@ class Text_LanguageDetect
994 while ($low <= $high) { 989 while ($low <= $high) {
995 $mid = floor(($low + $high) / 2); 990 $mid = floor(($low + $high) / 2);
996 991
997 // if it's lower than the lower bound
998 if ($unicode < $blocks[$mid][0]) { 992 if ($unicode < $blocks[$mid][0]) {
993 // if it's lower than the lower bound
999 $high = $mid - 1; 994 $high = $mid - 1;
1000 995
1001 // if it's higher than the upper bound
1002 } elseif ($unicode > $blocks[$mid][1]) { 996 } elseif ($unicode > $blocks[$mid][1]) {
997 // if it's higher than the upper bound
1003 $low = $mid + 1; 998 $low = $mid + 1;
1004 999
1005 // found it
1006 } else { 1000 } else {
1001 // found it
1007 return $blocks[$mid]; 1002 return $blocks[$mid];
1008 } 1003 }
1009 } 1004 }
1010 1005
1011 // failed to find the block 1006 // failed to find the block
1012 return -1; 1007 return -1;
1013 1008
1014 // todo: differentiate when it's out of range or when it falls 1009 // todo: differentiate when it's out of range or when it falls
1015 // into an unassigned range? 1010 // into an unassigned range?
1016 } 1011 }
1017 1012
1018 /** 1013 /**
1019 * Brings up the unicode block database 1014 * Brings up the unicode block database
1020 * 1015 *
1021 * @access protected
1022 * @return array the database of unicode block definitions 1016 * @return array the database of unicode block definitions
1023 * @throws PEAR_Error 1017 * @throws Text_LanguageDetect_Exception
1018 * @access protected
1024 */ 1019 */
1025 function &_read_unicode_block_db() { 1020 function _read_unicode_block_db()
1021 {
1026 // since the unicode definitions are always going to be the same, 1022 // since the unicode definitions are always going to be the same,
1027 // might as well share the memory for the db with all other instances 1023 // might as well share the memory for the db with all other instances
1028 // of this class 1024 // of this class
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect
1037 1033
1038 /** 1034 /**
1039 * Calculate the similarities between the language models 1035 * Calculate the similarities between the language models
1040 * 1036 *
1041 * Use this function to see how similar languages are to each other. 1037 * Use this function to see how similar languages are to each other.
1042 * 1038 *
1043 * If passed 2 language names, will return just those languages compared. 1039 * If passed 2 language names, will return just those languages compared.
1044 * If passed 1 language name, will return that language compared to 1040 * If passed 1 language name, will return that language compared to
1045 * all others. 1041 * all others.
1046 * If passed none, will return an array of every language model compared 1042 * If passed none, will return an array of every language model compared
1047 * to every other one. 1043 * to every other one.
1048 * 1044 *
1049 * @access public 1045 * @param string $lang1 the name of the first language to be compared
1050 * @param string $lang1 the name of the first language to be compared 1046 * @param string $lang2 the name of the second language to be compared
1051 * @param string $lang2 the name of the second language to be compared 1047 *
1052 * @return array scores of every language compared 1048 * @return array scores of every language compared
1053 * or the score of just the provided languages 1049 * or the score of just the provided languages
1054 * or null if one of the supplied languages does not exist 1050 * or null if one of the supplied languages does not exist
1055 * @throws PEAR_Error 1051 * @throws Text_LanguageDetect_Exception
1056 */ 1052 */
1057 function languageSimilarity($lang1 = null, $lang2 = null) 1053 public function languageSimilarity($lang1 = null, $lang2 = null)
1058 { 1054 {
1059 if (!$this->_setup_ok($err)) { 1055 $lang1 = $this->_convertFromNameMode($lang1);
1060 return $err; 1056 $lang2 = $this->_convertFromNameMode($lang2);
1061 }
1062
1063 if ($lang1 != null) { 1057 if ($lang1 != null) {
1064 $lang1 = strtolower($lang1); 1058 $lang1 = strtolower($lang1);
1065 1059
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect
1069 } 1063 }
1070 1064
1071 if ($lang2 != null) { 1065 if ($lang2 != null) {
1072 1066 if (!isset($this->_lang_db[$lang2])) {
1073 // can't only set the second param 1067 // check if language model exists
1074 if ($lang1 == null) {
1075 return null;
1076 // check if language model exists
1077 } elseif (!isset($this->_lang_db[$lang2])) {
1078 return null; 1068 return null;
1079 } 1069 }
1080 1070
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect
1088 ) 1078 )
1089 ); 1079 );
1090 1080
1091
1092 // compare just $lang1 to all languages
1093 } else { 1081 } else {
1082 // compare just $lang1 to all languages
1094 $return_arr = array(); 1083 $return_arr = array();
1095 foreach ($this->_lang_db as $key => $value) { 1084 foreach ($this->_lang_db as $key => $value) {
1096 if ($key != $lang1) { // don't compare a language to itself 1085 if ($key != $lang1) {
1086 // don't compare a language to itself
1097 $return_arr[$key] = $this->_normalize_score( 1087 $return_arr[$key] = $this->_normalize_score(
1098 $this->_distance($this->_lang_db[$lang1], $value)); 1088 $this->_distance($this->_lang_db[$lang1], $value)
1089 );
1099 } 1090 }
1100 } 1091 }
1101 asort($return_arr); 1092 asort($return_arr);
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect
1104 } 1095 }
1105 1096
1106 1097
1107 // compare all languages to each other
1108 } else { 1098 } else {
1099 // compare all languages to each other
1109 $return_arr = array(); 1100 $return_arr = array();
1110 foreach (array_keys($this->_lang_db) as $lang1) { 1101 foreach (array_keys($this->_lang_db) as $lang1) {
1111 foreach (array_keys($this->_lang_db) as $lang2) { 1102 foreach (array_keys($this->_lang_db) as $lang2) {
1112
1113 // skip comparing languages to themselves 1103 // skip comparing languages to themselves
1114 if ($lang1 != $lang2) { 1104 if ($lang1 != $lang2) {
1115
1116 // don't re-calculate what's already been done
1117 if (isset($return_arr[$lang2][$lang1])) {
1118 1105
1119 $return_arr[$lang1][$lang2] = 1106 if (isset($return_arr[$lang2][$lang1])) {
1120 $return_arr[$lang2][$lang1]; 1107 // don't re-calculate what's already been done
1108 $return_arr[$lang1][$lang2]
1109 = $return_arr[$lang2][$lang1];
1121 1110
1122 // calculate
1123 } else { 1111 } else {
1124 1112 // calculate
1125 $return_arr[$lang1][$lang2] = 1113 $return_arr[$lang1][$lang2]
1126 $this->_normalize_score( 1114 = $this->_normalize_score(
1127 $this->_distance( 1115 $this->_distance(
1128 $this->_lang_db[$lang1], 1116 $this->_lang_db[$lang1],
1129 $this->_lang_db[$lang2] 1117 $this->_lang_db[$lang2]
1130 ) 1118 )
1131 ); 1119 );
1132 1120
1133 } 1121 }
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect
1150 * 1138 *
1151 * @access public 1139 * @access public
1152 * @return array language cluster data 1140 * @return array language cluster data
1153 * @throws PEAR_Error 1141 * @throws Text_LanguageDetect_Exception
1154 * @see languageSimilarity() 1142 * @see languageSimilarity()
1155 * @deprecated this function will eventually be removed and placed into 1143 * @deprecated this function will eventually be removed and placed into
1156 * the model generation class 1144 * the model generation class
1157 */ 1145 */
1158 function clusterLanguages() 1146 function clusterLanguages()
1159 { 1147 {
1160 // todo: set the maximum number of clusters 1148 // todo: set the maximum number of clusters
1161
1162 // setup check
1163 if (!$this->_setup_ok($err)) {
1164 return $err;
1165 }
1166
1167 // return cached result, if any 1149 // return cached result, if any
1168 if (isset($this->_clusters)) { 1150 if (isset($this->_clusters)) {
1169 return $this->_clusters; 1151 return $this->_clusters;
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect
1177 1159
1178 foreach ($langs as $lang) { 1160 foreach ($langs as $lang) {
1179 if (!isset($this->_lang_db[$lang])) { 1161 if (!isset($this->_lang_db[$lang])) {
1180 throw new Exception("missing $lang!\n"); 1162 throw new Text_LanguageDetect_Exception(
1163 "missing $lang!",
1164 Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
1165 );
1181 } 1166 }
1182 } 1167 }
1183 1168
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect
1186 $langs[$lang1] = $lang1; 1171 $langs[$lang1] = $lang1;
1187 unset($langs[$old_key]); 1172 unset($langs[$old_key]);
1188 } 1173 }
1189 1174
1175 $result_data = $really_map = array();
1176
1190 $i = 0; 1177 $i = 0;
1191 while (count($langs) > 2 && $i++ < 200) { 1178 while (count($langs) > 2 && $i++ < 200) {
1192 $highest_score = -1; 1179 $highest_score = -1;
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect
1194 $highest_key2 = ''; 1181 $highest_key2 = '';
1195 foreach ($langs as $lang1) { 1182 foreach ($langs as $lang1) {
1196 foreach ($langs as $lang2) { 1183 foreach ($langs as $lang2) {
1197 if ( $lang1 != $lang2 1184 if ($lang1 != $lang2
1198 && $arr[$lang1][$lang2] > $highest_score) { 1185 && $arr[$lang1][$lang2] > $highest_score
1186 ) {
1199 $highest_score = $arr[$lang1][$lang2]; 1187 $highest_score = $arr[$lang1][$lang2];
1200 $highest_key1 = $lang1; 1188 $highest_key1 = $lang1;
1201 $highest_key2 = $lang2; 1189 $highest_key2 = $lang2;
1202 } 1190 }
1203 } 1191 }
1204 } 1192 }
1205 1193
1206 if (!$highest_key1) { 1194 if (!$highest_key1) {
1207 // should not ever happen 1195 // should not ever happen
1208 throw new Exception("no highest key? (step: $i)"); 1196 throw new Text_LanguageDetect_Exception(
1197 "no highest key? (step: $i)",
1198 Text_LanguageDetect_Exception::NO_HIGHEST_KEY
1199 );
1209 } 1200 }
1210 1201
1211 if ($highest_score == 0) { 1202 if ($highest_score == 0) {
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect
1217 $sum1 = array_sum($arr[$highest_key1]); 1208 $sum1 = array_sum($arr[$highest_key1]);
1218 $sum2 = array_sum($arr[$highest_key2]); 1209 $sum2 = array_sum($arr[$highest_key2]);
1219 1210
1220 // use the score for the one that is most similar to the rest of 1211 // use the score for the one that is most similar to the rest of
1221 // the field as the score for the group 1212 // the field as the score for the group
1222 // todo: could try averaging or "centroid" method instead 1213 // todo: could try averaging or "centroid" method instead
1223 // seems like that might make more sense 1214 // seems like that might make more sense
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect
1248 $really_lang = $replaceme; 1239 $really_lang = $replaceme;
1249 while (isset($really_map[$really_lang])) { 1240 while (isset($really_map[$really_lang])) {
1250 $really_lang = $really_map[$really_lang]; 1241 $really_lang = $really_map[$really_lang];
1251 } 1242 }
1252 $really_map[$newkey] = $really_lang; 1243 $really_map[$newkey] = $really_lang;
1253 1244
1254 1245
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect
1259 $arr[$key1][$newkey] = $arr[$key1][$key2]; 1250 $arr[$key1][$newkey] = $arr[$key1][$key2];
1260 unset($arr[$key1][$key2]); 1251 unset($arr[$key1][$key2]);
1261 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] 1252 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
1262 } 1253 }
1263 1254
1264 if ($key1 == $replaceme) { 1255 if ($key1 == $replaceme) {
1265 $arr[$newkey][$key2] = $arr[$key1][$key2]; 1256 $arr[$newkey][$key2] = $arr[$key1][$key2];
1266 unset($arr[$key1][$key2]); 1257 unset($arr[$key1][$key2]);
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect
1273 } 1264 }
1274 } 1265 }
1275 } 1266 }
1276 1267
1277 1268
1278 unset($langs[$highest_key1]); 1269 unset($langs[$highest_key1]);
1279 unset($langs[$highest_key2]); 1270 unset($langs[$highest_key2]);
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect
1293 } 1284 }
1294 1285
1295 $return_val = array( 1286 $return_val = array(
1296 'open_forks' => $langs, 1287 'open_forks' => $langs,
1297 // the top level of clusters 1288 // the top level of clusters
1298 // clusters that are mutually exclusive 1289 // clusters that are mutually exclusive
1299 // or specified by a specific maximum 1290 // or specified by a specific maximum
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect
1323 * use, and it may disappear or its functionality may change in future 1314 * use, and it may disappear or its functionality may change in future
1324 * releases without notice. 1315 * releases without notice.
1325 * 1316 *
1326 * This compares the sample text to top the top level of clusters. If the 1317 * This compares the sample text to top the top level of clusters. If the
1327 * sample is similar to the cluster it will drop down and compare it to the 1318 * sample is similar to the cluster it will drop down and compare it to the
1328 * languages in the cluster, and so on until it hits a leaf node. 1319 * languages in the cluster, and so on until it hits a leaf node.
1329 * 1320 *
1330 * this should find the language in considerably fewer compares 1321 * this should find the language in considerably fewer compares
1331 * (the equivalent of a binary search), however clusterLanguages() is costly 1322 * (the equivalent of a binary search), however clusterLanguages() is costly
1332 * and the loss of accuracy from this technique is significant. 1323 * and the loss of accuracy from this technique is significant.
1333 * 1324 *
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect
1337 * was very large, however in such cases some method of Bayesian inference 1328 * was very large, however in such cases some method of Bayesian inference
1338 * might be more helpful. 1329 * might be more helpful.
1339 * 1330 *
1340 * @see clusterLanguages() 1331 * @param string $str input string
1341 * @access public 1332 *
1342 * @param string $str input string 1333 * @return array language scores (only those compared)
1343 * @return array language scores (only those compared) 1334 * @throws Text_LanguageDetect_Exception
1344 * @throws PEAR_Error 1335 * @see clusterLanguages()
1345 */ 1336 */
1346 function clusteredSearch($str) 1337 public function clusteredSearch($str)
1347 { 1338 {
1348
1349 // input check 1339 // input check
1350 if (!Text_LanguageDetect_Parser::validateString($str)) { 1340 if (!Text_LanguageDetect_Parser::validateString($str)) {
1351 return array(); 1341 return array();
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect
1359 $dendogram_data = $result['fork_data']; 1349 $dendogram_data = $result['fork_data'];
1360 $dendogram_alias = $result['name_map']; 1350 $dendogram_alias = $result['name_map'];
1361 1351
1362 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 1352 $sample_obj = new Text_LanguageDetect_Parser($str);
1363 $sample_obj->prepareTrigram(); 1353 $sample_obj->prepareTrigram();
1364 $sample_obj->setPadStart(!$this->_perl_compatible); 1354 $sample_obj->setPadStart(!$this->_perl_compatible);
1365 $sample_obj->analyze(); 1355 $sample_obj->analyze();
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect
1372 } 1362 }
1373 1363
1374 $i = 0; // counts the number of steps 1364 $i = 0; // counts the number of steps
1375 1365
1376 foreach ($dendogram_start as $lang) { 1366 foreach ($dendogram_start as $lang) {
1377 if (isset($dendogram_alias[$lang])) { 1367 if (isset($dendogram_alias[$lang])) {
1378 $lang_key = $dendogram_alias[$lang]; 1368 $lang_key = $dendogram_alias[$lang];
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect
1382 1372
1383 $scores[$lang] = $this->_normalize_score( 1373 $scores[$lang] = $this->_normalize_score(
1384 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1374 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1385 $sample_count); 1375 $sample_count
1376 );
1386 1377
1387 $i++; 1378 $i++;
1388 } 1379 }
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect
1411 1402
1412 $scores[$lang] = $this->_normalize_score( 1403 $scores[$lang] = $this->_normalize_score(
1413 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1404 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1414 $sample_count); 1405 $sample_count
1406 );
1415 1407
1416 //todo: does not need to do same comparison again 1408 //todo: does not need to do same comparison again
1417 } 1409 }
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect
1428 1420
1429 $diff = $scores[$cur_key] - $scores[$loser_key]; 1421 $diff = $scores[$cur_key] - $scores[$loser_key];
1430 1422
1431 // $cur_key ({$dendogram_alias[$cur_key]}) wins 1423 // $cur_key ({$dendogram_alias[$cur_key]}) wins
1432 // over $loser_key ({$dendogram_alias[$loser_key]}) 1424 // over $loser_key ({$dendogram_alias[$loser_key]})
1433 // with a difference of $diff 1425 // with a difference of $diff
1434 } 1426 }
1435 1427
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect
1439 // which paths the algorithm decided to take along the tree 1431 // which paths the algorithm decided to take along the tree
1440 1432
1441 // but sometimes the last item is only the second highest 1433 // but sometimes the last item is only the second highest
1442 if ( ($this->_perl_compatible && (end($scores) > prev($scores))) 1434 if (($this->_perl_compatible && (end($scores) > prev($scores)))
1443 || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { 1435 || (!$this->_perl_compatible && (end($scores) < prev($scores)))
1444 1436 ) {
1445 $real_last_score = current($scores); 1437 $real_last_score = current($scores);
1446 $real_last_key = key($scores); 1438 $real_last_key = key($scores);
1447 1439
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect
1449 unset($scores[$real_last_key]); 1441 unset($scores[$real_last_key]);
1450 $scores[$real_last_key] = $real_last_score; 1442 $scores[$real_last_key] = $real_last_score;
1451 } 1443 }
1452 1444
1453 1445
1454 if (!$this->_perl_compatible) { 1446 if (!$this->_perl_compatible) {
1455 $scores = array_reverse($scores, true); 1447 $scores = array_reverse($scores, true);
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect
1464 * 1456 *
1465 * Returns the numbers of characters (not bytes) in a utf8 string 1457 * Returns the numbers of characters (not bytes) in a utf8 string
1466 * 1458 *
1467 * @static 1459 * @param string $str string to get the length of
1468 * @access public 1460 *
1469 * @param string $str string to get the length of 1461 * @return int number of chars
1470 * @return int number of chars
1471 */ 1462 */
1472 function utf8strlen($str) 1463 public static function utf8strlen($str)
1473 { 1464 {
1474 // utf8_decode() will convert unknown chars to '?', which is actually 1465 // utf8_decode() will convert unknown chars to '?', which is actually
1475 // ideal for counting. 1466 // ideal for counting.
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect
1482 /** 1473 /**
1483 * Returns the unicode value of a utf8 char 1474 * Returns the unicode value of a utf8 char
1484 * 1475 *
1485 * @access protected 1476 * @param string $char a utf8 (possibly multi-byte) char
1486 * @param string $char a utf8 (possibly multi-byte) char 1477 *
1487 * @return int unicode value or -1 if malformatted 1478 * @return int unicode value
1479 * @access protected
1480 * @link http://en.wikipedia.org/wiki/UTF-8
1488 */ 1481 */
1489 function _utf8char2unicode($char) { 1482 function _utf8char2unicode($char)
1490 1483 {
1491 // strlen() here will actually get the binary length of a single char 1484 // strlen() here will actually get the binary length of a single char
1492 switch (strlen($char)) { 1485 switch (strlen($char)) {
1493 1486 case 1:
1494 // for a reference, see http://en.wikipedia.org/wiki/UTF-8 1487 // normal ASCII-7 byte
1495 1488 // 0xxxxxxx --> 0xxxxxxx
1496 case 1: 1489 return ord($char{0});
1497 // normal ASCII-7 byte 1490
1498 // 0xxxxxxx --> 0xxxxxxx 1491 case 2:
1499 return ord($char{0}); 1492 // 2 byte unicode
1500 1493 // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
1501 case 2: 1494 $z = (ord($char{0}) & 0x000001F) << 6;
1502 // 2 byte unicode 1495 $x = (ord($char{1}) & 0x0000003F);
1503 // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx 1496 return ($z | $x);
1504 $z = (ord($char{0}) & 0x000001F) << 6; 1497
1505 $x = (ord($char{1}) & 0x0000003F); 1498 case 3:
1506 1499 // 3 byte unicode
1507 return ($z | $x); 1500 // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
1508 1501 $z = (ord($char{0}) & 0x0000000F) << 12;
1509 case 3: 1502 $x1 = (ord($char{1}) & 0x0000003F) << 6;
1510 // 3 byte unicode 1503 $x2 = (ord($char{2}) & 0x0000003F);
1511 // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx 1504 return ($z | $x1 | $x2);
1512 $z = (ord($char{0}) & 0x0000000F) << 12; 1505
1513 $x1 = (ord($char{1}) & 0x0000003F) << 6; 1506 case 4:
1514 $x2 = (ord($char{2}) & 0x0000003F); 1507 // 4 byte unicode
1515 1508 // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
1516 return ($z | $x1 | $x2); 1509 // 000zzzzz xxxxxxxx xxxxxxxx
1517 1510 $z1 = (ord($char{0}) & 0x00000007) << 18;
1518 case 4: 1511 $z2 = (ord($char{1}) & 0x0000003F) << 12;
1519 // 4 byte unicode 1512 $x1 = (ord($char{2}) & 0x0000003F) << 6;
1520 // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> 1513 $x2 = (ord($char{3}) & 0x0000003F);
1521 // 000zzzzz xxxxxxxx xxxxxxxx 1514 return ($z1 | $z2 | $x1 | $x2);
1522 $z1 = (ord($char{0}) & 0x00000007) << 18;
1523 $z2 = (ord($char{1}) & 0x0000003F) << 12;
1524 $x1 = (ord($char{2}) & 0x0000003F) << 6;
1525 $x2 = (ord($char{3}) & 0x0000003F);
1526
1527 return ($z1 | $z2 | $x1 | $x2);
1528
1529 default:
1530 // error: malformatted char?
1531 return -1;
1532 } 1515 }
1533 } 1516 }
1534 1517
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect
1536 * utf8-safe fast character iterator 1519 * utf8-safe fast character iterator
1537 * 1520 *
1538 * Will get the next character starting from $counter, which will then be 1521 * Will get the next character starting from $counter, which will then be
1539 * incremented. If a multi-byte char the bytes will be concatenated and 1522 * incremented. If a multi-byte char the bytes will be concatenated and
1540 * $counter will be incremeted by the number of bytes in the char. 1523 * $counter will be incremeted by the number of bytes in the char.
1541 * 1524 *
1542 * @access private 1525 * @param string $str the string being iterated over
1543 * @param string &$str the string being iterated over 1526 * @param int &$counter the iterator, will increment by reference
1544 * @param int &$counter the iterator, will increment by reference 1527 * @param bool $special_convert whether to do special conversions
1545 * @param bool $special_convert whether to do special conversions 1528 *
1546 * @return char the next (possibly multi-byte) char from $counter 1529 * @return char the next (possibly multi-byte) char from $counter
1530 * @access private
1547 */ 1531 */
1548 function _next_char(&$str, &$counter, $special_convert = false) 1532 static function _next_char($str, &$counter, $special_convert = false)
1549 { 1533 {
1550
1551 $char = $str{$counter++}; 1534 $char = $str{$counter++};
1552 $ord = ord($char); 1535 $ord = ord($char);
1553 1536
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect
1556 1539
1557 // normal ascii one byte char 1540 // normal ascii one byte char
1558 if ($ord <= 127) { 1541 if ($ord <= 127) {
1559
1560 // special conversions needed for this package 1542 // special conversions needed for this package
1561 // (that only apply to regular ascii characters) 1543 // (that only apply to regular ascii characters)
1562 // lower case, and convert all non-alphanumeric characters 1544 // lower case, and convert all non-alphanumeric characters
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect
1571 1553
1572 return $char; 1554 return $char;
1573 1555
1574 // multi-byte chars
1575 } elseif ($ord >> 5 == 6) { // two-byte char 1556 } elseif ($ord >> 5 == 6) { // two-byte char
1557 // multi-byte chars
1576 $nextchar = $str{$counter++}; // get next byte 1558 $nextchar = $str{$counter++}; // get next byte
1577 1559
1578 // lower-casing of non-ascii characters is still incomplete 1560 // lower-casing of non-ascii characters is still incomplete
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect
1582 if ($ord == 195) { 1564 if ($ord == 195) {
1583 $nextord = ord($nextchar); 1565 $nextord = ord($nextchar);
1584 $nextord_adj = $nextord + 64; 1566 $nextord_adj = $nextord + 64;
1585 // for a reference, see 1567 // for a reference, see
1586 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html 1568 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
1587 1569
1588 // &Agrave; - &THORN; but not &times; 1570 // &Agrave; - &THORN; but not &times;
1589 if ( $nextord_adj >= 192 1571 if ($nextord_adj >= 192
1590 && $nextord_adj <= 222 1572 && $nextord_adj <= 222
1591 && $nextord_adj != 215) { 1573 && $nextord_adj != 215
1592 1574 ) {
1593 $nextchar = chr($nextord + 32); 1575 $nextchar = chr($nextord + 32);
1594 } 1576 }
1595 1577
1596 // lower case cyrillic alphabet
1597 } elseif ($ord == 208) { 1578 } elseif ($ord == 208) {
1579 // lower case cyrillic alphabet
1598 $nextord = ord($nextchar); 1580 $nextord = ord($nextchar);
1599 // if A - Pe 1581 // if A - Pe
1600 if ($nextord >= 144 && $nextord <= 159) { 1582 if ($nextord >= 144 && $nextord <= 159) {
1601 // lower case 1583 // lower case
1602 $nextchar = chr($nextord + 32); 1584 $nextchar = chr($nextord + 32);
1603 1585
1604 // if Er - Ya
1605 } elseif ($nextord >= 160 && $nextord <= 175) { 1586 } elseif ($nextord >= 160 && $nextord <= 175) {
1587 // if Er - Ya
1606 // lower case 1588 // lower case
1607 $char = chr(209); // == $ord++ 1589 $char = chr(209); // == $ord++
1608 $nextchar = chr($nextord - 32); 1590 $nextchar = chr($nextord - 32);
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect
1611 } 1593 }
1612 1594
1613 // tag on next byte 1595 // tag on next byte
1614 return $char . $nextchar; 1596 return $char . $nextchar;
1615
1616 } elseif ($ord >> 4 == 14) { // three-byte char 1597 } elseif ($ord >> 4 == 14) { // three-byte char
1617 1598
1618 // tag on next 2 bytes 1599 // tag on next 2 bytes
1619 return $char . $str{$counter++} . $str{$counter++}; 1600 return $char . $str{$counter++} . $str{$counter++};
1620 1601
1621 } elseif ($ord >> 3 == 30) { // four-byte char 1602 } elseif ($ord >> 3 == 30) { // four-byte char
1622 1603
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect
1628 } 1609 }
1629 } 1610 }
1630 1611
1631} 1612 /**
1613 * Converts an $language input parameter from the configured mode
1614 * to the language name that is used internally.
1615 *
1616 * Works for strings and arrays.
1617 *
1618 * @param string|array $lang A language description ("english"/"en"/"eng")
1619 * @param boolean $convertKey If $lang is an array, setting $key
1620 * converts the keys to the language name.
1621 *
1622 * @return string|array Language name
1623 */
1624 function _convertFromNameMode($lang, $convertKey = false)
1625 {
1626 if ($this->_name_mode == 0) {
1627 return $lang;
1628 }
1629
1630 if ($this->_name_mode == 2) {
1631 $method = 'code2ToName';
1632 } else {
1633 $method = 'code3ToName';
1634 }
1635
1636 if (is_string($lang)) {
1637 return (string)Text_LanguageDetect_ISO639::$method($lang);
1638 }
1639
1640 $newlang = array();
1641 foreach ($lang as $key => $val) {
1642 if ($convertKey) {
1643 $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
1644 $newlang[$newkey] = $val;
1645 } else {
1646 $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
1647 }
1648 }
1649 return $newlang;
1650 }
1632 1651
1633/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 1652 /**
1653 * Converts an $language output parameter from the language name that is
1654 * used internally to the configured mode.
1655 *
1656 * Works for strings and arrays.
1657 *
1658 * @param string|array $lang A language description ("english"/"en"/"eng")
1659 * @param boolean $convertKey If $lang is an array, setting $key
1660 * converts the keys to the language name.
1661 *
1662 * @return string|array Language name
1663 */
1664 function _convertToNameMode($lang, $convertKey = false)
1665 {
1666 if ($this->_name_mode == 0) {
1667 return $lang;
1668 }
1669
1670 if ($this->_name_mode == 2) {
1671 $method = 'nameToCode2';
1672 } else {
1673 $method = 'nameToCode3';
1674 }
1675
1676 if (is_string($lang)) {
1677 return Text_LanguageDetect_ISO639::$method($lang);
1678 }
1679
1680 $newlang = array();
1681 foreach ($lang as $key => $val) {
1682 if ($convertKey) {
1683 $newkey = Text_LanguageDetect_ISO639::$method($key);
1684 $newlang[$newkey] = $val;
1685 } else {
1686 $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
1687 }
1688 }
1689 return $newlang;
1690 }
1691}
1634 1692
1635?> 1693/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
new file mode 100644
index 00000000..196d994f
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
1<?php
2class Text_LanguageDetect_Exception extends Exception
3{
4 /**
5 * Database file could not be found
6 */
7 const DB_NOT_FOUND = 10;
8
9 /**
10 * Database file found, but not readable
11 */
12 const DB_NOT_READABLE = 11;
13
14 /**
15 * Database file is empty
16 */
17 const DB_EMPTY = 12;
18
19 /**
20 * Database contents is not a PHP array
21 */
22 const DB_NOT_ARRAY = 13;
23
24 /**
25 * Magic quotes are activated
26 */
27 const MAGIC_QUOTES = 14;
28
29
30 /**
31 * Parameter of invalid type passed to method
32 */
33 const PARAM_TYPE = 20;
34
35 /**
36 * Character in parameter is invalid
37 */
38 const INVALID_CHAR = 21;
39
40
41 /**
42 * Language is not in the database
43 */
44 const UNKNOWN_LANGUAGE = 30;
45
46
47 /**
48 * Error during block detection
49 */
50 const BLOCK_DETECTION = 40;
51
52
53 /**
54 * Error while clustering languages
55 */
56 const NO_HIGHEST_KEY = 50;
57}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
new file mode 100644
index 00000000..05b0590d
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
1<?php
2/**
3 * Part of Text_LanguageDetect
4 *
5 * PHP version 5
6 *
7 * @category Text
8 * @package Text_LanguageDetect
9 * @author Christian Weiske <cweiske@php.net>
10 * @copyright 2011 Christian Weiske <cweiske@php.net>
11 * @license http://www.debian.org/misc/bsd.license BSD
12 * @version SVN: $Id$
13 * @link http://pear.php.net/package/Text_LanguageDetect/
14 */
15
16/**
17 * Provides a mapping between the languages from lang.dat and the
18 * ISO 639-1 and ISO-639-2 codes.
19 *
20 * Note that this class contains only languages that exist in lang.dat.
21 *
22 * @category Text
23 * @package Text_LanguageDetect
24 * @author Christian Weiske <cweiske@php.net>
25 * @copyright 2011 Christian Weiske <cweiske@php.net>
26 * @license http://www.debian.org/misc/bsd.license BSD
27 * @link http://www.loc.gov/standards/iso639-2/php/code_list.php
28 */
29class Text_LanguageDetect_ISO639
30{
31 /**
32 * Maps all language names from the language database to the
33 * ISO 639-1 2-letter language code.
34 *
35 * NULL indicates that there is no 2-letter code.
36 *
37 * @var array
38 */
39 public static $nameToCode2 = array(
40 'albanian' => 'sq',
41 'arabic' => 'ar',
42 'azeri' => 'az',
43 'bengali' => 'bn',
44 'bulgarian' => 'bg',
45 'cebuano' => null,
46 'croatian' => 'hr',
47 'czech' => 'cs',
48 'danish' => 'da',
49 'dutch' => 'nl',
50 'english' => 'en',
51 'estonian' => 'et',
52 'farsi' => 'fa',
53 'finnish' => 'fi',
54 'french' => 'fr',
55 'german' => 'de',
56 'hausa' => 'ha',
57 'hawaiian' => null,
58 'hindi' => 'hi',
59 'hungarian' => 'hu',
60 'icelandic' => 'is',
61 'indonesian' => 'id',
62 'italian' => 'it',
63 'kazakh' => 'kk',
64 'kyrgyz' => 'ky',
65 'latin' => 'la',
66 'latvian' => 'lv',
67 'lithuanian' => 'lt',
68 'macedonian' => 'mk',
69 'mongolian' => 'mn',
70 'nepali' => 'ne',
71 'norwegian' => 'no',
72 'pashto' => 'ps',
73 'pidgin' => null,
74 'polish' => 'pl',
75 'portuguese' => 'pt',
76 'romanian' => 'ro',
77 'russian' => 'ru',
78 'serbian' => 'sr',
79 'slovak' => 'sk',
80 'slovene' => 'sl',
81 'somali' => 'so',
82 'spanish' => 'es',
83 'swahili' => 'sw',
84 'swedish' => 'sv',
85 'tagalog' => 'tl',
86 'turkish' => 'tr',
87 'ukrainian' => 'uk',
88 'urdu' => 'ur',
89 'uzbek' => 'uz',
90 'vietnamese' => 'vi',
91 'welsh' => 'cy',
92 );
93
94 /**
95 * Maps all language names from the language database to the
96 * ISO 639-2 3-letter language code.
97 *
98 * @var array
99 */
100 public static $nameToCode3 = array(
101 'albanian' => 'sqi',
102 'arabic' => 'ara',
103 'azeri' => 'aze',
104 'bengali' => 'ben',
105 'bulgarian' => 'bul',
106 'cebuano' => 'ceb',
107 'croatian' => 'hrv',
108 'czech' => 'ces',
109 'danish' => 'dan',
110 'dutch' => 'nld',
111 'english' => 'eng',
112 'estonian' => 'est',
113 'farsi' => 'fas',
114 'finnish' => 'fin',
115 'french' => 'fra',
116 'german' => 'deu',
117 'hausa' => 'hau',
118 'hawaiian' => 'haw',
119 'hindi' => 'hin',
120 'hungarian' => 'hun',
121 'icelandic' => 'isl',
122 'indonesian' => 'ind',
123 'italian' => 'ita',
124 'kazakh' => 'kaz',
125 'kyrgyz' => 'kir',
126 'latin' => 'lat',
127 'latvian' => 'lav',
128 'lithuanian' => 'lit',
129 'macedonian' => 'mkd',
130 'mongolian' => 'mon',
131 'nepali' => 'nep',
132 'norwegian' => 'nor',
133 'pashto' => 'pus',
134 'pidgin' => 'crp',
135 'polish' => 'pol',
136 'portuguese' => 'por',
137 'romanian' => 'ron',
138 'russian' => 'rus',
139 'serbian' => 'srp',
140 'slovak' => 'slk',
141 'slovene' => 'slv',
142 'somali' => 'som',
143 'spanish' => 'spa',
144 'swahili' => 'swa',
145 'swedish' => 'swe',
146 'tagalog' => 'tgl',
147 'turkish' => 'tur',
148 'ukrainian' => 'ukr',
149 'urdu' => 'urd',
150 'uzbek' => 'uzb',
151 'vietnamese' => 'vie',
152 'welsh' => 'cym',
153 );
154
155 /**
156 * Maps ISO 639-1 2-letter language codes to the language names
157 * in the language database
158 *
159 * Not all languages have a 2 letter code, so some are missing
160 *
161 * @var array
162 */
163 public static $code2ToName = array(
164 'ar' => 'arabic',
165 'az' => 'azeri',
166 'bg' => 'bulgarian',
167 'bn' => 'bengali',
168 'cs' => 'czech',
169 'cy' => 'welsh',
170 'da' => 'danish',
171 'de' => 'german',
172 'en' => 'english',
173 'es' => 'spanish',
174 'et' => 'estonian',
175 'fa' => 'farsi',
176 'fi' => 'finnish',
177 'fr' => 'french',
178 'ha' => 'hausa',
179 'hi' => 'hindi',
180 'hr' => 'croatian',
181 'hu' => 'hungarian',
182 'id' => 'indonesian',
183 'is' => 'icelandic',
184 'it' => 'italian',
185 'kk' => 'kazakh',
186 'ky' => 'kyrgyz',
187 'la' => 'latin',
188 'lt' => 'lithuanian',
189 'lv' => 'latvian',
190 'mk' => 'macedonian',
191 'mn' => 'mongolian',
192 'ne' => 'nepali',
193 'nl' => 'dutch',
194 'no' => 'norwegian',
195 'pl' => 'polish',
196 'ps' => 'pashto',
197 'pt' => 'portuguese',
198 'ro' => 'romanian',
199 'ru' => 'russian',
200 'sk' => 'slovak',
201 'sl' => 'slovene',
202 'so' => 'somali',
203 'sq' => 'albanian',
204 'sr' => 'serbian',
205 'sv' => 'swedish',
206 'sw' => 'swahili',
207 'tl' => 'tagalog',
208 'tr' => 'turkish',
209 'uk' => 'ukrainian',
210 'ur' => 'urdu',
211 'uz' => 'uzbek',
212 'vi' => 'vietnamese',
213 );
214
215 /**
216 * Maps ISO 639-2 3-letter language codes to the language names
217 * in the language database.
218 *
219 * @var array
220 */
221 public static $code3ToName = array(
222 'ara' => 'arabic',
223 'aze' => 'azeri',
224 'ben' => 'bengali',
225 'bul' => 'bulgarian',
226 'ceb' => 'cebuano',
227 'ces' => 'czech',
228 'crp' => 'pidgin',
229 'cym' => 'welsh',
230 'dan' => 'danish',
231 'deu' => 'german',
232 'eng' => 'english',
233 'est' => 'estonian',
234 'fas' => 'farsi',
235 'fin' => 'finnish',
236 'fra' => 'french',
237 'hau' => 'hausa',
238 'haw' => 'hawaiian',
239 'hin' => 'hindi',
240 'hrv' => 'croatian',
241 'hun' => 'hungarian',
242 'ind' => 'indonesian',
243 'isl' => 'icelandic',
244 'ita' => 'italian',
245 'kaz' => 'kazakh',
246 'kir' => 'kyrgyz',
247 'lat' => 'latin',
248 'lav' => 'latvian',
249 'lit' => 'lithuanian',
250 'mkd' => 'macedonian',
251 'mon' => 'mongolian',
252 'nep' => 'nepali',
253 'nld' => 'dutch',
254 'nor' => 'norwegian',
255 'pol' => 'polish',
256 'por' => 'portuguese',
257 'pus' => 'pashto',
258 'rom' => 'romanian',
259 'rus' => 'russian',
260 'slk' => 'slovak',
261 'slv' => 'slovene',
262 'som' => 'somali',
263 'spa' => 'spanish',
264 'sqi' => 'albanian',
265 'srp' => 'serbian',
266 'swa' => 'swahili',
267 'swe' => 'swedish',
268 'tgl' => 'tagalog',
269 'tur' => 'turkish',
270 'ukr' => 'ukrainian',
271 'urd' => 'urdu',
272 'uzb' => 'uzbek',
273 'vie' => 'vietnamese',
274 );
275
276 /**
277 * Returns the 2-letter ISO 639-1 code for the given language name.
278 *
279 * @param string $lang English language name like "swedish"
280 *
281 * @return string Two-letter language code (e.g. "sv") or NULL if not found
282 */
283 public static function nameToCode2($lang)
284 {
285 $lang = strtolower($lang);
286 if (!isset(self::$nameToCode2[$lang])) {
287 return null;
288 }
289 return self::$nameToCode2[$lang];
290 }
291
292 /**
293 * Returns the 3-letter ISO 639-2 code for the given language name.
294 *
295 * @param string $lang English language name like "swedish"
296 *
297 * @return string Three-letter language code (e.g. "swe") or NULL if not found
298 */
299 public static function nameToCode3($lang)
300 {
301 $lang = strtolower($lang);
302 if (!isset(self::$nameToCode3[$lang])) {
303 return null;
304 }
305 return self::$nameToCode3[$lang];
306 }
307
308 /**
309 * Returns the language name for the given 2-letter ISO 639-1 code.
310 *
311 * @param string $code Two-letter language code (e.g. "sv")
312 *
313 * @return string English language name like "swedish"
314 */
315 public static function code2ToName($code)
316 {
317 $lang = strtolower($code);
318 if (!isset(self::$code2ToName[$code])) {
319 return null;
320 }
321 return self::$code2ToName[$code];
322 }
323
324 /**
325 * Returns the language name for the given 3-letter ISO 639-2 code.
326 *
327 * @param string $code Three-letter language code (e.g. "swe")
328 *
329 * @return string English language name like "swedish"
330 */
331 public static function code3ToName($code)
332 {
333 $lang = strtolower($code);
334 if (!isset(self::$code3ToName[$code])) {
335 return null;
336 }
337 return self::$code3ToName[$code];
338 }
339} \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
index 7f15fa98..fb0e1e20 100644
--- a/inc/3rdparty/libraries/language-detect/Parser.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -8,7 +8,7 @@
8 * @author Nicholas Pisarro 8 * @author Nicholas Pisarro
9 * @copyright 2006 9 * @copyright 2006
10 * @license BSD 10 * @license BSD
11 * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ 11 * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
12 * @link http://pear.php.net/package/Text_LanguageDetect/ 12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 * @link http://langdetect.blogspot.com/ 13 * @link http://langdetect.blogspot.com/
14 */ 14 */
@@ -28,7 +28,7 @@
28 * @author Nicholas Pisarro 28 * @author Nicholas Pisarro
29 * @copyright 2006 29 * @copyright 2006
30 * @license BSD 30 * @license BSD
31 * @version release: 0.2.3 31 * @version release: 0.3.0
32 */ 32 */
33class Text_LanguageDetect_Parser extends Text_LanguageDetect 33class Text_LanguageDetect_Parser extends Text_LanguageDetect
34{ 34{
@@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
102 * @access private 102 * @access private
103 * @param string $string string to be parsed 103 * @param string $string string to be parsed
104 */ 104 */
105 function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) { 105 function Text_LanguageDetect_Parser($string) {
106 if (isset($db)) $this->_db_filename = $db;
107 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
108 $this->_string = $string; 106 $this->_string = $string;
109 } 107 }
110 108
111 /** 109 /**
112 * Returns true if a string is suitable for parsing 110 * Returns true if a string is suitable for parsing
113 * 111 *
114 * @static
115 * @access public
116 * @param string $str input string to test 112 * @param string $str input string to test
117 * @return bool true if acceptable, false if not 113 * @return bool true if acceptable, false if not
118 */ 114 */
119 function validateString($str) { 115 public static function validateString($str) {
120 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { 116 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
121 return true; 117 return true;
122 } else { 118 } else {
@@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
222 218
223 // unicode startup 219 // unicode startup
224 if ($this->_compile_unicode) { 220 if ($this->_compile_unicode) {
225 $blocks =& $this->_read_unicode_block_db(); 221 $blocks = $this->_read_unicode_block_db();
226
227 $block_count = count($blocks); 222 $block_count = count($blocks);
228 223
229 $skipped_count = 0; 224 $skipped_count = 0;
@@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
349 } 344 }
350} 345}
351 346
352/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 347/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
353
354?>
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php
index 2e8991cc..d0f09d74 100644
--- a/inc/3rdparty/libraries/readability/Readability.php
+++ b/inc/3rdparty/libraries/readability/Readability.php
@@ -1,1138 +1,1138 @@
1<?php 1<?php
2/** 2/**
3* Arc90's Readability ported to PHP for FiveFilters.org 3* Arc90's Readability ported to PHP for FiveFilters.org
4* Based on readability.js version 1.7.1 (without multi-page support) 4* Based on readability.js version 1.7.1 (without multi-page support)
5* Updated to allow HTML5 parsing with html5lib 5* Updated to allow HTML5 parsing with html5lib
6* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds 6* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
7* ------------------------------------------------------ 7* ------------------------------------------------------
8* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js 8* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
9* Arc90's project URL: http://lab.arc90.com/experiments/readability/ 9* Arc90's project URL: http://lab.arc90.com/experiments/readability/
10* JS Source: http://code.google.com/p/arc90labs-readability 10* JS Source: http://code.google.com/p/arc90labs-readability
11* Ported by: Keyvan Minoukadeh, http://www.keyvan.net 11* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
12* More information: http://fivefilters.org/content-only/ 12* More information: http://fivefilters.org/content-only/
13* License: Apache License, Version 2.0 13* License: Apache License, Version 2.0
14* Requires: PHP5 14* Requires: PHP5
15* Date: 2012-09-19 15* Date: 2012-09-19
16* 16*
17* Differences between the PHP port and the original 17* Differences between the PHP port and the original
18* ------------------------------------------------------ 18* ------------------------------------------------------
19* Arc90's Readability is designed to run in the browser. It works on the DOM 19* Arc90's Readability is designed to run in the browser. It works on the DOM
20* tree (the parsed HTML) after the page's CSS styles have been applied and 20* tree (the parsed HTML) after the page's CSS styles have been applied and
21* Javascript code executed. This PHP port does not run inside a browser. 21* Javascript code executed. This PHP port does not run inside a browser.
22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot
23* rely on CSS or Javascript support. As such, the results will not always 23* rely on CSS or Javascript support. As such, the results will not always
24* match Arc90's Readability. (For example, if a web page contains CSS style 24* match Arc90's Readability. (For example, if a web page contains CSS style
25* rules or Javascript code which hide certain HTML elements from display, 25* rules or Javascript code which hide certain HTML elements from display,
26* Arc90's Readability will dismiss those from consideration but our PHP port, 26* Arc90's Readability will dismiss those from consideration but our PHP port,
27* unable to understand CSS or Javascript, will not know any better.) 27* unable to understand CSS or Javascript, will not know any better.)
28* 28*
29* Another significant difference is that the aim of Arc90's Readability is 29* Another significant difference is that the aim of Arc90's Readability is
30* to re-present the main content block of a given web page so users can 30* to re-present the main content block of a given web page so users can
31* read it more easily in their browsers. Correct identification, clean up, 31* read it more easily in their browsers. Correct identification, clean up,
32* and separation of the content block is only a part of this process. 32* and separation of the content block is only a part of this process.
33* This PHP port is only concerned with this part, it does not include code 33* This PHP port is only concerned with this part, it does not include code
34* that relates to presentation in the browser - Arc90 already do 34* that relates to presentation in the browser - Arc90 already do
35* that extremely well, and for PDF output there's FiveFilters.org's 35* that extremely well, and for PDF output there's FiveFilters.org's
36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/. 36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
37* 37*
38* Finally, this class contains methods that might be useful for developers 38* Finally, this class contains methods that might be useful for developers
39* working on HTML document fragments. So without deviating too much from 39* working on HTML document fragments. So without deviating too much from
40* the original code (which I don't want to do because it makes debugging 40* the original code (which I don't want to do because it makes debugging
41* and updating more difficult), I've tried to make it a little more 41* and updating more difficult), I've tried to make it a little more
42* developer friendly. You should be able to use the methods here on 42* developer friendly. You should be able to use the methods here on
43* existing DOMElement objects without passing an entire HTML document to 43* existing DOMElement objects without passing an entire HTML document to
44* be parsed. 44* be parsed.
45*/ 45*/
46 46
47// This class allows us to do JavaScript like assignements to innerHTML 47// This class allows us to do JavaScript like assignements to innerHTML
48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); 48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
49 49
50// Alternative usage (for testing only!) 50// Alternative usage (for testing only!)
51// uncomment the lines below and call Readability.php in your browser 51// uncomment the lines below and call Readability.php in your browser
52// passing it the URL of the page you'd like content from, e.g.: 52// passing it the URL of the page you'd like content from, e.g.:
53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php 53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
54 54
55/* 55/*
56if (!isset($_GET['url']) || $_GET['url'] == '') { 56if (!isset($_GET['url']) || $_GET['url'] == '') {
57 die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); 57 die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
58} 58}
59$url = $_GET['url']; 59$url = $_GET['url'];
60if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; 60if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
61$html = file_get_contents($url); 61$html = file_get_contents($url);
62$r = new Readability($html, $url); 62$r = new Readability($html, $url);
63$r->init(); 63$r->init();
64echo $r->articleContent->innerHTML; 64echo $r->articleContent->innerHTML;
65*/ 65*/
66 66
67class Readability 67class Readability
68{ 68{
69 public $version = '1.7.1-without-multi-page'; 69 public $version = '1.7.1-without-multi-page';
70 public $convertLinksToFootnotes = false; 70 public $convertLinksToFootnotes = false;
71 public $revertForcedParagraphElements = true; 71 public $revertForcedParagraphElements = true;
72 public $articleTitle; 72 public $articleTitle;
73 public $articleContent; 73 public $articleContent;
74 public $dom; 74 public $dom;
75 public $url = null; // optional - URL where HTML was retrieved 75 public $url = null; // optional - URL where HTML was retrieved
76 public $debug = false; 76 public $debug = false;
77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19 77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19
78 protected $body = null; // 78 protected $body = null; //
79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later 79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. 80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
81 protected $success = false; // indicates whether we were able to extract or not 81 protected $success = false; // indicates whether we were able to extract or not
82 82
83 /** 83 /**
84 * All of the regular expressions in use within readability. 84 * All of the regular expressions in use within readability.
85 * Defined up here so we don't instantiate them repeatedly in loops. 85 * Defined up here so we don't instantiate them repeatedly in loops.
86 **/ 86 **/
87 public $regexps = array( 87 public $regexps = array(
88 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', 88 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
89 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 89 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
90 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', 90 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
91 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', 91 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
92 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', 92 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
93 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', 93 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
94 'replaceFonts' => '/<(\/?)font[^>]*>/i', 94 'replaceFonts' => '/<(\/?)font[^>]*>/i',
95 // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() 95 // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
96 'normalize' => '/\s{2,}/', 96 'normalize' => '/\s{2,}/',
97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', 97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', 98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' 99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
100 ); 100 );
101 101
102 /* constants */ 102 /* constants */
103 const FLAG_STRIP_UNLIKELYS = 1; 103 const FLAG_STRIP_UNLIKELYS = 1;
104 const FLAG_WEIGHT_CLASSES = 2; 104 const FLAG_WEIGHT_CLASSES = 2;
105 const FLAG_CLEAN_CONDITIONALLY = 4; 105 const FLAG_CLEAN_CONDITIONALLY = 4;
106 106
107 /** 107 /**
108 * Create instance of Readability 108 * Create instance of Readability
109 * @param string UTF-8 encoded string 109 * @param string UTF-8 encoded string
110 * @param string (optional) URL associated with HTML (used for footnotes) 110 * @param string (optional) URL associated with HTML (used for footnotes)
111 * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') 111 * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
112 */ 112 */
113 function __construct($html, $url=null, $parser='libxml') 113 function __construct($html, $url=null, $parser='libxml')
114 { 114 {
115 $this->url = $url; 115 $this->url = $url;
116 /* Turn all double br's into p's */ 116 /* Turn all double br's into p's */
117 $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); 117 $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
118 $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); 118 $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
119 $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); 119 $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
120 if (trim($html) == '') $html = '<html></html>'; 120 if (trim($html) == '') $html = '<html></html>';
121 if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { 121 if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
122 // all good 122 // all good
123 } else { 123 } else {
124 $this->dom = new DOMDocument(); 124 $this->dom = new DOMDocument();
125 $this->dom->preserveWhiteSpace = false; 125 $this->dom->preserveWhiteSpace = false;
126 @$this->dom->loadHTML($html); 126 @$this->dom->loadHTML($html);
127 } 127 }
128 $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 128 $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
129 } 129 }
130 130
131 /** 131 /**
132 * Get article title element 132 * Get article title element
133 * @return DOMElement 133 * @return DOMElement
134 */ 134 */
135 public function getTitle() { 135 public function getTitle() {
136 return $this->articleTitle; 136 return $this->articleTitle;
137 } 137 }
138 138
139 /** 139 /**
140 * Get article content element 140 * Get article content element
141 * @return DOMElement 141 * @return DOMElement
142 */ 142 */
143 public function getContent() { 143 public function getContent() {
144 return $this->articleContent; 144 return $this->articleContent;
145 } 145 }
146 146
147 /** 147 /**
148 * Runs readability. 148 * Runs readability.
149 * 149 *
150 * Workflow: 150 * Workflow:
151 * 1. Prep the document by removing script tags, css, etc. 151 * 1. Prep the document by removing script tags, css, etc.
152 * 2. Build readability's DOM tree. 152 * 2. Build readability's DOM tree.
153 * 3. Grab the article content from the current dom tree. 153 * 3. Grab the article content from the current dom tree.
154 * 4. Replace the current DOM tree with the new one. 154 * 4. Replace the current DOM tree with the new one.
155 * 5. Read peacefully. 155 * 5. Read peacefully.
156 * 156 *
157 * @return boolean true if we found content, false otherwise 157 * @return boolean true if we found content, false otherwise
158 **/ 158 **/
159 public function init() 159 public function init()
160 { 160 {
161 if (!isset($this->dom->documentElement)) return false; 161 if (!isset($this->dom->documentElement)) return false;
162 $this->removeScripts($this->dom); 162 $this->removeScripts($this->dom);
163 //die($this->getInnerHTML($this->dom->documentElement)); 163 //die($this->getInnerHTML($this->dom->documentElement));
164 164
165 // Assume successful outcome 165 // Assume successful outcome
166 $this->success = true; 166 $this->success = true;
167 167
168 $bodyElems = $this->dom->getElementsByTagName('body'); 168 $bodyElems = $this->dom->getElementsByTagName('body');
169 if ($bodyElems->length > 0) { 169 if ($bodyElems->length > 0) {
170 if ($this->bodyCache == null) { 170 if ($this->bodyCache == null) {
171 $this->bodyCache = $bodyElems->item(0)->innerHTML; 171 $this->bodyCache = $bodyElems->item(0)->innerHTML;
172 } 172 }
173 if ($this->body == null) { 173 if ($this->body == null) {
174 $this->body = $bodyElems->item(0); 174 $this->body = $bodyElems->item(0);
175 } 175 }
176 } 176 }
177 177
178 $this->prepDocument(); 178 $this->prepDocument();
179 179
180 //die($this->dom->documentElement->parentNode->nodeType); 180 //die($this->dom->documentElement->parentNode->nodeType);
181 //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); 181 //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement));
182 //die($this->getInnerHTML($this->dom->documentElement)); 182 //die($this->getInnerHTML($this->dom->documentElement));
183 183
184 /* Build readability's DOM tree */ 184 /* Build readability's DOM tree */
185 $overlay = $this->dom->createElement('div'); 185 $overlay = $this->dom->createElement('div');
186 $innerDiv = $this->dom->createElement('div'); 186 $innerDiv = $this->dom->createElement('div');
187 $articleTitle = $this->getArticleTitle(); 187 $articleTitle = $this->getArticleTitle();
188 $articleContent = $this->grabArticle(); 188 $articleContent = $this->grabArticle();
189 189
190 if (!$articleContent) { 190 if (!$articleContent) {
191 $this->success = false; 191 $this->success = false;
192 $articleContent = $this->dom->createElement('div'); 192 $articleContent = $this->dom->createElement('div');
193 $articleContent->setAttribute('id', 'readability-content'); 193 $articleContent->setAttribute('id', 'readability-content');
194 $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; 194 $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
195 } 195 }
196 196
197 $overlay->setAttribute('id', 'readOverlay'); 197 $overlay->setAttribute('id', 'readOverlay');
198 $innerDiv->setAttribute('id', 'readInner'); 198 $innerDiv->setAttribute('id', 'readInner');
199 199
200 /* Glue the structure of our document together. */ 200 /* Glue the structure of our document together. */
201 $innerDiv->appendChild($articleTitle); 201 $innerDiv->appendChild($articleTitle);
202 $innerDiv->appendChild($articleContent); 202 $innerDiv->appendChild($articleContent);
203 $overlay->appendChild($innerDiv); 203 $overlay->appendChild($innerDiv);
204 204
205 /* Clear the old HTML, insert the new content. */ 205 /* Clear the old HTML, insert the new content. */
206 $this->body->innerHTML = ''; 206 $this->body->innerHTML = '';
207 $this->body->appendChild($overlay); 207 $this->body->appendChild($overlay);
208 //document.body.insertBefore(overlay, document.body.firstChild); 208 //document.body.insertBefore(overlay, document.body.firstChild);
209 $this->body->removeAttribute('style'); 209 $this->body->removeAttribute('style');
210 210
211 $this->postProcessContent($articleContent); 211 $this->postProcessContent($articleContent);
212 212
213 // Set title and content instance variables 213 // Set title and content instance variables
214 $this->articleTitle = $articleTitle; 214 $this->articleTitle = $articleTitle;
215 $this->articleContent = $articleContent; 215 $this->articleContent = $articleContent;
216 216
217 return $this->success; 217 return $this->success;
218 } 218 }
219 219
220 /** 220 /**
221 * Debug 221 * Debug
222 */ 222 */
223 protected function dbg($msg) { 223 protected function dbg($msg) {
224 if ($this->debug) echo '* ',$msg, "\n"; 224 if ($this->debug) echo '* ',$msg, "\n";
225 } 225 }
226 226
227 /** 227 /**
228 * Run any post-process modifications to article content as necessary. 228 * Run any post-process modifications to article content as necessary.
229 * 229 *
230 * @param DOMElement 230 * @param DOMElement
231 * @return void 231 * @return void
232 */ 232 */
233 public function postProcessContent($articleContent) { 233 public function postProcessContent($articleContent) {
234 if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { 234 if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) {
235 $this->addFootnotes($articleContent); 235 $this->addFootnotes($articleContent);
236 } 236 }
237 } 237 }
238 238
239 /** 239 /**
240 * Get the article title as an H1. 240 * Get the article title as an H1.
241 * 241 *
242 * @return DOMElement 242 * @return DOMElement
243 */ 243 */
244 protected function getArticleTitle() { 244 protected function getArticleTitle() {
245 $curTitle = ''; 245 $curTitle = '';
246 $origTitle = ''; 246 $origTitle = '';
247 247
248 try { 248 try {
249 $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); 249 $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
250 } catch(Exception $e) {} 250 } catch(Exception $e) {}
251 251
252 if (preg_match('/ [\|\-] /', $curTitle)) 252 if (preg_match('/ [\|\-] /', $curTitle))
253 { 253 {
254 $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); 254 $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
255 255
256 if (count(explode(' ', $curTitle)) < 3) { 256 if (count(explode(' ', $curTitle)) < 3) {
257 $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); 257 $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
258 } 258 }
259 } 259 }
260 else if (strpos($curTitle, ': ') !== false) 260 else if (strpos($curTitle, ': ') !== false)
261 { 261 {
262 $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); 262 $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
263 263
264 if (count(explode(' ', $curTitle)) < 3) { 264 if (count(explode(' ', $curTitle)) < 3) {
265 $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); 265 $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
266 } 266 }
267 } 267 }
268 else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) 268 else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)
269 { 269 {
270 $hOnes = $this->dom->getElementsByTagName('h1'); 270 $hOnes = $this->dom->getElementsByTagName('h1');
271 if($hOnes->length == 1) 271 if($hOnes->length == 1)
272 { 272 {
273 $curTitle = $this->getInnerText($hOnes->item(0)); 273 $curTitle = $this->getInnerText($hOnes->item(0));
274 } 274 }
275 } 275 }
276 276
277 $curTitle = trim($curTitle); 277 $curTitle = trim($curTitle);
278 278
279 if (count(explode(' ', $curTitle)) <= 4) { 279 if (count(explode(' ', $curTitle)) <= 4) {
280 $curTitle = $origTitle; 280 $curTitle = $origTitle;
281 } 281 }
282 282
283 $articleTitle = $this->dom->createElement('h1'); 283 $articleTitle = $this->dom->createElement('h1');
284 $articleTitle->innerHTML = $curTitle; 284 $articleTitle->innerHTML = $curTitle;
285 285
286 return $articleTitle; 286 return $articleTitle;
287 } 287 }
288 288
289 /** 289 /**
290 * Prepare the HTML document for readability to scrape it. 290 * Prepare the HTML document for readability to scrape it.
291 * This includes things like stripping javascript, CSS, and handling terrible markup. 291 * This includes things like stripping javascript, CSS, and handling terrible markup.
292 * 292 *
293 * @return void 293 * @return void
294 **/ 294 **/
295 protected function prepDocument() { 295 protected function prepDocument() {
296 /** 296 /**
297 * In some cases a body element can't be found (if the HTML is totally hosed for example) 297 * In some cases a body element can't be found (if the HTML is totally hosed for example)
298 * so we create a new body node and append it to the document. 298 * so we create a new body node and append it to the document.
299 */ 299 */
300 if ($this->body == null) 300 if ($this->body == null)
301 { 301 {
302 $this->body = $this->dom->createElement('body'); 302 $this->body = $this->dom->createElement('body');
303 $this->dom->documentElement->appendChild($this->body); 303 $this->dom->documentElement->appendChild($this->body);
304 } 304 }
305 $this->body->setAttribute('id', 'readabilityBody'); 305 $this->body->setAttribute('id', 'readabilityBody');
306 306
307 /* Remove all style tags in head */ 307 /* Remove all style tags in head */
308 $styleTags = $this->dom->getElementsByTagName('style'); 308 $styleTags = $this->dom->getElementsByTagName('style');
309 for ($i = $styleTags->length-1; $i >= 0; $i--) 309 for ($i = $styleTags->length-1; $i >= 0; $i--)
310 { 310 {
311 $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); 311 $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
312 } 312 }
313 313
314 /* Turn all double br's into p's */ 314 /* Turn all double br's into p's */
315 /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ 315 /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
316 //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); 316 //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
317 // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. 317 // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree.
318 // Manipulating innerHTML as it's done in JS is not possible in PHP. 318 // Manipulating innerHTML as it's done in JS is not possible in PHP.
319 } 319 }
320 320
321 /** 321 /**
322 * For easier reading, convert this document to have footnotes at the bottom rather than inline links. 322 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
323 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php 323 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
324 * 324 *
325 * @return void 325 * @return void
326 **/ 326 **/
327 public function addFootnotes($articleContent) { 327 public function addFootnotes($articleContent) {
328 $footnotesWrapper = $this->dom->createElement('div'); 328 $footnotesWrapper = $this->dom->createElement('div');
329 $footnotesWrapper->setAttribute('id', 'readability-footnotes'); 329 $footnotesWrapper->setAttribute('id', 'readability-footnotes');
330 $footnotesWrapper->innerHTML = '<h3>References</h3>'; 330 $footnotesWrapper->innerHTML = '<h3>References</h3>';
331 331
332 $articleFootnotes = $this->dom->createElement('ol'); 332 $articleFootnotes = $this->dom->createElement('ol');
333 $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); 333 $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
334 $footnotesWrapper->appendChild($articleFootnotes); 334 $footnotesWrapper->appendChild($articleFootnotes);
335 335
336 $articleLinks = $articleContent->getElementsByTagName('a'); 336 $articleLinks = $articleContent->getElementsByTagName('a');
337 337
338 $linkCount = 0; 338 $linkCount = 0;
339 for ($i = 0; $i < $articleLinks->length; $i++) 339 for ($i = 0; $i < $articleLinks->length; $i++)
340 { 340 {
341 $articleLink = $articleLinks->item($i); 341 $articleLink = $articleLinks->item($i);
342 $footnoteLink = $articleLink->cloneNode(true); 342 $footnoteLink = $articleLink->cloneNode(true);
343 $refLink = $this->dom->createElement('a'); 343 $refLink = $this->dom->createElement('a');
344 $footnote = $this->dom->createElement('li'); 344 $footnote = $this->dom->createElement('li');
345 $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); 345 $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
346 if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); 346 if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
347 //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, 347 //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host,
348 $linkText = $this->getInnerText($articleLink); 348 $linkText = $this->getInnerText($articleLink);
349 349
350 if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { 350 if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
351 continue; 351 continue;
352 } 352 }
353 353
354 $linkCount++; 354 $linkCount++;
355 355
356 /** Add a superscript reference after the article link */ 356 /** Add a superscript reference after the article link */
357 $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); 357 $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
358 $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; 358 $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
359 $refLink->setAttribute('class', 'readability-DoNotFootnote'); 359 $refLink->setAttribute('class', 'readability-DoNotFootnote');
360 $refLink->setAttribute('style', 'color: inherit;'); 360 $refLink->setAttribute('style', 'color: inherit;');
361 361
362 //TODO: does this work or should we use DOMNode.isSameNode()? 362 //TODO: does this work or should we use DOMNode.isSameNode()?
363 if ($articleLink->parentNode->lastChild == $articleLink) { 363 if ($articleLink->parentNode->lastChild == $articleLink) {
364 $articleLink->parentNode->appendChild($refLink); 364 $articleLink->parentNode->appendChild($refLink);
365 } else { 365 } else {
366 $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); 366 $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
367 } 367 }
368 368
369 $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); 369 $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
370 $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); 370 $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
371 371
372 $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; 372 $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
373 373
374 $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); 374 $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
375 $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); 375 $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
376 376
377 $footnote->appendChild($footnoteLink); 377 $footnote->appendChild($footnoteLink);
378 if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; 378 if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';
379 379
380 $articleFootnotes->appendChild($footnote); 380 $articleFootnotes->appendChild($footnote);
381 } 381 }
382 382
383 if ($linkCount > 0) { 383 if ($linkCount > 0) {
384 $articleContent->appendChild($footnotesWrapper); 384 $articleContent->appendChild($footnotesWrapper);
385 } 385 }
386 } 386 }
387 387
388 /** 388 /**
389 * Reverts P elements with class 'readability-styled' 389 * Reverts P elements with class 'readability-styled'
390 * to text nodes - which is what they were before. 390 * to text nodes - which is what they were before.
391 * 391 *
392 * @param DOMElement 392 * @param DOMElement
393 * @return void 393 * @return void
394 */ 394 */
395 function revertReadabilityStyledElements($articleContent) { 395 function revertReadabilityStyledElements($articleContent) {
396 $xpath = new DOMXPath($articleContent->ownerDocument); 396 $xpath = new DOMXPath($articleContent->ownerDocument);
397 $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); 397 $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent);
398 //$elems = $articleContent->getElementsByTagName('p'); 398 //$elems = $articleContent->getElementsByTagName('p');
399 for ($i = $elems->length-1; $i >= 0; $i--) { 399 for ($i = $elems->length-1; $i >= 0; $i--) {
400 $e = $elems->item($i); 400 $e = $elems->item($i);
401 $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); 401 $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
402 //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') { 402 //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') {
403 // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e); 403 // $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e);
404 //} 404 //}
405 } 405 }
406 } 406 }
407 407
408 /** 408 /**
409 * Prepare the article node for display. Clean out any inline styles, 409 * Prepare the article node for display. Clean out any inline styles,
410 * iframes, forms, strip extraneous <p> tags, etc. 410 * iframes, forms, strip extraneous <p> tags, etc.
411 * 411 *
412 * @param DOMElement 412 * @param DOMElement
413 * @return void 413 * @return void
414 */ 414 */
415 function prepArticle($articleContent) { 415 function prepArticle($articleContent) {
416 $this->cleanStyles($articleContent); 416 $this->cleanStyles($articleContent);
417 $this->killBreaks($articleContent); 417 $this->killBreaks($articleContent);
418 if ($this->revertForcedParagraphElements) { 418 if ($this->revertForcedParagraphElements) {
419 $this->revertReadabilityStyledElements($articleContent); 419 $this->revertReadabilityStyledElements($articleContent);
420 } 420 }
421 421
422 /* Clean out junk from the article content */ 422 /* Clean out junk from the article content */
423 $this->cleanConditionally($articleContent, 'form'); 423 $this->cleanConditionally($articleContent, 'form');
424 $this->clean($articleContent, 'object'); 424 $this->clean($articleContent, 'object');
425 $this->clean($articleContent, 'h1'); 425 $this->clean($articleContent, 'h1');
426 426
427 /** 427 /**
428 * If there is only one h2, they are probably using it 428 * If there is only one h2, they are probably using it
429 * as a header and not a subheader, so remove it since we already have a header. 429 * as a header and not a subheader, so remove it since we already have a header.
430 ***/ 430 ***/
431 if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { 431 if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
432 $this->clean($articleContent, 'h2'); 432 $this->clean($articleContent, 'h2');
433 } 433 }
434 $this->clean($articleContent, 'iframe'); 434 $this->clean($articleContent, 'iframe');
435 435
436 $this->cleanHeaders($articleContent); 436 $this->cleanHeaders($articleContent);
437 437
438 /* Do these last as the previous stuff may have removed junk that will affect these */ 438 /* Do these last as the previous stuff may have removed junk that will affect these */
439 $this->cleanConditionally($articleContent, 'table'); 439 $this->cleanConditionally($articleContent, 'table');
440 $this->cleanConditionally($articleContent, 'ul'); 440 $this->cleanConditionally($articleContent, 'ul');
441 $this->cleanConditionally($articleContent, 'div'); 441 $this->cleanConditionally($articleContent, 'div');
442 442
443 /* Remove extra paragraphs */ 443 /* Remove extra paragraphs */
444 $articleParagraphs = $articleContent->getElementsByTagName('p'); 444 $articleParagraphs = $articleContent->getElementsByTagName('p');
445 for ($i = $articleParagraphs->length-1; $i >= 0; $i--) 445 for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
446 { 446 {
447 $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length; 447 $imgCount = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
448 $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; 448 $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
449 $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; 449 $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
450 $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; 450 $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
451 451
452 if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') 452 if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
453 { 453 {
454 $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); 454 $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
455 } 455 }
456 } 456 }
457 457
458 try { 458 try {
459 $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML); 459 $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
460 //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p'); 460 //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
461 } 461 }
462 catch (Exception $e) { 462 catch (Exception $e) {
463 $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); 463 $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
464 } 464 }
465 } 465 }
466 466
467 /** 467 /**
468 * Initialize a node with the readability object. Also checks the 468 * Initialize a node with the readability object. Also checks the
469 * className/id for special names to add to its score. 469 * className/id for special names to add to its score.
470 * 470 *
471 * @param Element 471 * @param Element
472 * @return void 472 * @return void
473 **/ 473 **/
474 protected function initializeNode($node) { 474 protected function initializeNode($node) {
475 $readability = $this->dom->createAttribute('readability'); 475 $readability = $this->dom->createAttribute('readability');
476 $readability->value = 0; // this is our contentScore 476 $readability->value = 0; // this is our contentScore
477 $node->setAttributeNode($readability); 477 $node->setAttributeNode($readability);
478 478
479 switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case 479 switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case
480 case 'DIV': 480 case 'DIV':
481 $readability->value += 5; 481 $readability->value += 5;
482 break; 482 break;
483 483
484 case 'PRE': 484 case 'PRE':
485 case 'TD': 485 case 'TD':
486 case 'BLOCKQUOTE': 486 case 'BLOCKQUOTE':
487 $readability->value += 3; 487 $readability->value += 3;
488 break; 488 break;
489 489
490 case 'ADDRESS': 490 case 'ADDRESS':
491 case 'OL': 491 case 'OL':
492 case 'UL': 492 case 'UL':
493 case 'DL': 493 case 'DL':
494 case 'DD': 494 case 'DD':
495 case 'DT': 495 case 'DT':
496 case 'LI': 496 case 'LI':
497 case 'FORM': 497 case 'FORM':
498 $readability->value -= 3; 498 $readability->value -= 3;
499 break; 499 break;
500 500
501 case 'H1': 501 case 'H1':
502 case 'H2': 502 case 'H2':
503 case 'H3': 503 case 'H3':
504 case 'H4': 504 case 'H4':
505 case 'H5': 505 case 'H5':
506 case 'H6': 506 case 'H6':
507 case 'TH': 507 case 'TH':
508 $readability->value -= 5; 508 $readability->value -= 5;
509 break; 509 break;
510 } 510 }
511 $readability->value += $this->getClassWeight($node); 511 $readability->value += $this->getClassWeight($node);
512 } 512 }
513 513
514 /*** 514 /***
515 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 515 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
516 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 516 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
517 * 517 *
518 * @return DOMElement 518 * @return DOMElement
519 **/ 519 **/
520 protected function grabArticle($page=null) { 520 protected function grabArticle($page=null) {
521 $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); 521 $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
522 if (!$page) $page = $this->dom; 522 if (!$page) $page = $this->dom;
523 $allElements = $page->getElementsByTagName('*'); 523 $allElements = $page->getElementsByTagName('*');
524 /** 524 /**
525 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs 525 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
526 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) 526 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
527 * 527 *
528 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 528 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
529 * TODO: Shouldn't this be a reverse traversal? 529 * TODO: Shouldn't this be a reverse traversal?
530 **/ 530 **/
531 $node = null; 531 $node = null;
532 $nodesToScore = array(); 532 $nodesToScore = array();
533 for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { 533 for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
534 //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { 534 //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
535 //$node = $targetList->item($nodeIndex); 535 //$node = $targetList->item($nodeIndex);
536 $tagName = strtoupper($node->tagName); 536 $tagName = strtoupper($node->tagName);
537 /* Remove unlikely candidates */ 537 /* Remove unlikely candidates */
538 if ($stripUnlikelyCandidates) { 538 if ($stripUnlikelyCandidates) {
539 $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); 539 $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
540 if ( 540 if (
541 preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && 541 preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
542 !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && 542 !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
543 $tagName != 'BODY' 543 $tagName != 'BODY'
544 ) 544 )
545 { 545 {
546 $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); 546 $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
547 //$nodesToRemove[] = $node; 547 //$nodesToRemove[] = $node;
548 $node->parentNode->removeChild($node); 548 $node->parentNode->removeChild($node);
549 $nodeIndex--; 549 $nodeIndex--;
550 continue; 550 continue;
551 } 551 }
552 } 552 }
553 553
554 if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { 554 if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
555 $nodesToScore[] = $node; 555 $nodesToScore[] = $node;
556 } 556 }
557 557
558 /* Turn all divs that don't have children block level elements into p's */ 558 /* Turn all divs that don't have children block level elements into p's */
559 if ($tagName == 'DIV') { 559 if ($tagName == 'DIV') {
560 if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { 560 if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
561 //$this->dbg('Altering div to p'); 561 //$this->dbg('Altering div to p');
562 $newNode = $this->dom->createElement('p'); 562 $newNode = $this->dom->createElement('p');
563 try { 563 try {
564 $newNode->innerHTML = $node->innerHTML; 564 $newNode->innerHTML = $node->innerHTML;
565 //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); 565 //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
566 $node->parentNode->replaceChild($newNode, $node); 566 $node->parentNode->replaceChild($newNode, $node);
567 $nodeIndex--; 567 $nodeIndex--;
568 $nodesToScore[] = $node; // or $newNode? 568 $nodesToScore[] = $node; // or $newNode?
569 } 569 }
570 catch(Exception $e) { 570 catch(Exception $e) {
571 $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); 571 $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
572 } 572 }
573 } 573 }
574 else 574 else
575 { 575 {
576 /* EXPERIMENTAL */ 576 /* EXPERIMENTAL */
577 // TODO: change these p elements back to text nodes after processing 577 // TODO: change these p elements back to text nodes after processing
578 for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { 578 for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
579 $childNode = $node->childNodes->item($i); 579 $childNode = $node->childNodes->item($i);
580 if ($childNode->nodeType == 3) { // XML_TEXT_NODE 580 if ($childNode->nodeType == 3) { // XML_TEXT_NODE
581 //$this->dbg('replacing text node with a p tag with the same content.'); 581 //$this->dbg('replacing text node with a p tag with the same content.');
582 $p = $this->dom->createElement('p'); 582 $p = $this->dom->createElement('p');
583 $p->innerHTML = $childNode->nodeValue; 583 $p->innerHTML = $childNode->nodeValue;
584 $p->setAttribute('style', 'display: inline;'); 584 $p->setAttribute('style', 'display: inline;');
585 $p->setAttribute('class', 'readability-styled'); 585 $p->setAttribute('class', 'readability-styled');
586 $childNode->parentNode->replaceChild($p, $childNode); 586 $childNode->parentNode->replaceChild($p, $childNode);
587 } 587 }
588 } 588 }
589 } 589 }
590 } 590 }
591 } 591 }
592 592
593 /** 593 /**
594 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 594 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
595 * Then add their score to their parent node. 595 * Then add their score to their parent node.
596 * 596 *
597 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 597 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
598 **/ 598 **/
599 $candidates = array(); 599 $candidates = array();
600 for ($pt=0; $pt < count($nodesToScore); $pt++) { 600 for ($pt=0; $pt < count($nodesToScore); $pt++) {
601 $parentNode = $nodesToScore[$pt]->parentNode; 601 $parentNode = $nodesToScore[$pt]->parentNode;
602 // $grandParentNode = $parentNode ? $parentNode->parentNode : null; 602 // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
603 $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); 603 $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
604 $innerText = $this->getInnerText($nodesToScore[$pt]); 604 $innerText = $this->getInnerText($nodesToScore[$pt]);
605 605
606 if (!$parentNode || !isset($parentNode->tagName)) { 606 if (!$parentNode || !isset($parentNode->tagName)) {
607 continue; 607 continue;
608 } 608 }
609 609
610 /* If this paragraph is less than 25 characters, don't even count it. */ 610 /* If this paragraph is less than 25 characters, don't even count it. */
611 if(strlen($innerText) < 25) { 611 if(strlen($innerText) < 25) {
612 continue; 612 continue;
613 } 613 }
614 614
615 /* Initialize readability data for the parent. */ 615 /* Initialize readability data for the parent. */
616 if (!$parentNode->hasAttribute('readability')) 616 if (!$parentNode->hasAttribute('readability'))
617 { 617 {
618 $this->initializeNode($parentNode); 618 $this->initializeNode($parentNode);
619 $candidates[] = $parentNode; 619 $candidates[] = $parentNode;
620 } 620 }
621 621
622 /* Initialize readability data for the grandparent. */ 622 /* Initialize readability data for the grandparent. */
623 if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) 623 if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
624 { 624 {
625 $this->initializeNode($grandParentNode); 625 $this->initializeNode($grandParentNode);
626 $candidates[] = $grandParentNode; 626 $candidates[] = $grandParentNode;
627 } 627 }
628 628
629 $contentScore = 0; 629 $contentScore = 0;
630 630
631 /* Add a point for the paragraph itself as a base. */ 631 /* Add a point for the paragraph itself as a base. */
632 $contentScore++; 632 $contentScore++;
633 633
634 /* Add points for any commas within this paragraph */ 634 /* Add points for any commas within this paragraph */
635 $contentScore += count(explode(',', $innerText)); 635 $contentScore += count(explode(',', $innerText));
636 636
637 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ 637 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
638 $contentScore += min(floor(strlen($innerText) / 100), 3); 638 $contentScore += min(floor(strlen($innerText) / 100), 3);
639 639
640 /* Add the score to the parent. The grandparent gets half. */ 640 /* Add the score to the parent. The grandparent gets half. */
641 $parentNode->getAttributeNode('readability')->value += $contentScore; 641 $parentNode->getAttributeNode('readability')->value += $contentScore;
642 642
643 if ($grandParentNode) { 643 if ($grandParentNode) {
644 $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; 644 $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
645 } 645 }
646 } 646 }
647 647
648 /** 648 /**
649 * After we've calculated scores, loop through all of the possible candidate nodes we found 649 * After we've calculated scores, loop through all of the possible candidate nodes we found
650 * and find the one with the highest score. 650 * and find the one with the highest score.
651 **/ 651 **/
652 $topCandidate = null; 652 $topCandidate = null;
653 for ($c=0, $cl=count($candidates); $c < $cl; $c++) 653 for ($c=0, $cl=count($candidates); $c < $cl; $c++)
654 { 654 {
655 /** 655 /**
656 * Scale the final candidates score based on link density. Good content should have a 656 * Scale the final candidates score based on link density. Good content should have a
657 * relatively small link density (5% or less) and be mostly unaffected by this operation. 657 * relatively small link density (5% or less) and be mostly unaffected by this operation.
658 **/ 658 **/
659 $readability = $candidates[$c]->getAttributeNode('readability'); 659 $readability = $candidates[$c]->getAttributeNode('readability');
660 $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); 660 $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
661 661
662 $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); 662 $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
663 663
664 if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { 664 if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
665 $topCandidate = $candidates[$c]; 665 $topCandidate = $candidates[$c];
666 } 666 }
667 } 667 }
668 668
669 /** 669 /**
670 * If we still have no top candidate, just use the body as a last resort. 670 * If we still have no top candidate, just use the body as a last resort.
671 * We also have to copy the body node so it is something we can modify. 671 * We also have to copy the body node so it is something we can modify.
672 **/ 672 **/
673 if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') 673 if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
674 { 674 {
675 $topCandidate = $this->dom->createElement('div'); 675 $topCandidate = $this->dom->createElement('div');
676 if ($page instanceof DOMDocument) { 676 if ($page instanceof DOMDocument) {
677 if (!isset($page->documentElement)) { 677 if (!isset($page->documentElement)) {
678 // we don't have a body either? what a mess! :) 678 // we don't have a body either? what a mess! :)
679 } else { 679 } else {
680 $topCandidate->innerHTML = $page->documentElement->innerHTML; 680 $topCandidate->innerHTML = $page->documentElement->innerHTML;
681 $page->documentElement->innerHTML = ''; 681 $page->documentElement->innerHTML = '';
682 $page->documentElement->appendChild($topCandidate); 682 $page->documentElement->appendChild($topCandidate);
683 } 683 }
684 } else { 684 } else {
685 $topCandidate->innerHTML = $page->innerHTML; 685 $topCandidate->innerHTML = $page->innerHTML;
686 $page->innerHTML = ''; 686 $page->innerHTML = '';
687 $page->appendChild($topCandidate); 687 $page->appendChild($topCandidate);
688 } 688 }
689 $this->initializeNode($topCandidate); 689 $this->initializeNode($topCandidate);
690 } 690 }
691 691
692 /** 692 /**
693 * Now that we have the top candidate, look through its siblings for content that might also be related. 693 * Now that we have the top candidate, look through its siblings for content that might also be related.
694 * Things like preambles, content split by ads that we removed, etc. 694 * Things like preambles, content split by ads that we removed, etc.
695 **/ 695 **/
696 $articleContent = $this->dom->createElement('div'); 696 $articleContent = $this->dom->createElement('div');
697 $articleContent->setAttribute('id', 'readability-content'); 697 $articleContent->setAttribute('id', 'readability-content');
698 $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); 698 $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
699 $siblingNodes = $topCandidate->parentNode->childNodes; 699 $siblingNodes = $topCandidate->parentNode->childNodes;
700 if (!isset($siblingNodes)) { 700 if (!isset($siblingNodes)) {
701 $siblingNodes = new stdClass; 701 $siblingNodes = new stdClass;
702 $siblingNodes->length = 0; 702 $siblingNodes->length = 0;
703 } 703 }
704 704
705 for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) 705 for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
706 { 706 {
707 $siblingNode = $siblingNodes->item($s); 707 $siblingNode = $siblingNodes->item($s);
708 $append = false; 708 $append = false;
709 709
710 $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); 710 $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
711 711
712 //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); 712 //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
713 713
714 if ($siblingNode === $topCandidate) 714 if ($siblingNode === $topCandidate)
715 // or if ($siblingNode->isSameNode($topCandidate)) 715 // or if ($siblingNode->isSameNode($topCandidate))
716 { 716 {
717 $append = true; 717 $append = true;
718 } 718 }
719 719
720 $contentBonus = 0; 720 $contentBonus = 0;
721 /* Give a bonus if sibling nodes and top candidates have the example same classname */ 721 /* Give a bonus if sibling nodes and top candidates have the example same classname */
722 if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { 722 if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
723 $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; 723 $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
724 } 724 }
725 725
726 if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) 726 if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
727 { 727 {
728 $append = true; 728 $append = true;
729 } 729 }
730 730
731 if (strtoupper($siblingNode->nodeName) == 'P') { 731 if (strtoupper($siblingNode->nodeName) == 'P') {
732 $linkDensity = $this->getLinkDensity($siblingNode); 732 $linkDensity = $this->getLinkDensity($siblingNode);
733 $nodeContent = $this->getInnerText($siblingNode); 733 $nodeContent = $this->getInnerText($siblingNode);
734 $nodeLength = strlen($nodeContent); 734 $nodeLength = strlen($nodeContent);
735 735
736 if ($nodeLength > 80 && $linkDensity < 0.25) 736 if ($nodeLength > 80 && $linkDensity < 0.25)
737 { 737 {
738 $append = true; 738 $append = true;
739 } 739 }
740 else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) 740 else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
741 { 741 {
742 $append = true; 742 $append = true;
743 } 743 }
744 } 744 }
745 745
746 if ($append) 746 if ($append)
747 { 747 {
748 $this->dbg('Appending node: ' . $siblingNode->nodeName); 748 $this->dbg('Appending node: ' . $siblingNode->nodeName);
749 749
750 $nodeToAppend = null; 750 $nodeToAppend = null;
751 $sibNodeName = strtoupper($siblingNode->nodeName); 751 $sibNodeName = strtoupper($siblingNode->nodeName);
752 if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { 752 if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
753 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ 753 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
754 754
755 $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); 755 $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
756 $nodeToAppend = $this->dom->createElement('div'); 756 $nodeToAppend = $this->dom->createElement('div');
757 try { 757 try {
758 $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); 758 $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
759 $nodeToAppend->innerHTML = $siblingNode->innerHTML; 759 $nodeToAppend->innerHTML = $siblingNode->innerHTML;
760 } 760 }
761 catch(Exception $e) 761 catch(Exception $e)
762 { 762 {
763 $this->dbg('Could not alter siblingNode to div, reverting back to original.'); 763 $this->dbg('Could not alter siblingNode to div, reverting back to original.');
764 $nodeToAppend = $siblingNode; 764 $nodeToAppend = $siblingNode;
765 $s--; 765 $s--;
766 $sl--; 766 $sl--;
767 } 767 }
768 } else { 768 } else {
769 $nodeToAppend = $siblingNode; 769 $nodeToAppend = $siblingNode;
770 $s--; 770 $s--;
771 $sl--; 771 $sl--;
772 } 772 }
773 773
774 /* To ensure a node does not interfere with readability styles, remove its classnames */ 774 /* To ensure a node does not interfere with readability styles, remove its classnames */
775 $nodeToAppend->removeAttribute('class'); 775 $nodeToAppend->removeAttribute('class');
776 776
777 /* Append sibling and subtract from our list because it removes the node when you append to another node */ 777 /* Append sibling and subtract from our list because it removes the node when you append to another node */
778 $articleContent->appendChild($nodeToAppend); 778 $articleContent->appendChild($nodeToAppend);
779 } 779 }
780 } 780 }
781 781
782 /** 782 /**
783 * So we have all of the content that we need. Now we clean it up for presentation. 783 * So we have all of the content that we need. Now we clean it up for presentation.
784 **/ 784 **/
785 $this->prepArticle($articleContent); 785 $this->prepArticle($articleContent);
786 786
787 /** 787 /**
788 * Now that we've gone through the full algorithm, check to see if we got any meaningful content. 788 * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
789 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 789 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
790 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of 790 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
791 * finding the -right- content. 791 * finding the -right- content.
792 **/ 792 **/
793 if (strlen($this->getInnerText($articleContent, false)) < 250) 793 if (strlen($this->getInnerText($articleContent, false)) < 250)
794 { 794 {
795 // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 795 // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
796 // in the meantime, we check and create an empty element if it's not there. 796 // in the meantime, we check and create an empty element if it's not there.
797 if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); 797 if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
798 $this->body->innerHTML = $this->bodyCache; 798 $this->body->innerHTML = $this->bodyCache;
799 799
800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { 800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); 801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
802 return $this->grabArticle($this->body); 802 return $this->grabArticle($this->body);
803 } 803 }
804 else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { 804 else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
805 $this->removeFlag(self::FLAG_WEIGHT_CLASSES); 805 $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
806 return $this->grabArticle($this->body); 806 return $this->grabArticle($this->body);
807 } 807 }
808 else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { 808 else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
809 $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); 809 $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
810 return $this->grabArticle($this->body); 810 return $this->grabArticle($this->body);
811 } 811 }
812 else { 812 else {
813 return false; 813 return false;
814 } 814 }
815 } 815 }
816 return $articleContent; 816 return $articleContent;
817 } 817 }
818 818
819 /** 819 /**
820 * Remove script tags from document 820 * Remove script tags from document
821 * 821 *
822 * @param DOMElement 822 * @param DOMElement
823 * @return void 823 * @return void
824 */ 824 */
825 public function removeScripts($doc) { 825 public function removeScripts($doc) {
826 $scripts = $doc->getElementsByTagName('script'); 826 $scripts = $doc->getElementsByTagName('script');
827 for($i = $scripts->length-1; $i >= 0; $i--) 827 for($i = $scripts->length-1; $i >= 0; $i--)
828 { 828 {
829 $scripts->item($i)->parentNode->removeChild($scripts->item($i)); 829 $scripts->item($i)->parentNode->removeChild($scripts->item($i));
830 } 830 }
831 } 831 }
832 832
833 /** 833 /**
834 * Get the inner text of a node. 834 * Get the inner text of a node.
835 * This also strips out any excess whitespace to be found. 835 * This also strips out any excess whitespace to be found.
836 * 836 *
837 * @param DOMElement $ 837 * @param DOMElement $
838 * @param boolean $normalizeSpaces (default: true) 838 * @param boolean $normalizeSpaces (default: true)
839 * @return string 839 * @return string
840 **/ 840 **/
841 public function getInnerText($e, $normalizeSpaces=true) { 841 public function getInnerText($e, $normalizeSpaces=true) {
842 $textContent = ''; 842 $textContent = '';
843 843
844 if (!isset($e->textContent) || $e->textContent == '') { 844 if (!isset($e->textContent) || $e->textContent == '') {
845 return ''; 845 return '';
846 } 846 }
847 847
848 $textContent = trim($e->textContent); 848 $textContent = trim($e->textContent);
849 849
850 if ($normalizeSpaces) { 850 if ($normalizeSpaces) {
851 return preg_replace($this->regexps['normalize'], ' ', $textContent); 851 return preg_replace($this->regexps['normalize'], ' ', $textContent);
852 } else { 852 } else {
853 return $textContent; 853 return $textContent;
854 } 854 }
855 } 855 }
856 856
857 /** 857 /**
858 * Get the number of times a string $s appears in the node $e. 858 * Get the number of times a string $s appears in the node $e.
859 * 859 *
860 * @param DOMElement $e 860 * @param DOMElement $e
861 * @param string - what to count. Default is "," 861 * @param string - what to count. Default is ","
862 * @return number (integer) 862 * @return number (integer)
863 **/ 863 **/
864 public function getCharCount($e, $s=',') { 864 public function getCharCount($e, $s=',') {
865 return substr_count($this->getInnerText($e), $s); 865 return substr_count($this->getInnerText($e), $s);
866 } 866 }
867 867
868 /** 868 /**
869 * Remove the style attribute on every $e and under. 869 * Remove the style attribute on every $e and under.
870 * 870 *
871 * @param DOMElement $e 871 * @param DOMElement $e
872 * @return void 872 * @return void
873 */ 873 */
874 public function cleanStyles($e) { 874 public function cleanStyles($e) {
875 if (!is_object($e)) return; 875 if (!is_object($e)) return;
876 $elems = $e->getElementsByTagName('*'); 876 $elems = $e->getElementsByTagName('*');
877 foreach ($elems as $elem) { 877 foreach ($elems as $elem) {
878 $elem->removeAttribute('style'); 878 $elem->removeAttribute('style');
879 } 879 }
880 } 880 }
881 881
882 /** 882 /**
883 * Get the density of links as a percentage of the content 883 * Get the density of links as a percentage of the content
884 * This is the amount of text that is inside a link divided by the total text in the node. 884 * This is the amount of text that is inside a link divided by the total text in the node.
885 * 885 *
886 * @param DOMElement $e 886 * @param DOMElement $e
887 * @return number (float) 887 * @return number (float)
888 */ 888 */
889 public function getLinkDensity($e) { 889 public function getLinkDensity($e) {
890 $links = $e->getElementsByTagName('a'); 890 $links = $e->getElementsByTagName('a');
891 $textLength = strlen($this->getInnerText($e)); 891 $textLength = strlen($this->getInnerText($e));
892 $linkLength = 0; 892 $linkLength = 0;
893 for ($i=0, $il=$links->length; $i < $il; $i++) 893 for ($i=0, $il=$links->length; $i < $il; $i++)
894 { 894 {
895 $linkLength += strlen($this->getInnerText($links->item($i))); 895 $linkLength += strlen($this->getInnerText($links->item($i)));
896 } 896 }
897 if ($textLength > 0) { 897 if ($textLength > 0) {
898 return $linkLength / $textLength; 898 return $linkLength / $textLength;
899 } else { 899 } else {
900 return 0; 900 return 0;
901 } 901 }
902 } 902 }
903 903
904 /** 904 /**
905 * Get an elements class/id weight. Uses regular expressions to tell if this 905 * Get an elements class/id weight. Uses regular expressions to tell if this
906 * element looks good or bad. 906 * element looks good or bad.
907 * 907 *
908 * @param DOMElement $e 908 * @param DOMElement $e
909 * @return number (Integer) 909 * @return number (Integer)
910 */ 910 */
911 public function getClassWeight($e) { 911 public function getClassWeight($e) {
912 if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { 912 if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
913 return 0; 913 return 0;
914 } 914 }
915 915
916 $weight = 0; 916 $weight = 0;
917 917
918 /* Look for a special classname */ 918 /* Look for a special classname */
919 if ($e->hasAttribute('class') && $e->getAttribute('class') != '') 919 if ($e->hasAttribute('class') && $e->getAttribute('class') != '')
920 { 920 {
921 if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { 921 if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) {
922 $weight -= 25; 922 $weight -= 25;
923 } 923 }
924 if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { 924 if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) {
925 $weight += 25; 925 $weight += 25;
926 } 926 }
927 } 927 }
928 928
929 /* Look for a special ID */ 929 /* Look for a special ID */
930 if ($e->hasAttribute('id') && $e->getAttribute('id') != '') 930 if ($e->hasAttribute('id') && $e->getAttribute('id') != '')
931 { 931 {
932 if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { 932 if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) {
933 $weight -= 25; 933 $weight -= 25;
934 } 934 }
935 if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { 935 if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) {
936 $weight += 25; 936 $weight += 25;
937 } 937 }
938 } 938 }
939 return $weight; 939 return $weight;
940 } 940 }
941 941
942 /** 942 /**
943 * Remove extraneous break tags from a node. 943 * Remove extraneous break tags from a node.
944 * 944 *
945 * @param DOMElement $node 945 * @param DOMElement $node
946 * @return void 946 * @return void
947 */ 947 */
948 public function killBreaks($node) { 948 public function killBreaks($node) {
949 $html = $node->innerHTML; 949 $html = $node->innerHTML;
950 $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); 950 $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
951 $node->innerHTML = $html; 951 $node->innerHTML = $html;
952 } 952 }
953 953
954 /** 954 /**
955 * Clean a node of all elements of type "tag". 955 * Clean a node of all elements of type "tag".
956 * (Unless it's a youtube/vimeo video. People love movies.) 956 * (Unless it's a youtube/vimeo video. People love movies.)
957 * 957 *
958 * Updated 2012-09-18 to preserve youtube/vimeo iframes 958 * Updated 2012-09-18 to preserve youtube/vimeo iframes
959 * 959 *
960 * @param DOMElement $e 960 * @param DOMElement $e
961 * @param string $tag 961 * @param string $tag
962 * @return void 962 * @return void
963 */ 963 */
964 public function clean($e, $tag) { 964 public function clean($e, $tag) {
965 $targetList = $e->getElementsByTagName($tag); 965 $targetList = $e->getElementsByTagName($tag);
966 $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); 966 $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
967 967
968 for ($y=$targetList->length-1; $y >= 0; $y--) { 968 for ($y=$targetList->length-1; $y >= 0; $y--) {
969 /* Allow youtube and vimeo videos through as people usually want to see those. */ 969 /* Allow youtube and vimeo videos through as people usually want to see those. */
970 if ($isEmbed) { 970 if ($isEmbed) {
971 $attributeValues = ''; 971 $attributeValues = '';
972 for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { 972 for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) {
973 $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) 973 $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test)
974 } 974 }
975 975
976 /* First, check the elements attributes to see if any of them contain youtube or vimeo */ 976 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
977 if (preg_match($this->regexps['video'], $attributeValues)) { 977 if (preg_match($this->regexps['video'], $attributeValues)) {
978 continue; 978 continue;
979 } 979 }
980 980
981 /* Then check the elements inside this element for the same. */ 981 /* Then check the elements inside this element for the same. */
982 if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { 982 if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) {
983 continue; 983 continue;
984 } 984 }
985 } 985 }
986 $targetList->item($y)->parentNode->removeChild($targetList->item($y)); 986 $targetList->item($y)->parentNode->removeChild($targetList->item($y));
987 } 987 }
988 } 988 }
989 989
990 /** 990 /**
991 * Clean an element of all tags of type "tag" if they look fishy. 991 * Clean an element of all tags of type "tag" if they look fishy.
992 * "Fishy" is an algorithm based on content length, classnames, 992 * "Fishy" is an algorithm based on content length, classnames,
993 * link density, number of images & embeds, etc. 993 * link density, number of images & embeds, etc.
994 * 994 *
995 * @param DOMElement $e 995 * @param DOMElement $e
996 * @param string $tag 996 * @param string $tag
997 * @return void 997 * @return void
998 */ 998 */
999 public function cleanConditionally($e, $tag) { 999 public function cleanConditionally($e, $tag) {
1000 if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { 1000 if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
1001 return; 1001 return;
1002 } 1002 }
1003 1003
1004 $tagsList = $e->getElementsByTagName($tag); 1004 $tagsList = $e->getElementsByTagName($tag);
1005 $curTagsLength = $tagsList->length; 1005 $curTagsLength = $tagsList->length;
1006 1006
1007 /** 1007 /**
1008 * Gather counts for other typical elements embedded within. 1008 * Gather counts for other typical elements embedded within.
1009 * Traverse backwards so we can remove nodes at the same time without effecting the traversal. 1009 * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
1010 * 1010 *
1011 * TODO: Consider taking into account original contentScore here. 1011 * TODO: Consider taking into account original contentScore here.
1012 */ 1012 */
1013 for ($i=$curTagsLength-1; $i >= 0; $i--) { 1013 for ($i=$curTagsLength-1; $i >= 0; $i--) {
1014 $weight = $this->getClassWeight($tagsList->item($i)); 1014 $weight = $this->getClassWeight($tagsList->item($i));
1015 $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; 1015 $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0;
1016 1016
1017 $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); 1017 $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : ''));
1018 1018
1019 if ($weight + $contentScore < 0) { 1019 if ($weight + $contentScore < 0) {
1020 $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); 1020 $tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
1021 } 1021 }
1022 else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { 1022 else if ( $this->getCharCount($tagsList->item($i), ',') < 10) {
1023 /** 1023 /**
1024 * If there are not very many commas, and the number of 1024 * If there are not very many commas, and the number of
1025 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. 1025 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
1026 **/ 1026 **/
1027 $p = $tagsList->item($i)->getElementsByTagName('p')->length; 1027 $p = $tagsList->item($i)->getElementsByTagName('p')->length;
1028 $img = $tagsList->item($i)->getElementsByTagName('img')->length; 1028 $img = $tagsList->item($i)->getElementsByTagName('img')->length;
1029 $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; 1029 $li = $tagsList->item($i)->getElementsByTagName('li')->length-100;
1030 $input = $tagsList->item($i)->getElementsByTagName('input')->length; 1030 $input = $tagsList->item($i)->getElementsByTagName('input')->length;
1031 $a = $tagsList->item($i)->getElementsByTagName('a')->length; 1031 $a = $tagsList->item($i)->getElementsByTagName('a')->length;
1032 1032
1033 $embedCount = 0; 1033 $embedCount = 0;
1034 $embeds = $tagsList->item($i)->getElementsByTagName('embed'); 1034 $embeds = $tagsList->item($i)->getElementsByTagName('embed');
1035 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { 1035 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
1036 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { 1036 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
1037 $embedCount++; 1037 $embedCount++;
1038 } 1038 }
1039 } 1039 }
1040 $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); 1040 $embeds = $tagsList->item($i)->getElementsByTagName('iframe');
1041 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { 1041 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
1042 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { 1042 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
1043 $embedCount++; 1043 $embedCount++;
1044 } 1044 }
1045 } 1045 }
1046 1046
1047 $linkDensity = $this->getLinkDensity($tagsList->item($i)); 1047 $linkDensity = $this->getLinkDensity($tagsList->item($i));
1048 $contentLength = strlen($this->getInnerText($tagsList->item($i))); 1048 $contentLength = strlen($this->getInnerText($tagsList->item($i)));
1049 $toRemove = false; 1049 $toRemove = false;
1050 1050
1051 if ($this->lightClean) { 1051 if ($this->lightClean) {
1052 $this->dbg('Light clean...'); 1052 $this->dbg('Light clean...');
1053 if ( ($img > $p) && ($img > 4) ) { 1053 if ( ($img > $p) && ($img > 4) ) {
1054 $this->dbg(' more than 4 images and more image elements than paragraph elements'); 1054 $this->dbg(' more than 4 images and more image elements than paragraph elements');
1055 $toRemove = true; 1055 $toRemove = true;
1056 } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { 1056 } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
1057 $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); 1057 $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
1058 $toRemove = true; 1058 $toRemove = true;
1059 } else if ( $input > floor($p/3) ) { 1059 } else if ( $input > floor($p/3) ) {
1060 $this->dbg(' too many <input> elements'); 1060 $this->dbg(' too many <input> elements');
1061 $toRemove = true; 1061 $toRemove = true;
1062 } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { 1062 } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
1063 $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); 1063 $this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images');
1064 $toRemove = true; 1064 $toRemove = true;
1065 } else if($weight < 25 && $linkDensity > 0.2) { 1065 } else if($weight < 25 && $linkDensity > 0.2) {
1066 $this->dbg(' weight smaller than 25 and link density above 0.2'); 1066 $this->dbg(' weight smaller than 25 and link density above 0.2');
1067 $toRemove = true; 1067 $toRemove = true;
1068 } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { 1068 } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
1069 $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); 1069 $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');
1070 $toRemove = true; 1070 $toRemove = true;
1071 } else if($embedCount > 3) { 1071 } else if($embedCount > 3) {
1072 $this->dbg(' more than 3 embeds'); 1072 $this->dbg(' more than 3 embeds');
1073 $toRemove = true; 1073 $toRemove = true;
1074 } 1074 }
1075 } else { 1075 } else {
1076 $this->dbg('Standard clean...'); 1076 $this->dbg('Standard clean...');
1077 if ( $img > $p ) { 1077 if ( $img > $p ) {
1078 $this->dbg(' more image elements than paragraph elements'); 1078 $this->dbg(' more image elements than paragraph elements');
1079 $toRemove = true; 1079 $toRemove = true;
1080 } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { 1080 } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
1081 $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); 1081 $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
1082 $toRemove = true; 1082 $toRemove = true;
1083 } else if ( $input > floor($p/3) ) { 1083 } else if ( $input > floor($p/3) ) {
1084 $this->dbg(' too many <input> elements'); 1084 $this->dbg(' too many <input> elements');
1085 $toRemove = true; 1085 $toRemove = true;
1086 } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { 1086 } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
1087 $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); 1087 $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
1088 $toRemove = true; 1088 $toRemove = true;
1089 } else if($weight < 25 && $linkDensity > 0.2) { 1089 } else if($weight < 25 && $linkDensity > 0.2) {
1090 $this->dbg(' weight smaller than 25 and link density above 0.2'); 1090 $this->dbg(' weight smaller than 25 and link density above 0.2');
1091 $toRemove = true; 1091 $toRemove = true;
1092 } else if($weight >= 25 && $linkDensity > 0.5) { 1092 } else if($weight >= 25 && $linkDensity > 0.5) {
1093 $this->dbg(' weight above 25 but link density greater than 0.5'); 1093 $this->dbg(' weight above 25 but link density greater than 0.5');
1094 $toRemove = true; 1094 $toRemove = true;
1095 } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { 1095 } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
1096 $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); 1096 $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
1097 $toRemove = true; 1097 $toRemove = true;
1098 } 1098 }
1099 } 1099 }
1100 1100
1101 if ($toRemove) { 1101 if ($toRemove) {
1102 //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); 1102 //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML);
1103 $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); 1103 $tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
1104 } 1104 }
1105 } 1105 }
1106 } 1106 }
1107 } 1107 }
1108 1108
1109 /** 1109 /**
1110 * Clean out spurious headers from an Element. Checks things like classnames and link density. 1110 * Clean out spurious headers from an Element. Checks things like classnames and link density.
1111 * 1111 *
1112 * @param DOMElement $e 1112 * @param DOMElement $e
1113 * @return void 1113 * @return void
1114 */ 1114 */
1115 public function cleanHeaders($e) { 1115 public function cleanHeaders($e) {
1116 for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { 1116 for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
1117 $headers = $e->getElementsByTagName('h' . $headerIndex); 1117 $headers = $e->getElementsByTagName('h' . $headerIndex);
1118 for ($i=$headers->length-1; $i >=0; $i--) { 1118 for ($i=$headers->length-1; $i >=0; $i--) {
1119 if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { 1119 if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
1120 $headers->item($i)->parentNode->removeChild($headers->item($i)); 1120 $headers->item($i)->parentNode->removeChild($headers->item($i));
1121 } 1121 }
1122 } 1122 }
1123 } 1123 }
1124 } 1124 }
1125 1125
1126 public function flagIsActive($flag) { 1126 public function flagIsActive($flag) {
1127 return ($this->flags & $flag) > 0; 1127 return ($this->flags & $flag) > 0;
1128 } 1128 }
1129 1129
1130 public function addFlag($flag) { 1130 public function addFlag($flag) {
1131 $this->flags = $this->flags | $flag; 1131 $this->flags = $this->flags | $flag;
1132 } 1132 }
1133 1133
1134 public function removeFlag($flag) { 1134 public function removeFlag($flag) {
1135 $this->flags = $this->flags & ~$flag; 1135 $this->flags = $this->flags & ~$flag;
1136 } 1136 }
1137} 1137}
1138?> \ No newline at end of file 1138?> \ No newline at end of file
diff --git a/inc/3rdparty/makefulltextfeed.php b/inc/3rdparty/makefulltextfeed.php
index 4faad6d9..62c050ec 100755
--- a/inc/3rdparty/makefulltextfeed.php
+++ b/inc/3rdparty/makefulltextfeed.php
@@ -3,8 +3,8 @@
3// Author: Keyvan Minoukadeh 3// Author: Keyvan Minoukadeh
4// Copyright (c) 2013 Keyvan Minoukadeh 4// Copyright (c) 2013 Keyvan Minoukadeh
5// License: AGPLv3 5// License: AGPLv3
6// Version: 3.1 6// Version: 3.2
7// Date: 2013-03-05 7// Date: 2013-05-13
8// More info: http://fivefilters.org/content-only/ 8// More info: http://fivefilters.org/content-only/
9// Help: http://help.fivefilters.org 9// Help: http://help.fivefilters.org
10 10
@@ -25,14 +25,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
25 25
26// Usage 26// Usage
27// ----- 27// -----
28// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org 28// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
29// The following options can be passed in the querystring: 29// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
30// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url)) 30
31// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected) 31//error_reporting(E_ALL ^ E_NOTICE);
32// * API key: key=[api key] (optional, refer to config.php)
33// * Max entries to process: max=[max number of items] (optional)
34
35error_reporting(E_ALL ^ E_NOTICE);
36ini_set("display_errors", 1); 32ini_set("display_errors", 1);
37@set_time_limit(120); 33@set_time_limit(120);
38 34
@@ -76,8 +72,8 @@ header('X-Robots-Tag: noindex, nofollow');
76//////////////////////////////// 72////////////////////////////////
77// Check if service is enabled 73// Check if service is enabled
78//////////////////////////////// 74////////////////////////////////
79if (!$options->enabled) { 75if (!$options->enabled) {
80 die('The full-text RSS service is currently disabled'); 76 die('The full-text RSS service is currently disabled');
81} 77}
82 78
83//////////////////////////////// 79////////////////////////////////
@@ -121,8 +117,8 @@ $options->smart_cache = $options->smart_cache && function_exists('apc_inc');
121//////////////////////////////// 117////////////////////////////////
122// Check for feed URL 118// Check for feed URL
123//////////////////////////////// 119////////////////////////////////
124if (!isset($_GET['url'])) { 120if (!isset($_GET['url'])) {
125 die('No URL supplied'); 121 die('No URL supplied');
126} 122}
127$url = trim($_GET['url']); 123$url = trim($_GET['url']);
128if (strtolower(substr($url, 0, 7)) == 'feed://') { 124if (strtolower(substr($url, 0, 7)) == 'feed://') {
@@ -161,10 +157,12 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
161 if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']); 157 if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
162 if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']); 158 if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
163 if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); 159 if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
164 if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']); 160 if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);
165 if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); 161 if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
166 if (isset($_GET['xss'])) $redirect .= '&xss'; 162 if (isset($_GET['xss'])) $redirect .= '&xss';
167 if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; 163 if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
164 if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
165 if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
168 if (isset($_GET['debug'])) $redirect .= '&debug'; 166 if (isset($_GET['debug'])) $redirect .= '&debug';
169 if ($debug_mode) { 167 if ($debug_mode) {
170 debug('Redirecting to hide access key, follow URL below to continue'); 168 debug('Redirecting to hide access key, follow URL below to continue');
@@ -177,7 +175,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
177 175
178/////////////////////////////////////////////// 176///////////////////////////////////////////////
179// Set timezone. 177// Set timezone.
180// Prevents warnings, but needs more testing - 178// Prevents warnings, but needs more testing -
181// perhaps if timezone is set in php.ini we 179// perhaps if timezone is set in php.ini we
182// don't need to set it at all... 180// don't need to set it at all...
183/////////////////////////////////////////////// 181///////////////////////////////////////////////
@@ -199,7 +197,7 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int
199} 197}
200$key_index = ($valid_key) ? (int)$_GET['key'] : 0; 198$key_index = ($valid_key) ? (int)$_GET['key'] : 0;
201if (!$valid_key && $options->key_required) { 199if (!$valid_key && $options->key_required) {
202 die('A valid key must be supplied'); 200 die('A valid key must be supplied');
203} 201}
204if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') { 202if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
205 die('The entered key is invalid'); 203 die('The entered key is invalid');
@@ -251,6 +249,28 @@ if ($options->favour_feed_titles == 'user') {
251} 249}
252 250
253/////////////////////////////////////////////// 251///////////////////////////////////////////////
252// Include full content in output?
253///////////////////////////////////////////////
254if ($options->content === 'user') {
255 if (isset($_GET['content']) && $_GET['content'] === '0') {
256 $options->content = false;
257 } else {
258 $options->content = true;
259 }
260}
261
262///////////////////////////////////////////////
263// Include summaries in output?
264///////////////////////////////////////////////
265if ($options->summary === 'user') {
266 if (isset($_GET['summary']) && $_GET['summary'] === '1') {
267 $options->summary = true;
268 } else {
269 $options->summary = false;
270 }
271}
272
273///////////////////////////////////////////////
254// Exclude items if extraction fails 274// Exclude items if extraction fails
255/////////////////////////////////////////////// 275///////////////////////////////////////////////
256if ($options->exclude_items_on_fail === 'user') { 276if ($options->exclude_items_on_fail === 'user') {
@@ -272,15 +292,6 @@ if ($options->detect_language === 'user') {
272 $detect_language = $options->detect_language; 292 $detect_language = $options->detect_language;
273} 293}
274 294
275if ($detect_language >= 2) {
276 $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',
277 'cebuano' => 'ceb', // ISO 639-2
278 'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',
279 'hawaiian' => 'haw', // ISO 639-2
280 'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',
281 'pidgin' => 'cpe', // ISO 639-2
282 'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');
283}
284$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); 295$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
285 296
286///////////////////////////////////// 297/////////////////////////////////////
@@ -330,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *');
330////////////////////////////////// 341//////////////////////////////////
331if ($options->caching) { 342if ($options->caching) {
332 debug('Caching is enabled...'); 343 debug('Caching is enabled...');
333 $cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); 344 $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
334 $check_cache = true; 345 $check_cache = true;
335 if ($options->apc && $options->smart_cache) { 346 if ($options->apc && $options->smart_cache) {
336 apc_add("cache.$cache_id", 0, 10*60); 347 apc_add("cache.$cache_id", 0, 10*60);
@@ -468,7 +479,7 @@ if ($img_url = $feed->get_image_url()) {
468//////////////////////////////////////////// 479////////////////////////////////////////////
469// Loop through feed items 480// Loop through feed items
470//////////////////////////////////////////// 481////////////////////////////////////////////
471$items = $feed->get_items(0, $max); 482$items = $feed->get_items(0, $max);
472// Request all feed items in parallel (if supported) 483// Request all feed items in parallel (if supported)
473$urls_sanitized = array(); 484$urls_sanitized = array();
474$urls = array(); 485$urls = array();
@@ -550,24 +561,43 @@ foreach ($items as $key => $item) {
550 $is_single_page = false; 561 $is_single_page = false;
551 if ($single_page_response = getSinglePage($item, $html, $effective_url)) { 562 if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
552 $is_single_page = true; 563 $is_single_page = true;
553 $html = $single_page_response['body'];
554 // remove strange things
555 $html = str_replace('</[>', '', $html);
556 $html = convert_to_utf8($html, $single_page_response['headers']);
557 $effective_url = $single_page_response['effective_url']; 564 $effective_url = $single_page_response['effective_url'];
558 debug("Retrieved single-page view from $effective_url"); 565 // check if action defined for returned Content-Type
566 $mime_info = get_mime_action_info($single_page_response['headers']);
567 if (isset($mime_info['action'])) {
568 if ($mime_info['action'] == 'exclude') {
569 continue; // skip this feed item entry
570 } elseif ($mime_info['action'] == 'link') {
571 if ($mime_info['type'] == 'image') {
572 $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
573 } else {
574 $html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
575 }
576 $extracted_title = $mime_info['name'];
577 $do_content_extraction = false;
578 }
579 }
580 if ($do_content_extraction) {
581 $html = $single_page_response['body'];
582 // remove strange things
583 $html = str_replace('</[>', '', $html);
584 $html = convert_to_utf8($html, $single_page_response['headers']);
585 debug("Retrieved single-page view from $effective_url");
586 }
559 unset($single_page_response); 587 unset($single_page_response);
560 } 588 }
589 }
590 if ($do_content_extraction) {
561 debug('--------'); 591 debug('--------');
562 debug('Attempting to extract content'); 592 debug('Attempting to extract content');
563 $extract_result = $extractor->process($html, $effective_url); 593 $extract_result = $extractor->process($html, $effective_url);
564 $readability = $extractor->readability; 594 $readability = $extractor->readability;
565 $content_block = ($extract_result) ? $extractor->getContent() : null; 595 $content_block = ($extract_result) ? $extractor->getContent() : null;
566 $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; 596 $extracted_title = ($extract_result) ? $extractor->getTitle() : '';
567 // Deal with multi-page articles 597 // Deal with multi-page articles
568 //die('Next: '.$extractor->getNextPageUrl()); 598 //die('Next: '.$extractor->getNextPageUrl());
569 $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); 599 $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());
570 if ($options->multipage && $is_multi_page) { 600 if ($options->multipage && $is_multi_page && $options->content) {
571 debug('--------'); 601 debug('--------');
572 debug('Attempting to process multi-page article'); 602 debug('Attempting to process multi-page article');
573 $multi_page_urls = array(); 603 $multi_page_urls = array();
@@ -580,7 +610,7 @@ foreach ($items as $key => $item) {
580 // check it's not what we have already! 610 // check it's not what we have already!
581 if (!in_array($next_page_url, $multi_page_urls)) { 611 if (!in_array($next_page_url, $multi_page_urls)) {
582 // it's not, so let's attempt to fetch it 612 // it's not, so let's attempt to fetch it
583 $multi_page_urls[] = $next_page_url; 613 $multi_page_urls[] = $next_page_url;
584 $_prev_ref = $http->referer; 614 $_prev_ref = $http->referer;
585 if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) { 615 if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) {
586 // make sure mime type is not something with a different action associated 616 // make sure mime type is not something with a different action associated
@@ -605,13 +635,15 @@ foreach ($items as $key => $item) {
605 // did we successfully deal with this multi-page article? 635 // did we successfully deal with this multi-page article?
606 if (empty($multi_page_content)) { 636 if (empty($multi_page_content)) {
607 debug('Failed to extract all parts of multi-page article, so not going to include them'); 637 debug('Failed to extract all parts of multi-page article, so not going to include them');
608 $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>'; 638 $_page = $readability->dom->createElement('p');
639 $_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
640 $multi_page_content[] = $_page;
609 } 641 }
610 foreach ($multi_page_content as $_page) { 642 foreach ($multi_page_content as $_page) {
611 $_page = $content_block->ownerDocument->importNode($_page, true); 643 $_page = $content_block->ownerDocument->importNode($_page, true);
612 $content_block->appendChild($_page); 644 $content_block->appendChild($_page);
613 } 645 }
614 unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url); 646 unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);
615 } 647 }
616 } 648 }
617 // use extracted title for both feed and item title if we're using single-item dummy feed 649 // use extracted title for both feed and item title if we're using single-item dummy feed
@@ -658,7 +690,7 @@ foreach ($items as $key => $item) {
658 } else { 690 } else {
659 $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML 691 $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
660 } 692 }
661 unset($content_block); 693 //unset($content_block);
662 // post-processing cleanup 694 // post-processing cleanup
663 $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html); 695 $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
664 if ($links == 'remove') { 696 if ($links == 'remove') {
@@ -671,130 +703,155 @@ foreach ($items as $key => $item) {
671 } 703 }
672 } 704 }
673 705
674 if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment 706 if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
675 $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); 707 $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
708 } else {
709 $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
710 }
711 // filter xss?
712 if ($xss_filter) {
713 debug('Filtering HTML to remove XSS');
714 $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
715 }
716
717 // add content
718 if ($options->summary === true) {
719 // get summary
720 $summary = '';
721 if (!$do_content_extraction) {
722 $summary = $html;
676 } else { 723 } else {
677 $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); 724 // Try to get first few paragraphs
678 } 725 if (isset($content_block) && ($content_block instanceof DOMElement)) {
679 // filter xss? 726 $_paras = $content_block->getElementsByTagName('p');
680 if ($xss_filter) { 727 foreach ($_paras as $_para) {
681 debug('Filtering HTML to remove XSS'); 728 $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';
682 $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); 729 if (strlen($summary) > 200) break;
683 }
684 $newitem->setDescription($html);
685
686 // set date
687 if ((int)$item->get_date('U') > 0) {
688 $newitem->setDate((int)$item->get_date('U'));
689 } elseif ($extractor->getDate()) {
690 $newitem->setDate($extractor->getDate());
691 }
692
693 // add authors
694 if ($authors = $item->get_authors()) {
695 foreach ($authors as $author) {
696 // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
697 if ($author->get_name() !== null) {
698 $newitem->addElement('dc:creator', $author->get_name());
699 } elseif ($author->get_email() !== null) {
700 $newitem->addElement('dc:creator', $author->get_email());
701 } 730 }
731 } else {
732 $summary = $html;
702 } 733 }
703 } elseif ($authors = $extractor->getAuthors()) { 734 }
704 //TODO: make sure the list size is reasonable 735 unset($_paras, $_para);
705 foreach ($authors as $author) { 736 $summary = get_excerpt($summary);
706 // TODO: xpath often selects authors from other articles linked from the page. 737 $newitem->setDescription($summary);
707 // for now choose first item 738 if ($options->content) $newitem->setElement('content:encoded', $html);
708 $newitem->addElement('dc:creator', $author); 739 } else {
709 break; 740 if ($options->content) $newitem->setDescription($html);
741 }
742
743 // set date
744 if ((int)$item->get_date('U') > 0) {
745 $newitem->setDate((int)$item->get_date('U'));
746 } elseif ($extractor->getDate()) {
747 $newitem->setDate($extractor->getDate());
748 }
749
750 // add authors
751 if ($authors = $item->get_authors()) {
752 foreach ($authors as $author) {
753 // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
754 if ($author->get_name() !== null) {
755 $newitem->addElement('dc:creator', $author->get_name());
756 } elseif ($author->get_email() !== null) {
757 $newitem->addElement('dc:creator', $author->get_email());
710 } 758 }
711 } 759 }
712 760 } elseif ($authors = $extractor->getAuthors()) {
713 // add language 761 //TODO: make sure the list size is reasonable
714 if ($detect_language) { 762 foreach ($authors as $author) {
715 $language = $extractor->getLanguage(); 763 // TODO: xpath often selects authors from other articles linked from the page.
716 if (!$language) $language = $feed->get_language(); 764 // for now choose first item
717 if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { 765 $newitem->addElement('dc:creator', $author);
718 try { 766 break;
719 if ($use_cld) { 767 }
720 // Use PHP-CLD extension 768 }
721 $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error 769
722 $res = $php_cld($text_sample); 770 // add language
723 if (is_array($res) && count($res) > 0) { 771 if ($detect_language) {
724 $language = $res[0]['code']; 772 $language = $extractor->getLanguage();
725 } 773 if (!$language) $language = $feed->get_language();
726 } else { 774 if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
727 //die('what'); 775 try {
728 // Use PEAR's Text_LanguageDetect 776 if ($use_cld) {
729 if (!isset($l)) { 777 // Use PHP-CLD extension
730 $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat'); 778 $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
731 } 779 $res = $php_cld($text_sample);
732 $l_result = $l->detect($text_sample, 1); 780 if (is_array($res) && count($res) > 0) {
733 if (count($l_result) > 0) { 781 $language = $res[0]['code'];
734 $language = $language_codes[key($l_result)]; 782 }
735 } 783 } else {
784 //die('what');
785 // Use PEAR's Text_LanguageDetect
786 if (!isset($l)) {
787 $l = new Text_LanguageDetect();
788 $l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")
789 }
790 $l_result = $l->detect($text_sample, 1);
791 if (count($l_result) > 0) {
792 $language = key($l_result);
736 } 793 }
737 } catch (Exception $e) {
738 //die('error: '.$e);
739 // do nothing
740 } 794 }
741 } 795 } catch (Exception $e) {
742 if ($language && (strlen($language) < 7)) { 796 //die('error: '.$e);
743 $newitem->addElement('dc:language', $language); 797 // do nothing
744 } 798 }
745 } 799 }
746 800 if ($language && (strlen($language) < 7)) {
747 // add MIME type (if it appeared in our exclusions lists) 801 $newitem->addElement('dc:language', $language);
748 if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
749 // add effective URL (URL after redirects)
750 if (isset($effective_url)) {
751 //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
752 //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi)
753 //temporary measure: use utf8_encode()
754 $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
755 } else {
756 $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
757 } 802 }
758 803 }
759 // add categories 804
760 if ($categories = $item->get_categories()) { 805 // add MIME type (if it appeared in our exclusions lists)
761 foreach ($categories as $category) { 806 if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
762 if ($category->get_label() !== null) { 807 // add effective URL (URL after redirects)
763 $newitem->addElement('category', $category->get_label()); 808 if (isset($effective_url)) {
764 } 809 //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
810 //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi)
811 //temporary measure: use utf8_encode()
812 $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
813 } else {
814 $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
815 }
816
817 // add categories
818 if ($categories = $item->get_categories()) {
819 foreach ($categories as $category) {
820 if ($category->get_label() !== null) {
821 $newitem->addElement('category', $category->get_label());
765 } 822 }
766 } 823 }
767 824 }
768 // check for enclosures 825
769 if ($options->keep_enclosures) { 826 // check for enclosures
770 if ($enclosures = $item->get_enclosures()) { 827 if ($options->keep_enclosures) {
771 foreach ($enclosures as $enclosure) { 828 if ($enclosures = $item->get_enclosures()) {
772 // thumbnails 829 foreach ($enclosures as $enclosure) {
773 foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { 830 // thumbnails
774 $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); 831 foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
775 } 832 $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
776 if (!$enclosure->get_link()) continue;
777 $enc = array();
778 // Media RSS spec ($enc): http://search.yahoo.com/mrss
779 // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
780 $enc['url'] = $enclosure->get_link();
781 if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
782 if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
783 if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
784 if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
785 if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
786 if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
787 if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
788 if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
789 if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
790 if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
791 if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
792 if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
793 $newitem->addElement('media:content', '', $enc);
794 } 833 }
834 if (!$enclosure->get_link()) continue;
835 $enc = array();
836 // Media RSS spec ($enc): http://search.yahoo.com/mrss
837 // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
838 $enc['url'] = $enclosure->get_link();
839 if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
840 if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
841 if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
842 if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
843 if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
844 if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
845 if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
846 if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
847 if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
848 if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
849 if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
850 if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
851 $newitem->addElement('media:content', '', $enc);
795 } 852 }
796 } 853 }
797 /* } */ 854 }
798 $output->addItem($newitem); 855 $output->addItem($newitem);
799 unset($html); 856 unset($html);
800 $item_count++; 857 $item_count++;
diff --git a/inc/3rdparty/makefulltextfeedHelpers.php b/inc/3rdparty/makefulltextfeedHelpers.php
index 1c11b8f6..4e985372 100755
--- a/inc/3rdparty/makefulltextfeedHelpers.php
+++ b/inc/3rdparty/makefulltextfeedHelpers.php
@@ -66,6 +66,38 @@ class DummySingleItem {
66// HELPER FUNCTIONS 66// HELPER FUNCTIONS
67/////////////////////////////// 67///////////////////////////////
68 68
69// Adapted from WordPress
70// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
71function get_excerpt($text, $num_words=55, $more=null) {
72 if (null === $more) $more = '&hellip;';
73 $text = strip_tags($text);
74 //TODO: Check if word count is based on single characters (East Asian characters)
75 /*
76 if (1==2) {
77 $text = trim(preg_replace("/[\n\r\t ]+/", ' ', $text), ' ');
78 preg_match_all('/./u', $text, $words_array);
79 $words_array = array_slice($words_array[0], 0, $num_words + 1);
80 $sep = '';
81 } else {
82 $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
83 $sep = ' ';
84 }
85 */
86 $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
87 $sep = ' ';
88 if (count($words_array) > $num_words) {
89 array_pop($words_array);
90 $text = implode($sep, $words_array);
91 $text = $text.$more;
92 } else {
93 $text = implode($sep, $words_array);
94 }
95 // trim whitespace at beginning or end of string
96 // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
97 $text = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
98 return $text;
99}
100
69function url_allowed($url) { 101function url_allowed($url) {
70 global $options; 102 global $options;
71 if (!empty($options->allowed_urls)) { 103 if (!empty($options->allowed_urls)) {
@@ -165,14 +197,6 @@ function convert_to_utf8($html, $header=null)
165 if (strtolower($encoding) != 'utf-8') { 197 if (strtolower($encoding) != 'utf-8') {
166 debug('Converting to UTF-8'); 198 debug('Converting to UTF-8');
167 $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); 199 $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
168 /*
169 if (function_exists('iconv')) {
170 // iconv appears to handle certain character encodings better than mb_convert_encoding
171 $html = iconv($encoding, 'utf-8', $html);
172 } else {
173 $html = mb_convert_encoding($html, 'utf-8', $encoding);
174 }
175 */
176 } 200 }
177 } 201 }
178 } 202 }
@@ -196,7 +220,7 @@ function makeAbsolute($base, $elem) {
196} 220}
197function makeAbsoluteAttr($base, $e, $attr) { 221function makeAbsoluteAttr($base, $e, $attr) {
198 if ($e->hasAttribute($attr)) { 222 if ($e->hasAttribute($attr)) {
199 // Trim leading and trailing white space. I don't really like this but 223 // Trim leading and trailing white space. I don't really like this but
200 // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" /> 224 // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
201 $url = trim(str_replace('%20', ' ', $e->getAttribute($attr))); 225 $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
202 $url = str_replace(' ', '%20', $url); 226 $url = str_replace(' ', '%20', $url);
diff --git a/inc/3rdparty/site_config/custom/dailymotion.com.txt b/inc/3rdparty/site_config/custom/dailymotion.com.txt
new file mode 100755
index 00000000..0cad808f
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/dailymotion.com.txt
@@ -0,0 +1,12 @@
1title: //title
2body: //iframe
3
4replace_string(<![CDATA[): _
5replace_string(]]>): _
6
7single_page_link: //link[@type='application/xml+oembed']
8
9prune: no
10tidy: no
11
12http://www.dailymotion.com/video/x1vk5oh_before-they-were-on-game-of-thrones_people
diff --git a/inc/3rdparty/site_config/custom/index.php b/inc/3rdparty/site_config/custom/index.php
new file mode 100644
index 00000000..a3d5f739
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/index.php
@@ -0,0 +1,3 @@
1<?php
2// this is here to prevent directory listing over the web
3?> \ No newline at end of file
diff --git a/inc/3rdparty/site_config/custom/ted.com.txt b/inc/3rdparty/site_config/custom/ted.com.txt
new file mode 100755
index 00000000..4940d2bc
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/ted.com.txt
@@ -0,0 +1,11 @@
1title: //title
2body: //div[@class='talk-article__body talk-transcript__body'] | //div[@class='media__image media__image--thumb talk-link__image']
3
4strip_id_or_class: talk-transcript__para__time
5
6single_page_link: //a[@id='hero-transcript-link']
7
8#prune: no
9tidy: no
10
11test_url: http://www.ted.com/talks/andrew_solomon_how_the_worst_moments_in_our_lives_make_us_who_we_are
diff --git a/inc/3rdparty/site_config/index.php b/inc/3rdparty/site_config/index.php
index a1b767fd..76ca8b3c 100644
--- a/inc/3rdparty/site_config/index.php
+++ b/inc/3rdparty/site_config/index.php
@@ -1,3 +1,2 @@
1<?php 1<?php
2// this is here to prevent directory listing over the web 2// this is here to prevent directory listing over the web \ No newline at end of file
3?> \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/version.txt b/inc/3rdparty/site_config/standard/version.txt
index bf0d87ab..eaf01ebd 100644
--- a/inc/3rdparty/site_config/standard/version.txt
+++ b/inc/3rdparty/site_config/standard/version.txt
@@ -1 +1 @@
4 \ No newline at end of file 2013-05-12T22:53:07Z \ No newline at end of file
diff --git a/inc/poche/Poche.class.php b/inc/poche/Poche.class.php
index 4139c4f4..09a9f5ff 100755
--- a/inc/poche/Poche.class.php
+++ b/inc/poche/Poche.class.php
@@ -1083,11 +1083,10 @@ class Poche
1083 $config = $this->store->getConfigUser($user_id); 1083 $config = $this->store->getConfigUser($user_id);
1084 1084
1085 if ($config == null) { 1085 if ($config == null) {
1086 die(_('User with this id (' . $user_id . ') does not exist.')); 1086 die(sprintf(_('User with this id (%d) does not exist.'), $user_id));
1087 } 1087 }
1088 1088
1089 if (!in_array($type, $allowed_types) || 1089 if (!in_array($type, $allowed_types) || $token != $config['token']) {
1090 $token != $config['token']) {
1091 die(_('Uh, there is a problem while generating feeds.')); 1090 die(_('Uh, there is a problem while generating feeds.'));
1092 } 1091 }
1093 // Check the token 1092 // Check the token
@@ -1145,16 +1144,18 @@ class Poche
1145 $config = HTMLPurifier_Config::createDefault(); 1144 $config = HTMLPurifier_Config::createDefault();
1146 $config->set('Cache.SerializerPath', CACHE); 1145 $config->set('Cache.SerializerPath', CACHE);
1147 $config->set('HTML.SafeIframe', true); 1146 $config->set('HTML.SafeIframe', true);
1148 $config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/)%'); //allow YouTube and Vimeo$purifier = new HTMLPurifier($config); 1147
1148 //allow YouTube, Vimeo and dailymotion videos
1149 $config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/|www\.dailymotion\.com/embed/video/)%');
1149 1150
1150 return new HTMLPurifier($config); 1151 return new HTMLPurifier($config);
1151 } 1152 }
1152 1153
1153 /** 1154 /**
1154 * handle epub 1155 * handle epub
1155 */ 1156 */
1156 public function createEpub() { 1157 public function createEpub() {
1157 1158
1158 switch ($_GET['method']) { 1159 switch ($_GET['method']) {
1159 case 'id': 1160 case 'id':
1160 $entryID = filter_var($_GET['id'],FILTER_SANITIZE_NUMBER_INT); 1161 $entryID = filter_var($_GET['id'],FILTER_SANITIZE_NUMBER_INT);
@@ -1190,7 +1191,7 @@ class Poche
1190 break; 1191 break;
1191 case 'default': 1192 case 'default':
1192 die(_('Uh, there is a problem while generating epub.')); 1193 die(_('Uh, there is a problem while generating epub.'));
1193 1194
1194 } 1195 }
1195 1196
1196 $content_start = 1197 $content_start =
@@ -1203,10 +1204,9 @@ class Poche
1203 . "<body>\n"; 1204 . "<body>\n";
1204 1205
1205 $bookEnd = "</body>\n</html>\n"; 1206 $bookEnd = "</body>\n</html>\n";
1206 1207
1207 $log = new Logger("wallabag", TRUE); 1208 $log = new Logger("wallabag", TRUE);
1208 $fileDir = CACHE; 1209 $fileDir = CACHE;
1209
1210 1210
1211 $book = new EPub(EPub::BOOK_VERSION_EPUB3, DEBUG_POCHE); 1211 $book = new EPub(EPub::BOOK_VERSION_EPUB3, DEBUG_POCHE);
1212 $log->logLine("new EPub()"); 1212 $log->logLine("new EPub()");
@@ -1215,7 +1215,7 @@ class Poche
1215 $log->logLine("Zip version: " . Zip::VERSION); 1215 $log->logLine("Zip version: " . Zip::VERSION);
1216 $log->logLine("getCurrentServerURL: " . $book->getCurrentServerURL()); 1216 $log->logLine("getCurrentServerURL: " . $book->getCurrentServerURL());
1217 $log->logLine("getCurrentPageURL..: " . $book->getCurrentPageURL()); 1217 $log->logLine("getCurrentPageURL..: " . $book->getCurrentPageURL());
1218 1218
1219 $book->setTitle(_('wallabag\'s articles')); 1219 $book->setTitle(_('wallabag\'s articles'));
1220 $book->setIdentifier("http://$_SERVER[HTTP_HOST]", EPub::IDENTIFIER_URI); // Could also be the ISBN number, prefered for published books, or a UUID. 1220 $book->setIdentifier("http://$_SERVER[HTTP_HOST]", EPub::IDENTIFIER_URI); // Could also be the ISBN number, prefered for published books, or a UUID.
1221 //$book->setLanguage("en"); // Not needed, but included for the example, Language is mandatory, but EPub defaults to "en". Use RFC3066 Language codes, such as "en", "da", "fr" etc. 1221 //$book->setLanguage("en"); // Not needed, but included for the example, Language is mandatory, but EPub defaults to "en". Use RFC3066 Language codes, such as "en", "da", "fr" etc.
@@ -1225,39 +1225,39 @@ class Poche
1225 $book->setDate(time()); // Strictly not needed as the book date defaults to time(). 1225 $book->setDate(time()); // Strictly not needed as the book date defaults to time().
1226 //$book->setRights("Copyright and licence information specific for the book."); // As this is generated, this _could_ contain the name or licence information of the user who purchased the book, if needed. If this is used that way, the identifier must also be made unique for the book. 1226 //$book->setRights("Copyright and licence information specific for the book."); // As this is generated, this _could_ contain the name or licence information of the user who purchased the book, if needed. If this is used that way, the identifier must also be made unique for the book.
1227 $book->setSourceURL("http://$_SERVER[HTTP_HOST]"); 1227 $book->setSourceURL("http://$_SERVER[HTTP_HOST]");
1228 1228
1229 $book->addDublinCoreMetadata(DublinCore::CONTRIBUTOR, "PHP"); 1229 $book->addDublinCoreMetadata(DublinCore::CONTRIBUTOR, "PHP");
1230 $book->addDublinCoreMetadata(DublinCore::CONTRIBUTOR, "wallabag"); 1230 $book->addDublinCoreMetadata(DublinCore::CONTRIBUTOR, "wallabag");
1231 1231
1232 $cssData = "body {\n margin-left: .5em;\n margin-right: .5em;\n text-align: justify;\n}\n\np {\n font-family: serif;\n font-size: 10pt;\n text-align: justify;\n text-indent: 1em;\n margin-top: 0px;\n margin-bottom: 1ex;\n}\n\nh1, h2 {\n font-family: sans-serif;\n font-style: italic;\n text-align: center;\n background-color: #6b879c;\n color: white;\n width: 100%;\n}\n\nh1 {\n margin-bottom: 2px;\n}\n\nh2 {\n margin-top: -2px;\n margin-bottom: 2px;\n}\n"; 1232 $cssData = "body {\n margin-left: .5em;\n margin-right: .5em;\n text-align: justify;\n}\n\np {\n font-family: serif;\n font-size: 10pt;\n text-align: justify;\n text-indent: 1em;\n margin-top: 0px;\n margin-bottom: 1ex;\n}\n\nh1, h2 {\n font-family: sans-serif;\n font-style: italic;\n text-align: center;\n background-color: #6b879c;\n color: white;\n width: 100%;\n}\n\nh1 {\n margin-bottom: 2px;\n}\n\nh2 {\n margin-top: -2px;\n margin-bottom: 2px;\n}\n";
1233 1233
1234 $log->logLine("Add Cover"); 1234 $log->logLine("Add Cover");
1235 1235
1236 $fullTitle = "<h1> " . $bookTitle . "</h1>\n"; 1236 $fullTitle = "<h1> " . $bookTitle . "</h1>\n";
1237 1237
1238 $book->setCoverImage("Cover.png", file_get_contents("themes/baggy/img/apple-touch-icon-152.png"), "image/png", $fullTitle); 1238 $book->setCoverImage("Cover.png", file_get_contents("themes/baggy/img/apple-touch-icon-152.png"), "image/png", $fullTitle);
1239 1239
1240 $cover = $content_start . '<div style="text-align:center;"><p>' . _('Produced by wallabag with PHPePub') . '</p><p>'. _('Please open <a href="https://github.com/wallabag/wallabag/issues" >an issue</a> if you have trouble with the display of this E-Book on your device.') . '</p></div>' . $bookEnd; 1240 $cover = $content_start . '<div style="text-align:center;"><p>' . _('Produced by wallabag with PHPePub') . '</p><p>'. _('Please open <a href="https://github.com/wallabag/wallabag/issues" >an issue</a> if you have trouble with the display of this E-Book on your device.') . '</p></div>' . $bookEnd;
1241 1241
1242 //$book->addChapter("Table of Contents", "TOC.xhtml", NULL, false, EPub::EXTERNAL_REF_IGNORE); 1242 //$book->addChapter("Table of Contents", "TOC.xhtml", NULL, false, EPub::EXTERNAL_REF_IGNORE);
1243 $book->addChapter("Notices", "Cover2.html", $cover); 1243 $book->addChapter("Notices", "Cover2.html", $cover);
1244 1244
1245 $book->buildTOC(); 1245 $book->buildTOC();
1246 1246
1247 foreach ($entries as $entry) { //set tags as subjects 1247 foreach ($entries as $entry) { //set tags as subjects
1248 $tags = $this->store->retrieveTagsByEntry($entry['id']); 1248 $tags = $this->store->retrieveTagsByEntry($entry['id']);
1249 foreach ($tags as $tag) { 1249 foreach ($tags as $tag) {
1250 $book->setSubject($tag['value']); 1250 $book->setSubject($tag['value']);
1251 } 1251 }
1252 1252
1253 $log->logLine("Set up parameters"); 1253 $log->logLine("Set up parameters");
1254 1254
1255 $chapter = $content_start . $entry['content'] . $bookEnd; 1255 $chapter = $content_start . $entry['content'] . $bookEnd;
1256 $book->addChapter($entry['title'], htmlspecialchars($entry['title']) . ".html", $chapter, true, EPub::EXTERNAL_REF_ADD); 1256 $book->addChapter($entry['title'], htmlspecialchars($entry['title']) . ".html", $chapter, true, EPub::EXTERNAL_REF_ADD);
1257 $log->logLine("Added chapter " . $entry['title']); 1257 $log->logLine("Added chapter " . $entry['title']);
1258 } 1258 }
1259 1259
1260 if (DEBUG_POCHE) { 1260 if (DEBUG_POCHE) {
1261 $epuplog = $book->getLog(); 1261 $epuplog = $book->getLog();
1262 $book->addChapter("Log", "Log.html", $content_start . $log->getLog() . "\n</pre>" . $bookEnd); // log generation 1262 $book->addChapter("Log", "Log.html", $content_start . $log->getLog() . "\n</pre>" . $bookEnd); // log generation
1263 } 1263 }
diff --git a/inc/poche/Tools.class.php b/inc/poche/Tools.class.php
index 7f064020..1ef875c9 100755
--- a/inc/poche/Tools.class.php
+++ b/inc/poche/Tools.class.php
@@ -18,8 +18,6 @@ class Tools
18 die(_('Oops, it seems you don\'t have PHP 5.')); 18 die(_('Oops, it seems you don\'t have PHP 5.'));
19 } 19 }
20 20
21 error_reporting(E_ALL);
22
23 function stripslashesDeep($value) { 21 function stripslashesDeep($value) {
24 return is_array($value) 22 return is_array($value)
25 ? array_map('stripslashesDeep', $value) 23 ? array_map('stripslashesDeep', $value)
@@ -60,7 +58,11 @@ class Tools
60 } 58 }
61 59
62 $host = (isset($_SERVER['HTTP_X_FORWARDED_HOST']) ? $_SERVER['HTTP_X_FORWARDED_HOST'] : (isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : $_SERVER['SERVER_NAME'])); 60 $host = (isset($_SERVER['HTTP_X_FORWARDED_HOST']) ? $_SERVER['HTTP_X_FORWARDED_HOST'] : (isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : $_SERVER['SERVER_NAME']));
63 61
62 if (strpos($host, ':') !== false) {
63 $serverport = '';
64 }
65
64 return 'http' . ($https ? 's' : '') . '://' 66 return 'http' . ($https ? 's' : '') . '://'
65 . $host . $serverport . $scriptname; 67 . $host . $serverport . $scriptname;
66 } 68 }
diff --git a/inc/poche/config.inc.default.php b/inc/poche/config.inc.default.php
index edc42fc9..95f727c6 100755
--- a/inc/poche/config.inc.default.php
+++ b/inc/poche/config.inc.default.php
@@ -30,7 +30,12 @@
30 30
31@define ('MODE_DEMO', FALSE); 31@define ('MODE_DEMO', FALSE);
32@define ('DEBUG_POCHE', FALSE); 32@define ('DEBUG_POCHE', FALSE);
33@define ('DOWNLOAD_PICTURES', FALSE); 33
34//default level of error reporting in application. Developers should override it in their config.inc.php: set to E_ALL.
35@define ('ERROR_REPORTING', E_ALL & ~E_NOTICE);
36
37@define ('DOWNLOAD_PICTURES', FALSE); # This can slow down the process of adding articles
38@define ('REGENERATE_PICTURES_QUALITY', 75);
34@define ('CONVERT_LINKS_FOOTNOTES', FALSE); 39@define ('CONVERT_LINKS_FOOTNOTES', FALSE);
35@define ('REVERT_FORCED_PARAGRAPH_ELEMENTS', FALSE); 40@define ('REVERT_FORCED_PARAGRAPH_ELEMENTS', FALSE);
36@define ('SHARE_TWITTER', TRUE); 41@define ('SHARE_TWITTER', TRUE);
diff --git a/inc/poche/pochePictures.php b/inc/poche/pochePictures.php
index a11340f8..7c319a85 100644
--- a/inc/poche/pochePictures.php
+++ b/inc/poche/pochePictures.php
@@ -14,6 +14,7 @@
14function filtre_picture($content, $url, $id) 14function filtre_picture($content, $url, $id)
15{ 15{
16 $matches = array(); 16 $matches = array();
17 $processing_pictures = array(); // list of processing image to avoid processing the same pictures twice
17 preg_match_all('#<\s*(img)[^>]+src="([^"]*)"[^>]*>#Si', $content, $matches, PREG_SET_ORDER); 18 preg_match_all('#<\s*(img)[^>]+src="([^"]*)"[^>]*>#Si', $content, $matches, PREG_SET_ORDER);
18 foreach($matches as $i => $link) { 19 foreach($matches as $i => $link) {
19 $link[1] = trim($link[1]); 20 $link[1] = trim($link[1]);
@@ -22,8 +23,17 @@ function filtre_picture($content, $url, $id)
22 $filename = basename(parse_url($absolute_path, PHP_URL_PATH)); 23 $filename = basename(parse_url($absolute_path, PHP_URL_PATH));
23 $directory = create_assets_directory($id); 24 $directory = create_assets_directory($id);
24 $fullpath = $directory . '/' . $filename; 25 $fullpath = $directory . '/' . $filename;
25 download_pictures($absolute_path, $fullpath); 26
26 $content = str_replace($matches[$i][2], $fullpath, $content); 27 if (in_array($absolute_path, $processing_pictures) === true) {
28 // replace picture's URL only if processing is OK : already processing -> go to next picture
29 continue;
30 }
31
32 if (download_pictures($absolute_path, $fullpath) === true) {
33 $content = str_replace($matches[$i][2], $fullpath, $content);
34 }
35
36 $processing_pictures[] = $absolute_path;
27 } 37 }
28 38
29 } 39 }
@@ -64,6 +74,8 @@ function get_absolute_link($relative_link, $url) {
64 74
65/** 75/**
66 * Téléchargement des images 76 * Téléchargement des images
77 *
78 * @return bool true if the download and processing is OK, false else
67 */ 79 */
68function download_pictures($absolute_path, $fullpath) 80function download_pictures($absolute_path, $fullpath)
69{ 81{
@@ -73,9 +85,44 @@ function download_pictures($absolute_path, $fullpath)
73 if(file_exists($fullpath)) { 85 if(file_exists($fullpath)) {
74 unlink($fullpath); 86 unlink($fullpath);
75 } 87 }
76 $fp = fopen($fullpath, 'x'); 88
77 fwrite($fp, $rawdata); 89 // check extension
78 fclose($fp); 90 $file_ext = strrchr($fullpath, '.');
91 $whitelist = array(".jpg",".jpeg",".gif",".png");
92 if (!(in_array($file_ext, $whitelist))) {
93 Tools::logm('processed image with not allowed extension. Skipping ' . $fullpath);
94 return false;
95 }
96
97 // check headers
98 $imageinfo = getimagesize($absolute_path);
99 if ($imageinfo['mime'] != 'image/gif' && $imageinfo['mime'] != 'image/jpeg'&& $imageinfo['mime'] != 'image/jpg'&& $imageinfo['mime'] != 'image/png') {
100 Tools::logm('processed image with bad header. Skipping ' . $fullpath);
101 return false;
102 }
103
104 // regenerate image
105 $im = imagecreatefromstring($rawdata);
106 if ($im === false) {
107 Tools::logm('error while regenerating image ' . $fullpath);
108 return false;
109 }
110
111 switch ($imageinfo['mime']) {
112 case 'image/gif':
113 $result = imagegif($im, $fullpath);
114 break;
115 case 'image/jpeg':
116 case 'image/jpg':
117 $result = imagejpeg($im, $fullpath, REGENERATE_PICTURES_QUALITY);
118 break;
119 case 'image/png':
120 $result = imagepng($im, $fullpath, ceil(REGENERATE_PICTURES_QUALITY / 100 * 9));
121 break;
122 }
123 imagedestroy($im);
124
125 return $result;
79} 126}
80 127
81/** 128/**
diff --git a/index.php b/index.php
index 79838ed9..c134b103 100755
--- a/index.php
+++ b/index.php
@@ -8,10 +8,15 @@
8 * @license http://www.wtfpl.net/ see COPYING file 8 * @license http://www.wtfpl.net/ see COPYING file
9 */ 9 */
10 10
11define ('POCHE', '1.6.1'); 11define ('POCHE', '1.7.0');
12require 'check_setup.php'; 12require 'check_setup.php';
13require_once 'inc/poche/global.inc.php'; 13require_once 'inc/poche/global.inc.php';
14 14
15# Set error reporting level
16if (defined('ERROR_REPORTING')) {
17 error_reporting(ERROR_REPORTING);
18}
19
15# Start session 20# Start session
16Session::$sessionName = 'poche'; 21Session::$sessionName = 'poche';
17Session::init(); 22Session::init();
diff --git a/themes/baggy/css/main.css b/themes/baggy/css/main.css
index 6d3a56e1..6d320cd2 100755
--- a/themes/baggy/css/main.css
+++ b/themes/baggy/css/main.css
@@ -979,8 +979,8 @@ blockquote {
979 content: none; 979 content: none;
980 } 980 }
981 .logo { 981 .logo {
982 width: 1.5em; 982 width: 1.25em;
983 height: 1.5em; 983 height: 1.25em;
984 left: 0; 984 left: 0;
985 top: 0; 985 top: 0;
986 } 986 }
@@ -1030,6 +1030,7 @@ blockquote {
1030 margin-left: 1.5em; 1030 margin-left: 1.5em;
1031 padding-right: 1.5em; 1031 padding-right: 1.5em;
1032 position: static; 1032 position: static;
1033 margin-top: 3em;
1033 } 1034 }
1034 #article_toolbar .topPosF { 1035 #article_toolbar .topPosF {
1035 display: none; 1036 display: none;
diff --git a/themes/default/_search-form.twig b/themes/default/_search-form.twig
index 74f420d0..33bea20d 100644..100755
--- a/themes/default/_search-form.twig
+++ b/themes/default/_search-form.twig
@@ -7,17 +7,3 @@
7 </p> 7 </p>
8</form> 8</form>
9</div> 9</div>
10<script type="text/javascript">
11 $(document).ready(function() {
12
13 $("#search-form").hide();
14
15 $("#search").click(function(){
16 $("#search-form").toggle();
17 $("#search").toggleClass("current");
18 $("#search-arrow").toggleClass("arrow-down");
19 });
20
21
22 });
23</script> \ No newline at end of file
diff --git a/themes/default/css/style.css b/themes/default/css/style.css
index 11a8ea1c..e58ef81a 100755
--- a/themes/default/css/style.css
+++ b/themes/default/css/style.css
@@ -384,8 +384,8 @@ a#bagit-form-close {
384 background-color: #000; 384 background-color: #000;
385 color: #fff; 385 color: #fff;
386 padding: 0 4px 1px 3px; 386 padding: 0 4px 1px 3px;
387 font-weight: bold; 387 font-weight: bold;
388 font-size: 0.7em; 388 font-size: 0.7em;
389 border-radius: 4px; 389 border-radius: 4px;
390} 390}
391.add-to-wallabag-link-after:hover, .add-to-wallabag-link-after:active { 391.add-to-wallabag-link-after:hover, .add-to-wallabag-link-after:active {
@@ -394,6 +394,23 @@ a#bagit-form-close {
394.add-to-wallabag-link-after:visited { 394.add-to-wallabag-link-after:visited {
395 color: #999; 395 color: #999;
396} 396}
397a.add-to-wallabag-link-after {
398 visibility: hidden;
399 position: absolute;
400 opacity: 0;
401 transition-duration: 2s;
402 transition-timing-function: ease-out;
403}
404#article article a:hover + a.add-to-wallabag-link-after, a.add-to-wallabag-link-after:hover {
405 opacity: 1;
406 visibility: visible;
407 transition-duration: .3s;
408 transition-timing-function: ease-in;
409}
410a.add-to-wallabag-link-after:after {
411 content: "w";
412}
413
397 414
398#add-link-result { 415#add-link-result {
399 display: inline; 416 display: inline;