aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty
diff options
context:
space:
mode:
Diffstat (limited to 'inc/3rdparty')
-rw-r--r--inc/3rdparty/Session.class.php40
-rw-r--r--inc/3rdparty/class.messages.php3
-rwxr-xr-xinc/3rdparty/config.php104
-rw-r--r--inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php266
-rw-r--r--inc/3rdparty/libraries/PHPePub/EPub.NCX.php782
-rw-r--r--inc/3rdparty/libraries/PHPePub/EPub.OPF.php1226
-rw-r--r--inc/3rdparty/libraries/PHPePub/EPub.php2438
-rw-r--r--inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php201
-rw-r--r--inc/3rdparty/libraries/PHPePub/Logger.php92
-rw-r--r--inc/3rdparty/libraries/PHPePub/Zip.php818
-rw-r--r--inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt31
-rw-r--r--inc/3rdparty/libraries/PHPePub/lib.uuid.php314
-rw-r--r--inc/3rdparty/libraries/content-extractor/ContentExtractor.php1455
-rw-r--r--inc/3rdparty/libraries/content-extractor/SiteConfig.php681
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/libraries/feedwriter/FeedItem.php365
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/libraries/feedwriter/FeedWriter.php856
-rw-r--r--inc/3rdparty/libraries/html5/TreeBuilder.php13
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/CookieJar.php807
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php1589
-rw-r--r--inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php157
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect.php992
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php57
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php339
-rw-r--r--inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php (renamed from inc/3rdparty/libraries/language-detect/Parser.php)19
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/libraries/readability/Readability.php2281
-rwxr-xr-xinc/3rdparty/makefulltextfeed.php714
-rwxr-xr-xinc/3rdparty/makefulltextfeedHelpers.php389
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/simple_html_dom.php105
-rwxr-xr-xinc/3rdparty/site_config/custom/dailymotion.com.txt12
-rw-r--r--inc/3rdparty/site_config/custom/index.php3
-rw-r--r--inc/3rdparty/site_config/custom/mobile.lemondeinformatique.fr.txt6
-rwxr-xr-xinc/3rdparty/site_config/custom/ted.com.txt11
-rw-r--r--inc/3rdparty/site_config/index.php5
-rw-r--r--inc/3rdparty/site_config/standard/.about.com.txt14
-rw-r--r--inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt9
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/site_config/standard/politico.com.txt4
-rw-r--r--inc/3rdparty/site_config/standard/version.txt2
37 files changed, 12084 insertions, 5116 deletions
diff --git a/inc/3rdparty/Session.class.php b/inc/3rdparty/Session.class.php
index b30a31f3..59dfbe67 100644
--- a/inc/3rdparty/Session.class.php
+++ b/inc/3rdparty/Session.class.php
@@ -31,9 +31,9 @@ class Session
31 public static $sessionName = ''; 31 public static $sessionName = '';
32 // If the user does not access any page within this time, 32 // If the user does not access any page within this time,
33 // his/her session is considered expired (3600 sec. = 1 hour) 33 // his/her session is considered expired (3600 sec. = 1 hour)
34 public static $inactivityTimeout = 86400; 34 public static $inactivityTimeout = 3600;
35 // Extra timeout for long sessions (if enabled) (82800 sec. = 23 hours) 35 // Extra timeout for long sessions (if enabled) (82800 sec. = 23 hours)
36 public static $longSessionTimeout = 31536000; 36 public static $longSessionTimeout = 7776000; // 7776000 = 90 days
37 // If you get disconnected often or if your IP address changes often. 37 // If you get disconnected often or if your IP address changes often.
38 // Let you disable session cookie hijacking protection 38 // Let you disable session cookie hijacking protection
39 public static $disableSessionProtection = false; 39 public static $disableSessionProtection = false;
@@ -48,8 +48,13 @@ class Session
48 /** 48 /**
49 * Initialize session 49 * Initialize session
50 */ 50 */
51 public static function init() 51 public static function init($longlastingsession = false)
52 { 52 {
53 //check if session name is correct
54 if ( (session_id() && !empty(self::$sessionName) && session_name()!=self::$sessionName) || $longlastingsession ) {
55 session_destroy();
56 }
57
53 // Force cookie path (but do not change lifetime) 58 // Force cookie path (but do not change lifetime)
54 $cookie = session_get_cookie_params(); 59 $cookie = session_get_cookie_params();
55 // Default cookie expiration and path. 60 // Default cookie expiration and path.
@@ -61,12 +66,22 @@ class Session
61 if (isset($_SERVER["HTTPS"]) && $_SERVER["HTTPS"] == "on") { 66 if (isset($_SERVER["HTTPS"]) && $_SERVER["HTTPS"] == "on") {
62 $ssl = true; 67 $ssl = true;
63 } 68 }
64 session_set_cookie_params($cookie['lifetime'], $cookiedir, $_SERVER['HTTP_HOST'], $ssl); 69
70 if ( $longlastingsession ) {
71 session_set_cookie_params(self::$longSessionTimeout, $cookiedir, null, $ssl, true);
72 }
73 else {
74 session_set_cookie_params(0, $cookiedir, null, $ssl, true);
75 }
76 //set server side valid session timeout
77 //WARNING! this may not work in shared session environment. See http://www.php.net/manual/en/session.configuration.php#ini.session.gc-maxlifetime about min value: it can be set in any application
78 ini_set('session.gc_maxlifetime', self::$longSessionTimeout);
79
65 // Use cookies to store session. 80 // Use cookies to store session.
66 ini_set('session.use_cookies', 1); 81 ini_set('session.use_cookies', 1);
67 // Force cookies for session (phpsessionID forbidden in URL) 82 // Force cookies for session (phpsessionID forbidden in URL)
68 ini_set('session.use_only_cookies', 1); 83 ini_set('session.use_only_cookies', 1);
69 if (!session_id()) { 84 if ( !session_id() ) {
70 // Prevent PHP from using the session ID in the URL if cookies are disabled. 85 // Prevent PHP from using the session ID in the URL if cookies are disabled.
71 ini_set('session.use_trans_sid', false); 86 ini_set('session.use_trans_sid', false);
72 if (!empty(self::$sessionName)) { 87 if (!empty(self::$sessionName)) {
@@ -115,6 +130,9 @@ class Session
115 if (self::banCanLogin()) { 130 if (self::banCanLogin()) {
116 if ($login === $loginTest && $password === $passwordTest) { 131 if ($login === $loginTest && $password === $passwordTest) {
117 self::banLoginOk(); 132 self::banLoginOk();
133
134 self::init($longlastingsession);
135
118 // Generate unique random number to sign forms (HMAC) 136 // Generate unique random number to sign forms (HMAC)
119 $_SESSION['uid'] = sha1(uniqid('', true).'_'.mt_rand()); 137 $_SESSION['uid'] = sha1(uniqid('', true).'_'.mt_rand());
120 $_SESSION['ip'] = self::_allIPs(); 138 $_SESSION['ip'] = self::_allIPs();
@@ -135,6 +153,7 @@ class Session
135 self::banLoginFailed(); 153 self::banLoginFailed();
136 } 154 }
137 155
156 self::init();
138 return false; 157 return false;
139 } 158 }
140 159
@@ -143,7 +162,14 @@ class Session
143 */ 162 */
144 public static function logout() 163 public static function logout()
145 { 164 {
146 unset($_SESSION['uid'],$_SESSION['ip'],$_SESSION['expires_on'],$_SESSION['tokens'], $_SESSION['login'], $_SESSION['pass'], $_SESSION['longlastingsession'], $_SESSION['poche_user']); 165 // unset($_SESSION['uid'],$_SESSION['ip'],$_SESSION['expires_on'],$_SESSION['tokens'], $_SESSION['login'], $_SESSION['pass'], $_SESSION['longlastingsession'], $_SESSION['poche_user']);
166
167 // Destruction du cookie (le code peut paraître complexe mais c'est pour être certain de reprendre les mêmes paramètres)
168 $args = array_merge(array(session_name(), ''), array_values(session_get_cookie_params()));
169 $args[2] = time() - 3600;
170 call_user_func_array('setcookie', $args);
171 // Suppression physique de la session
172 session_destroy();
147 } 173 }
148 174
149 /** 175 /**
@@ -157,7 +183,7 @@ class Session
157 || (self::$disableSessionProtection === false 183 || (self::$disableSessionProtection === false
158 && $_SESSION['ip'] !== self::_allIPs()) 184 && $_SESSION['ip'] !== self::_allIPs())
159 || time() >= $_SESSION['expires_on']) { 185 || time() >= $_SESSION['expires_on']) {
160 self::logout(); 186 //self::logout();
161 187
162 return false; 188 return false;
163 } 189 }
diff --git a/inc/3rdparty/class.messages.php b/inc/3rdparty/class.messages.php
index e60bd3a1..27c28f43 100644
--- a/inc/3rdparty/class.messages.php
+++ b/inc/3rdparty/class.messages.php
@@ -59,6 +59,7 @@ class Messages {
59 $this->msgId = md5(uniqid()); 59 $this->msgId = md5(uniqid());
60 60
61 // Create the session array if it doesn't already exist 61 // Create the session array if it doesn't already exist
62 settype($_SESSION, 'array');
62 if( !array_key_exists('flash_messages', $_SESSION) ) $_SESSION['flash_messages'] = array(); 63 if( !array_key_exists('flash_messages', $_SESSION) ) $_SESSION['flash_messages'] = array();
63 64
64 } 65 }
@@ -228,4 +229,4 @@ class Messages {
228 229
229 230
230} // end class 231} // end class
231?> \ No newline at end of file 232?>
diff --git a/inc/3rdparty/config.php b/inc/3rdparty/config.php
index e618117b..ec680d86 100755
--- a/inc/3rdparty/config.php
+++ b/inc/3rdparty/config.php
@@ -19,7 +19,7 @@ if (!isset($options)) $options = new stdClass();
19// Enable service 19// Enable service
20// ---------------------- 20// ----------------------
21// Set this to false if you want to disable the service. 21// Set this to false if you want to disable the service.
22// If set to false, no feed is produced and users will 22// If set to false, no feed is produced and users will
23// be told that the service is disabled. 23// be told that the service is disabled.
24$options->enabled = true; 24$options->enabled = true;
25 25
@@ -43,10 +43,64 @@ $options->default_entries = 5;
43// ---------------------- 43// ----------------------
44// The maximum number of feed items to process when no access key is supplied. 44// The maximum number of feed items to process when no access key is supplied.
45// This limits the user-supplied &max=x value. For example, if the user 45// This limits the user-supplied &max=x value. For example, if the user
46// asks for 20 items to be processed (&max=20), if max_entries is set to 46// asks for 20 items to be processed (&max=20), if max_entries is set to
47// 10, only 10 will be processed. 47// 10, only 10 will be processed.
48$options->max_entries = 10; 48$options->max_entries = 10;
49 49
50// Full content
51// ----------------------
52// By default Full-Text RSS includes the extracted content in the output.
53// You can exclude this from the output by passing '&content=0' in the querystring.
54//
55// Possible values...
56// Always include: true
57// Never include: false
58// Include unless user overrides (&content=0): 'user' (default)
59//
60// Note: currently this does not disable full content extraction. It simply omits it
61// from the output.
62$options->content = 'user';
63
64// Excerpts
65// ----------------------
66// By default Full-Text RSS does not include excerpts in the output.
67// You can enable this by passing '&summary=1' in the querystring.
68// This will include a plain text excerpt from the extracted content.
69//
70// Possible values...
71// Always include: true (recommended for new users)
72// Never include: false
73// Don't include unless user overrides (&summary=1): 'user' (default)
74//
75// Important: if both content and excerpts are requested, the excerpt will be
76// placed in the description element and the full content inside content:encoded.
77// If excerpts are not requested, the full content will go inside the description element.
78//
79// Why are we not returning both excerpts and content by default?
80// Mainly for backward compatibility.
81// Excerpts should appear in the feed item's description element. Previous versions
82// of Full-Text RSS did not return excerpts, so the description element was always
83// used for the full content (as recommended by the RSS advisory). When returning both,
84// we need somewhere else to place the content (content:encoded).
85// Having both enabled should not create any problems for news readers, but it may create
86// problems for developers upgrading from one of our earlier versions who may now find
87// their applications are returning excerpts instead of the full content they were
88// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,
89// excerpts must be explicitly requested in the querystring by default.
90//
91// Why not use a different element name for excerpts?
92// According to the RSS advisory:
93// "Publishers who employ summaries should store the summary in description and
94// the full content in content:encoded, ordering description first within the item.
95// On items with no summary, the full content should be stored in description."
96// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded
97//
98// For more consistent element naming, we recommend new users set this option to true.
99// The full content can still be excluded via the querystring, but the element names
100// will not change: when $options->summary = true, the description element will always
101// be reserved for the excerpt and content:encoded always for full content.
102$options->summary = 'user';
103
50// Rewrite relative URLs 104// Rewrite relative URLs
51// ---------------------- 105// ----------------------
52// With this enabled relative URLs found in the extracted content 106// With this enabled relative URLs found in the extracted content
@@ -67,7 +121,7 @@ $options->exclude_items_on_fail = 'user';
67// Enable multi-page support 121// Enable multi-page support
68// ------------------------- 122// -------------------------
69// If enabled, we will try to follow next page links on multi-page articles. 123// If enabled, we will try to follow next page links on multi-page articles.
70// Currently this only happens for sites where next_page_link has been defined 124// Currently this only happens for sites where next_page_link has been defined
71// in a site config file. 125// in a site config file.
72$options->multipage = true; 126$options->multipage = true;
73 127
@@ -125,10 +179,10 @@ $options->detect_language = 1;
125 179
126// Registration key 180// Registration key
127// --------------- 181// ---------------
128// The registration key is optional. It is not required to use Full-Text RSS, 182// The registration key is optional. It is not required to use Full-Text RSS,
129// and does not affect the normal operation of Full-Text RSS. It is currently 183// and does not affect the normal operation of Full-Text RSS. It is currently
130// only used on admin pages which help you update site patterns with the 184// only used on admin pages which help you update site patterns with the
131// latest version offered by FiveFilters.org. For these admin-related 185// latest version offered by FiveFilters.org. For these admin-related
132// tasks to complete, we will require a valid registration key. 186// tasks to complete, we will require a valid registration key.
133// If you would like one, you can purchase the latest version of Full-Text RSS 187// If you would like one, you can purchase the latest version of Full-Text RSS
134// at http://fivefilters.org/content-only/ 188// at http://fivefilters.org/content-only/
@@ -144,12 +198,12 @@ $options->registration_key = '';
144// ---------------------- 198// ----------------------
145// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials. 199// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials.
146// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages. 200// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages.
147// If no password or username is set, pages requiring admin privileges will be inaccessible. 201// If no password or username is set, pages requiring admin privileges will be inaccessible.
148// The default username is 'admin'. 202// The default username is 'admin'.
149// If overriding with an environment variable, separate username and password with a colon, e.g.: 203// If overriding with an environment variable, separate username and password with a colon, e.g.:
150// ftr_admin_credentials: admin:my-secret-password 204// ftr_admin_credentials: admin:my-secret-password
151// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password'); 205// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');
152$options->admin_credentials = array('username'=>'admin', 'password'=>'admin'); 206$options->admin_credentials = array('username'=>'admin', 'password'=>'');
153 207
154// URLs to allow 208// URLs to allow
155// ---------------------- 209// ----------------------
@@ -178,12 +232,12 @@ $options->key_required = false;
178// ---------------------- 232// ----------------------
179// By default, when processing feeds, we assume item titles in the feed 233// By default, when processing feeds, we assume item titles in the feed
180// have not been truncated. So after processing web pages, the extracted titles 234// have not been truncated. So after processing web pages, the extracted titles
181// are not used in the generated feed. If you prefer to have extracted titles in 235// are not used in the generated feed. If you prefer to have extracted titles in
182// the feed you can either set this to false, in which case we will always favour 236// the feed you can either set this to false, in which case we will always favour
183// extracted titles. Alternatively, if set to 'user' (default) we'll use the 237// extracted titles. Alternatively, if set to 'user' (default) we'll use the
184// extracted title if you pass '&use_extracted_title' in the querystring. 238// extracted title if you pass '&use_extracted_title' in the querystring.
185// Possible values: 239// Possible values:
186// * Favour feed titles: true 240// * Favour feed titles: true
187// * Favour extracted titles: false 241// * Favour extracted titles: false
188// * Favour feed titles with user override: 'user' (default) 242// * Favour feed titles with user override: 'user' (default)
189// Note: this has no effect when the input URL is to a web page - in these cases 243// Note: this has no effect when the input URL is to a web page - in these cases
@@ -192,17 +246,17 @@ $options->favour_feed_titles = 'user';
192 246
193// Access keys (password protected access) 247// Access keys (password protected access)
194// ------------------------------------ 248// ------------------------------------
195// NOTE: You do not need an API key from fivefilters.org to run your own 249// NOTE: You do not need an API key from fivefilters.org to run your own
196// copy of the code. This is here if you'd like to restrict access to 250// copy of the code. This is here if you'd like to restrict access to
197// _your_ copy. 251// _your_ copy.
198// Keys let you group users - those with a key and those without - and 252// Keys let you group users - those with a key and those without - and
199// restrict access to the service to those without a key. 253// restrict access to the service to those without a key.
200// If you want everyone to access the service in the same way, you can 254// If you want everyone to access the service in the same way, you can
201// leave the array below empty and ignore the access key options further down. 255// leave the array below empty and ignore the access key options further down.
202// The options further down let you control how the service should behave 256// The options further down let you control how the service should behave
203// in each mode. 257// in each mode.
204// Note: Explicitly including the index number (1 and 2 in the examples below) 258// Note: Explicitly including the index number (1 and 2 in the examples below)
205// is highly recommended (when generating feeds, we encode the key and 259// is highly recommended (when generating feeds, we encode the key and
206// refer to it by index number and hash). 260// refer to it by index number and hash).
207$options->api_keys = array(); 261$options->api_keys = array();
208// Example: 262// Example:
@@ -232,13 +286,13 @@ $options->max_entries_with_key = 10;
232// filter the resulting HTML for XSS attacks, making it redundant for 286// filter the resulting HTML for XSS attacks, making it redundant for
233// Full-Text RSS to do the same. Similarly with frameworks/CMS which display 287// Full-Text RSS to do the same. Similarly with frameworks/CMS which display
234// feed content - the content should be treated like any other user-submitted content. 288// feed content - the content should be treated like any other user-submitted content.
235// 289//
236// If you are writing an application yourself which is processing feeds generated by 290// If you are writing an application yourself which is processing feeds generated by
237// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks 291// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks
238// or enable this option. This might be useful if you are processing our generated 292// or enable this option. This might be useful if you are processing our generated
239// feeds with JavaScript on the client side - although there's client side xss 293// feeds with JavaScript on the client side - although there's client side xss
240// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer 294// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer
241// 295//
242// If enabled, we'll pass retrieved HTML content through htmLawed with 296// If enabled, we'll pass retrieved HTML content through htmLawed with
243// safe flag on and style attributes denied, see 297// safe flag on and style attributes denied, see
244// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6 298// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6
@@ -253,8 +307,8 @@ $options->xss_filter = 'user';
253// Allowed parsers 307// Allowed parsers
254// ---------------------- 308// ----------------------
255// Full-Text RSS attempts to use PHP's libxml extension to process HTML. 309// Full-Text RSS attempts to use PHP's libxml extension to process HTML.
256// While fast, on some sites it may not always produce good results. 310// While fast, on some sites it may not always produce good results.
257// For these sites, you can specify an alternative HTML parser: 311// For these sites, you can specify an alternative HTML parser:
258// parser: html5lib 312// parser: html5lib
259// The html5lib parser is bundled with Full-Text RSS. 313// The html5lib parser is bundled with Full-Text RSS.
260// see http://code.google.com/p/html5lib/ 314// see http://code.google.com/p/html5lib/
@@ -273,7 +327,7 @@ $options->cors = false;
273 327
274// Use APC user cache? 328// Use APC user cache?
275// ---------------------- 329// ----------------------
276// If enabled we will store site config files (when requested 330// If enabled we will store site config files (when requested
277// for the first time) in APC's user cache. Keys prefixed with 'sc.' 331// for the first time) in APC's user cache. Keys prefixed with 'sc.'
278// This improves performance by reducing disk access. 332// This improves performance by reducing disk access.
279// Note: this has no effect if APC is unavailable on your server. 333// Note: this has no effect if APC is unavailable on your server.
@@ -346,7 +400,7 @@ $options->rewrite_url = array(
346// Valid actions: 400// Valid actions:
347// * 'exclude' - exclude this item from the result 401// * 'exclude' - exclude this item from the result
348// * 'link' - create HTML link to the item 402// * 'link' - create HTML link to the item
349$options->content_type_exc = array( 403$options->content_type_exc = array(
350 'application/pdf' => array('action'=>'link', 'name'=>'PDF'), 404 'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
351 'image' => array('action'=>'link', 'name'=>'Image'), 405 'image' => array('action'=>'link', 'name'=>'Image'),
352 'audio' => array('action'=>'link', 'name'=>'Audio'), 406 'audio' => array('action'=>'link', 'name'=>'Audio'),
@@ -375,13 +429,13 @@ $options->cache_cleanup = 100;
375/// DO NOT CHANGE ANYTHING BELOW THIS /////////// 429/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
376///////////////////////////////////////////////// 430/////////////////////////////////////////////////
377 431
378if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1'); 432if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
379 433
380if (basename(__FILE__) == 'config.php') { 434if (basename(__FILE__) == 'config.php') {
381 if (file_exists(dirname(__FILE__).'/custom_config.php')) { 435 if (file_exists(dirname(__FILE__).'/custom_config.php')) {
382 require_once dirname(__FILE__).'/custom_config.php'; 436 require_once dirname(__FILE__).'/custom_config.php';
383 } 437 }
384 438
385 // check for environment variables - often used on cloud platforms 439 // check for environment variables - often used on cloud platforms
386 // environment variables should be prefixed with 'ftr_', e.g. 440 // environment variables should be prefixed with 'ftr_', e.g.
387 // ftr_max_entries: 1 441 // ftr_max_entries: 1
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php
new file mode 100644
index 00000000..376b6133
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php
@@ -0,0 +1,266 @@
1<?php
2/**
3 * This should be a complete list of all HTML entities, mapped to their UTF-8 character codes.
4 *
5 * @author A. Grandt
6 * @copyright A. Grandt 2009-2013
7 * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else.
8 * @version 3.00
9 */
// Lookup table mapping each named HTML entity (including the leading "&" and
// trailing ";") to the UTF-8 byte sequence of the character it stands for.
// The trailing comment on every row gives the decimal character reference and
// a short description.
//
// The table is declared global so that it ends up in the global scope even
// when this file is included from inside a function or method.
global $htmlEntities;
$htmlEntities = array();

$htmlEntities["&quot;"] = "\x22"; // &#34; ((double) quotation mark)
$htmlEntities["&amp;"] = "\x26"; // &#38; (ampersand)
$htmlEntities["&apos;"] = "\x27"; // &#39; (apostrophe = apostrophe-quote)
$htmlEntities["&lt;"] = "\x3C"; // &#60; (less-than sign)
$htmlEntities["&gt;"] = "\x3E"; // &#62; (greater-than sign)
$htmlEntities["&nbsp;"] = "\xC2\xA0"; // &#160; (non-breaking space)
$htmlEntities["&iexcl;"] = "\xC2\xA1"; // &#161; (inverted exclamation mark)
$htmlEntities["&cent;"] = "\xC2\xA2"; // &#162; (cent)
$htmlEntities["&pound;"] = "\xC2\xA3"; // &#163; (pound)
$htmlEntities["&curren;"] = "\xC2\xA4"; // &#164; (currency)
$htmlEntities["&yen;"] = "\xC2\xA5"; // &#165; (yen)
$htmlEntities["&brvbar;"] = "\xC2\xA6"; // &#166; (broken vertical bar)
$htmlEntities["&sect;"] = "\xC2\xA7"; // &#167; (section)
$htmlEntities["&uml;"] = "\xC2\xA8"; // &#168; (spacing diaeresis)
$htmlEntities["&copy;"] = "\xC2\xA9"; // &#169; (copyright)
$htmlEntities["&ordf;"] = "\xC2\xAA"; // &#170; (feminine ordinal indicator)
$htmlEntities["&laquo;"] = "\xC2\xAB"; // &#171; (angle quotation mark (left))
$htmlEntities["&not;"] = "\xC2\xAC"; // &#172; (negation)
$htmlEntities["&shy;"] = "\xC2\xAD"; // &#173; (soft hyphen)
$htmlEntities["&reg;"] = "\xC2\xAE"; // &#174; (registered trademark)
$htmlEntities["&macr;"] = "\xC2\xAF"; // &#175; (spacing macron)
$htmlEntities["&deg;"] = "\xC2\xB0"; // &#176; (degree)
$htmlEntities["&plusmn;"] = "\xC2\xB1"; // &#177; (plus-or-minus)
$htmlEntities["&sup2;"] = "\xC2\xB2"; // &#178; (superscript 2)
$htmlEntities["&sup3;"] = "\xC2\xB3"; // &#179; (superscript 3)
$htmlEntities["&acute;"] = "\xC2\xB4"; // &#180; (spacing acute)
$htmlEntities["&micro;"] = "\xC2\xB5"; // &#181; (micro)
$htmlEntities["&para;"] = "\xC2\xB6"; // &#182; (paragraph)
$htmlEntities["&middot;"] = "\xC2\xB7"; // &#183; (middle dot)
$htmlEntities["&cedil;"] = "\xC2\xB8"; // &#184; (spacing cedilla)
$htmlEntities["&sup1;"] = "\xC2\xB9"; // &#185; (superscript 1)
$htmlEntities["&ordm;"] = "\xC2\xBA"; // &#186; (masculine ordinal indicator)
$htmlEntities["&raquo;"] = "\xC2\xBB"; // &#187; (angle quotation mark (right))
$htmlEntities["&frac14;"] = "\xC2\xBC"; // &#188; (fraction 1/4)
$htmlEntities["&frac12;"] = "\xC2\xBD"; // &#189; (fraction 1/2)
$htmlEntities["&frac34;"] = "\xC2\xBE"; // &#190; (fraction 3/4)
$htmlEntities["&iquest;"] = "\xC2\xBF"; // &#191; (inverted question mark)
$htmlEntities["&Agrave;"] = "\xC3\x80"; // &#192; (capital a, grave accent)
$htmlEntities["&Aacute;"] = "\xC3\x81"; // &#193; (capital a, acute accent)
$htmlEntities["&Acirc;"] = "\xC3\x82"; // &#194; (capital a, circumflex accent)
$htmlEntities["&Atilde;"] = "\xC3\x83"; // &#195; (capital a, tilde)
$htmlEntities["&Auml;"] = "\xC3\x84"; // &#196; (capital a, umlaut mark)
$htmlEntities["&Aring;"] = "\xC3\x85"; // &#197; (capital a, ring)
$htmlEntities["&AElig;"] = "\xC3\x86"; // &#198; (capital ae)
$htmlEntities["&Ccedil;"] = "\xC3\x87"; // &#199; (capital c, cedilla)
$htmlEntities["&Egrave;"] = "\xC3\x88"; // &#200; (capital e, grave accent)
$htmlEntities["&Eacute;"] = "\xC3\x89"; // &#201; (capital e, acute accent)
$htmlEntities["&Ecirc;"] = "\xC3\x8A"; // &#202; (capital e, circumflex accent)
$htmlEntities["&Euml;"] = "\xC3\x8B"; // &#203; (capital e, umlaut mark)
$htmlEntities["&Igrave;"] = "\xC3\x8C"; // &#204; (capital i, grave accent)
$htmlEntities["&Iacute;"] = "\xC3\x8D"; // &#205; (capital i, acute accent)
$htmlEntities["&Icirc;"] = "\xC3\x8E"; // &#206; (capital i, circumflex accent)
$htmlEntities["&Iuml;"] = "\xC3\x8F"; // &#207; (capital i, umlaut mark)
$htmlEntities["&ETH;"] = "\xC3\x90"; // &#208; (capital eth, Icelandic)
$htmlEntities["&Ntilde;"] = "\xC3\x91"; // &#209; (capital n, tilde)
$htmlEntities["&Ograve;"] = "\xC3\x92"; // &#210; (capital o, grave accent)
$htmlEntities["&Oacute;"] = "\xC3\x93"; // &#211; (capital o, acute accent)
$htmlEntities["&Ocirc;"] = "\xC3\x94"; // &#212; (capital o, circumflex accent)
$htmlEntities["&Otilde;"] = "\xC3\x95"; // &#213; (capital o, tilde)
$htmlEntities["&Ouml;"] = "\xC3\x96"; // &#214; (capital o, umlaut mark)
$htmlEntities["&times;"] = "\xC3\x97"; // &#215; (multiplication)
$htmlEntities["&Oslash;"] = "\xC3\x98"; // &#216; (capital o, slash)
$htmlEntities["&Ugrave;"] = "\xC3\x99"; // &#217; (capital u, grave accent)
$htmlEntities["&Uacute;"] = "\xC3\x9A"; // &#218; (capital u, acute accent)
$htmlEntities["&Ucirc;"] = "\xC3\x9B"; // &#219; (capital u, circumflex accent)
$htmlEntities["&Uuml;"] = "\xC3\x9C"; // &#220; (capital u, umlaut mark)
$htmlEntities["&Yacute;"] = "\xC3\x9D"; // &#221; (capital y, acute accent)
$htmlEntities["&THORN;"] = "\xC3\x9E"; // &#222; (capital THORN, Icelandic)
$htmlEntities["&szlig;"] = "\xC3\x9F"; // &#223; (small sharp s, German)
$htmlEntities["&agrave;"] = "\xC3\xA0"; // &#224; (small a, grave accent)
$htmlEntities["&aacute;"] = "\xC3\xA1"; // &#225; (small a, acute accent)
$htmlEntities["&acirc;"] = "\xC3\xA2"; // &#226; (small a, circumflex accent)
$htmlEntities["&atilde;"] = "\xC3\xA3"; // &#227; (small a, tilde)
$htmlEntities["&auml;"] = "\xC3\xA4"; // &#228; (small a, umlaut mark)
$htmlEntities["&aring;"] = "\xC3\xA5"; // &#229; (small a, ring)
$htmlEntities["&aelig;"] = "\xC3\xA6"; // &#230; (small ae)
$htmlEntities["&ccedil;"] = "\xC3\xA7"; // &#231; (small c, cedilla)
$htmlEntities["&egrave;"] = "\xC3\xA8"; // &#232; (small e, grave accent)
$htmlEntities["&eacute;"] = "\xC3\xA9"; // &#233; (small e, acute accent)
$htmlEntities["&ecirc;"] = "\xC3\xAA"; // &#234; (small e, circumflex accent)
$htmlEntities["&euml;"] = "\xC3\xAB"; // &#235; (small e, umlaut mark)
$htmlEntities["&igrave;"] = "\xC3\xAC"; // &#236; (small i, grave accent)
$htmlEntities["&iacute;"] = "\xC3\xAD"; // &#237; (small i, acute accent)
$htmlEntities["&icirc;"] = "\xC3\xAE"; // &#238; (small i, circumflex accent)
$htmlEntities["&iuml;"] = "\xC3\xAF"; // &#239; (small i, umlaut mark)
$htmlEntities["&eth;"] = "\xC3\xB0"; // &#240; (small eth, Icelandic)
$htmlEntities["&ntilde;"] = "\xC3\xB1"; // &#241; (small n, tilde)
$htmlEntities["&ograve;"] = "\xC3\xB2"; // &#242; (small o, grave accent)
$htmlEntities["&oacute;"] = "\xC3\xB3"; // &#243; (small o, acute accent)
$htmlEntities["&ocirc;"] = "\xC3\xB4"; // &#244; (small o, circumflex accent)
$htmlEntities["&otilde;"] = "\xC3\xB5"; // &#245; (small o, tilde)
$htmlEntities["&ouml;"] = "\xC3\xB6"; // &#246; (small o, umlaut mark)
$htmlEntities["&divide;"] = "\xC3\xB7"; // &#247; (division)
$htmlEntities["&oslash;"] = "\xC3\xB8"; // &#248; (small o, slash)
$htmlEntities["&ugrave;"] = "\xC3\xB9"; // &#249; (small u, grave accent)
$htmlEntities["&uacute;"] = "\xC3\xBA"; // &#250; (small u, acute accent)
$htmlEntities["&ucirc;"] = "\xC3\xBB"; // &#251; (small u, circumflex accent)
$htmlEntities["&uuml;"] = "\xC3\xBC"; // &#252; (small u, umlaut mark)
$htmlEntities["&yacute;"] = "\xC3\xBD"; // &#253; (small y, acute accent)
$htmlEntities["&thorn;"] = "\xC3\xBE"; // &#254; (small thorn, Icelandic)
$htmlEntities["&yuml;"] = "\xC3\xBF"; // &#255; (small y, umlaut mark)
$htmlEntities["&OElig;"] = "\xC5\x92"; // &#338; (capital ligature OE)
$htmlEntities["&oelig;"] = "\xC5\x93"; // &#339; (small ligature oe)
$htmlEntities["&Scaron;"] = "\xC5\xA0"; // &#352; (capital S with caron)
$htmlEntities["&scaron;"] = "\xC5\xA1"; // &#353; (small S with caron)
$htmlEntities["&Yuml;"] = "\xC5\xB8"; // &#376; (capital Y with diaeres)
$htmlEntities["&fnof;"] = "\xC6\x92"; // &#402; (f with hook)
$htmlEntities["&circ;"] = "\xCB\x86"; // &#710; (modifier letter circumflex accent)
$htmlEntities["&tilde;"] = "\xCB\x9C"; // &#732; (small tilde)
$htmlEntities["&Alpha;"] = "\xCE\x91"; // &#913; (Alpha)
$htmlEntities["&Beta;"] = "\xCE\x92"; // &#914; (Beta)
$htmlEntities["&Gamma;"] = "\xCE\x93"; // &#915; (Gamma)
$htmlEntities["&Delta;"] = "\xCE\x94"; // &#916; (Delta)
$htmlEntities["&Epsilon;"] = "\xCE\x95"; // &#917; (Epsilon)
$htmlEntities["&Zeta;"] = "\xCE\x96"; // &#918; (Zeta)
$htmlEntities["&Eta;"] = "\xCE\x97"; // &#919; (Eta)
$htmlEntities["&Theta;"] = "\xCE\x98"; // &#920; (Theta)
$htmlEntities["&Iota;"] = "\xCE\x99"; // &#921; (Iota)
$htmlEntities["&Kappa;"] = "\xCE\x9A"; // &#922; (Kappa)
$htmlEntities["&Lambda;"] = "\xCE\x9B"; // &#923; (Lambda)
$htmlEntities["&Mu;"] = "\xCE\x9C"; // &#924; (Mu)
$htmlEntities["&Nu;"] = "\xCE\x9D"; // &#925; (Nu)
$htmlEntities["&Xi;"] = "\xCE\x9E"; // &#926; (Xi)
$htmlEntities["&Omicron;"] = "\xCE\x9F"; // &#927; (Omicron)
$htmlEntities["&Pi;"] = "\xCE\xA0"; // &#928; (Pi)
$htmlEntities["&Rho;"] = "\xCE\xA1"; // &#929; (Rho)
$htmlEntities["&Sigma;"] = "\xCE\xA3"; // &#931; (Sigma)
$htmlEntities["&Tau;"] = "\xCE\xA4"; // &#932; (Tau)
$htmlEntities["&Upsilon;"] = "\xCE\xA5"; // &#933; (Upsilon)
$htmlEntities["&Phi;"] = "\xCE\xA6"; // &#934; (Phi)
$htmlEntities["&Chi;"] = "\xCE\xA7"; // &#935; (Chi)
$htmlEntities["&Psi;"] = "\xCE\xA8"; // &#936; (Psi)
$htmlEntities["&Omega;"] = "\xCE\xA9"; // &#937; (Omega)
$htmlEntities["&alpha;"] = "\xCE\xB1"; // &#945; (alpha)
$htmlEntities["&beta;"] = "\xCE\xB2"; // &#946; (beta)
$htmlEntities["&gamma;"] = "\xCE\xB3"; // &#947; (gamma)
$htmlEntities["&delta;"] = "\xCE\xB4"; // &#948; (delta)
$htmlEntities["&epsilon;"] = "\xCE\xB5"; // &#949; (epsilon)
$htmlEntities["&zeta;"] = "\xCE\xB6"; // &#950; (zeta)
$htmlEntities["&eta;"] = "\xCE\xB7"; // &#951; (eta)
$htmlEntities["&theta;"] = "\xCE\xB8"; // &#952; (theta)
$htmlEntities["&iota;"] = "\xCE\xB9"; // &#953; (iota)
$htmlEntities["&kappa;"] = "\xCE\xBA"; // &#954; (kappa)
$htmlEntities["&lambda;"] = "\xCE\xBB"; // &#955; (lambda)
$htmlEntities["&mu;"] = "\xCE\xBC"; // &#956; (mu)
$htmlEntities["&nu;"] = "\xCE\xBD"; // &#957; (nu)
$htmlEntities["&xi;"] = "\xCE\xBE"; // &#958; (xi)
$htmlEntities["&omicron;"] = "\xCE\xBF"; // &#959; (omicron)
$htmlEntities["&pi;"] = "\xCF\x80"; // &#960; (pi)
$htmlEntities["&rho;"] = "\xCF\x81"; // &#961; (rho)
$htmlEntities["&sigmaf;"] = "\xCF\x82"; // &#962; (sigmaf)
$htmlEntities["&sigma;"] = "\xCF\x83"; // &#963; (sigma)
$htmlEntities["&tau;"] = "\xCF\x84"; // &#964; (tau)
$htmlEntities["&upsilon;"] = "\xCF\x85"; // &#965; (upsilon)
$htmlEntities["&phi;"] = "\xCF\x86"; // &#966; (phi)
$htmlEntities["&chi;"] = "\xCF\x87"; // &#967; (chi)
$htmlEntities["&psi;"] = "\xCF\x88"; // &#968; (psi)
$htmlEntities["&omega;"] = "\xCF\x89"; // &#969; (omega)
$htmlEntities["&thetasym;"] = "\xCF\x91"; // &#977; (theta symbol)
$htmlEntities["&upsih;"] = "\xCF\x92"; // &#978; (upsilon symbol)
$htmlEntities["&piv;"] = "\xCF\x96"; // &#982; (pi symbol)
$htmlEntities["&ensp;"] = "\xE2\x80\x82"; // &#8194; (en space)
$htmlEntities["&emsp;"] = "\xE2\x80\x83"; // &#8195; (em space)
$htmlEntities["&thinsp;"] = "\xE2\x80\x89"; // &#8201; (thin space)
// The next three values are written purely as escape sequences: an invisible
// literal character next to the escape would make the entity expand to the
// character twice.
$htmlEntities["&zwnj;"] = "\xE2\x80\x8C"; // &#8204; (zero width non-joiner)
$htmlEntities["&zwj;"] = "\xE2\x80\x8D"; // &#8205; (zero width joiner)
$htmlEntities["&lrm;"] = "\xE2\x80\x8E"; // &#8206; (left-to-right mark)
$htmlEntities["&rlm;"] = "\xE2\x80\x8F"; // &#8207; (right-to-left mark)
$htmlEntities["&ndash;"] = "\xE2\x80\x93"; // &#8211; (en dash)
$htmlEntities["&mdash;"] = "\xE2\x80\x94"; // &#8212; (em dash)
$htmlEntities["&lsquo;"] = "\xE2\x80\x98"; // &#8216; (left single quotation mark)
$htmlEntities["&rsquo;"] = "\xE2\x80\x99"; // &#8217; (right single quotation mark)
$htmlEntities["&sbquo;"] = "\xE2\x80\x9A"; // &#8218; (single low-9 quotation mark)
$htmlEntities["&ldquo;"] = "\xE2\x80\x9C"; // &#8220; (left double quotation mark)
$htmlEntities["&rdquo;"] = "\xE2\x80\x9D"; // &#8221; (right double quotation mark)
$htmlEntities["&bdquo;"] = "\xE2\x80\x9E"; // &#8222; (double low-9 quotation mark)
$htmlEntities["&dagger;"] = "\xE2\x80\xA0"; // &#8224; (dagger)
$htmlEntities["&Dagger;"] = "\xE2\x80\xA1"; // &#8225; (double dagger)
$htmlEntities["&bull;"] = "\xE2\x80\xA2"; // &#8226; (bullet)
$htmlEntities["&hellip;"] = "\xE2\x80\xA6"; // &#8230; (horizontal ellipsis)
$htmlEntities["&permil;"] = "\xE2\x80\xB0"; // &#8240; (per mille)
$htmlEntities["&prime;"] = "\xE2\x80\xB2"; // &#8242; (minutes or prime)
$htmlEntities["&Prime;"] = "\xE2\x80\xB3"; // &#8243; (seconds or Double Prime)
$htmlEntities["&lsaquo;"] = "\xE2\x80\xB9"; // &#8249; (single left angle quotation)
$htmlEntities["&rsaquo;"] = "\xE2\x80\xBA"; // &#8250; (single right angle quotation)
$htmlEntities["&oline;"] = "\xE2\x80\xBE"; // &#8254; (overline)
$htmlEntities["&frasl;"] = "\xE2\x81\x84"; // &#8260; (fraction slash)
$htmlEntities["&euro;"] = "\xE2\x82\xAC"; // &#8364; (euro)
$htmlEntities["&image;"] = "\xE2\x84\x91"; // &#8465; (blackletter capital I)
$htmlEntities["&weierp;"] = "\xE2\x84\x98"; // &#8472; (script capital P)
$htmlEntities["&real;"] = "\xE2\x84\x9C"; // &#8476; (blackletter capital R)
$htmlEntities["&trade;"] = "\xE2\x84\xA2"; // &#8482; (trademark)
$htmlEntities["&alefsym;"] = "\xE2\x84\xB5"; // &#8501; (alef)
$htmlEntities["&larr;"] = "\xE2\x86\x90"; // &#8592; (left arrow)
$htmlEntities["&uarr;"] = "\xE2\x86\x91"; // &#8593; (up arrow)
$htmlEntities["&rarr;"] = "\xE2\x86\x92"; // &#8594; (right arrow)
$htmlEntities["&darr;"] = "\xE2\x86\x93"; // &#8595; (down arrow)
$htmlEntities["&harr;"] = "\xE2\x86\x94"; // &#8596; (left right arrow)
$htmlEntities["&crarr;"] = "\xE2\x86\xB5"; // &#8629; (carriage return arrow)
$htmlEntities["&lArr;"] = "\xE2\x87\x90"; // &#8656; (left double arrow)
$htmlEntities["&uArr;"] = "\xE2\x87\x91"; // &#8657; (up double arrow)
$htmlEntities["&rArr;"] = "\xE2\x87\x92"; // &#8658; (right double arrow)
$htmlEntities["&dArr;"] = "\xE2\x87\x93"; // &#8659; (down double arrow)
$htmlEntities["&hArr;"] = "\xE2\x87\x94"; // &#8660; (left right double arrow)
$htmlEntities["&forall;"] = "\xE2\x88\x80"; // &#8704; (for all)
$htmlEntities["&part;"] = "\xE2\x88\x82"; // &#8706; (partial differential)
$htmlEntities["&exist;"] = "\xE2\x88\x83"; // &#8707; (there exists)
$htmlEntities["&empty;"] = "\xE2\x88\x85"; // &#8709; (empty set)
$htmlEntities["&nabla;"] = "\xE2\x88\x87"; // &#8711; (backward difference)
$htmlEntities["&isin;"] = "\xE2\x88\x88"; // &#8712; (element of)
$htmlEntities["&notin;"] = "\xE2\x88\x89"; // &#8713; (not an element of)
$htmlEntities["&ni;"] = "\xE2\x88\x8B"; // &#8715; (ni = contains as member)
$htmlEntities["&prod;"] = "\xE2\x88\x8F"; // &#8719; (n-ary product)
$htmlEntities["&sum;"] = "\xE2\x88\x91"; // &#8721; (n-ary summation)
$htmlEntities["&minus;"] = "\xE2\x88\x92"; // &#8722; (minus)
$htmlEntities["&lowast;"] = "\xE2\x88\x97"; // &#8727; (asterisk operator)
$htmlEntities["&radic;"] = "\xE2\x88\x9A"; // &#8730; (square root)
$htmlEntities["&prop;"] = "\xE2\x88\x9D"; // &#8733; (proportional to)
$htmlEntities["&infin;"] = "\xE2\x88\x9E"; // &#8734; (infinity)
$htmlEntities["&ang;"] = "\xE2\x88\xA0"; // &#8736; (angle)
$htmlEntities["&and;"] = "\xE2\x88\xA7"; // &#8743; (logical and)
$htmlEntities["&or;"] = "\xE2\x88\xA8"; // &#8744; (logical or)
$htmlEntities["&cap;"] = "\xE2\x88\xA9"; // &#8745; (intersection)
$htmlEntities["&cup;"] = "\xE2\x88\xAA"; // &#8746; (union)
$htmlEntities["&int;"] = "\xE2\x88\xAB"; // &#8747; (integral)
$htmlEntities["&there4;"] = "\xE2\x88\xB4"; // &#8756; (therefore)
$htmlEntities["&sim;"] = "\xE2\x88\xBC"; // &#8764; (similar to)
$htmlEntities["&cong;"] = "\xE2\x89\x85"; // &#8773; (congruent to)
$htmlEntities["&asymp;"] = "\xE2\x89\x88"; // &#8776; (approximately equal)
$htmlEntities["&ne;"] = "\xE2\x89\xA0"; // &#8800; (not equal)
$htmlEntities["&equiv;"] = "\xE2\x89\xA1"; // &#8801; (equivalent)
$htmlEntities["&le;"] = "\xE2\x89\xA4"; // &#8804; (less or equal)
$htmlEntities["&ge;"] = "\xE2\x89\xA5"; // &#8805; (greater or equal)
$htmlEntities["&sub;"] = "\xE2\x8A\x82"; // &#8834; (subset of)
$htmlEntities["&sup;"] = "\xE2\x8A\x83"; // &#8835; (superset of)
$htmlEntities["&nsub;"] = "\xE2\x8A\x84"; // &#8836; (not subset of)
$htmlEntities["&sube;"] = "\xE2\x8A\x86"; // &#8838; (subset or equal)
$htmlEntities["&supe;"] = "\xE2\x8A\x87"; // &#8839; (superset or equal)
$htmlEntities["&oplus;"] = "\xE2\x8A\x95"; // &#8853; (circled plus)
// U+2297 encodes as E2 8A 97; the previous value E2 8A 87 was U+2287 (&supe;).
$htmlEntities["&otimes;"] = "\xE2\x8A\x97"; // &#8855; (circled times)
$htmlEntities["&perp;"] = "\xE2\x8A\xA5"; // &#8869; (perpendicular)
// U+22C5 encodes as E2 8B 85; the previous value E2 8C 85 was U+2305.
$htmlEntities["&sdot;"] = "\xE2\x8B\x85"; // &#8901; (dot operator)
$htmlEntities["&lceil;"] = "\xE2\x8C\x88"; // &#8968; (left ceiling)
$htmlEntities["&rceil;"] = "\xE2\x8C\x89"; // &#8969; (right ceiling)
$htmlEntities["&lfloor;"] = "\xE2\x8C\x8A"; // &#8970; (left floor)
$htmlEntities["&rfloor;"] = "\xE2\x8C\x8B"; // &#8971; (right floor)
$htmlEntities["&lang;"] = "\xE2\x8C\xA9"; // &#9001; (left angle bracket = bra)
$htmlEntities["&rang;"] = "\xE2\x8C\xAA"; // &#9002; (right angle bracket = ket)
$htmlEntities["&loz;"] = "\xE2\x97\x8A"; // &#9674; (lozenge)
$htmlEntities["&spades;"] = "\xE2\x99\xA0"; // &#9824; (spade)
$htmlEntities["&clubs;"] = "\xE2\x99\xA3"; // &#9827; (club)
$htmlEntities["&hearts;"] = "\xE2\x99\xA5"; // &#9829; (heart)
$htmlEntities["&diams;"] = "\xE2\x99\xA6"; // &#9830; (diamond)
266?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.NCX.php b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php
new file mode 100644
index 00000000..e5da05cd
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php
@@ -0,0 +1,782 @@
1<?php
2/**
3 * ePub NCX file structure
4 *
5 * @author A. Grandt <php@grandt.com>
6 * @copyright 2009-2014 A. Grandt
7 * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else.
8 * @version 3.20
9 */
/**
 * Builder for the table of contents of an ePub book.
 *
 * Holds a NavMap tree of NavPoints plus document metadata and renders them
 * either as an ePub 2 NCX document (finalize) or as an ePub 3 XHTML
 * navigation document (finalizeEPub3).
 *
 * NOTE: every string (title, author, uid, meta values, reference names and
 * hrefs) is concatenated into the generated markup verbatim. Nothing in this
 * class XML-escapes them, so callers must supply markup-safe text.
 */
class Ncx {
    const _VERSION = 3.20;

    const MIMETYPE = "application/x-dtbncx+xml";

    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    private $navMap = NULL;
    private $uid = NULL;
    private $meta = array();
    private $docTitle = NULL;
    private $docAuthor = NULL;

    // Level new NavPoints are attached to, and the level most recently
    // created or left (used by subLevel() to descend).
    private $currentLevel = NULL;
    private $lastLevel = NULL;

    private $languageCode = "en";
    private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT;

    public $chapterList = array();
    public $referencesTitle = "Guide";
    public $referencesClass = "references";
    public $referencesId = "references";
    // Reference type => href.
    public $referencesList = array();
    // Reference type => custom display name (optional per type).
    public $referencesName = array();
    // Reference type => default descriptive name; its iteration order is the
    // output order. NULL until assigned by the caller.
    public $referencesOrder = NULL;

    /**
     * Class constructor.
     *
     * @param string $uid
     * @param string $docTitle
     * @param string $docAuthor
     * @param string $languageCode
     * @param string $writingDirection
     */
    public function __construct($uid = NULL, $docTitle = NULL, $docAuthor = NULL, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) {
        $this->navMap = new NavMap($writingDirection);
        $this->currentLevel = $this->navMap;
        $this->setUid($uid);
        $this->setDocTitle($docTitle);
        $this->setDocAuthor($docAuthor);
        $this->setLanguageCode($languageCode);
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->bookVersion, $this->navMap, $this->uid, $this->meta);
        unset($this->docTitle, $this->docAuthor, $this->currentLevel, $this->lastLevel);
        unset($this->languageCode, $this->writingDirection, $this->chapterList, $this->referencesTitle);
        unset($this->referencesClass, $this->referencesId, $this->referencesList, $this->referencesName);
        unset($this->referencesOrder);
    }

    /**
     * Set the ePub version this table of contents is generated for.
     *
     * A non-string argument selects EPub::BOOK_VERSION_EPUB2.
     *
     * @param string $bookVersion
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     *
     * @return bool TRUE if the book is set to type ePub 2
     */
    public function isEPubVersion2() {
        return $this->bookVersion === EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set the unique identifier written to the "dtb:uid" meta element.
     *
     * A non-string argument clears the value (NULL).
     *
     * @param string $uid
     */
    public function setUid($uid) {
        $this->uid = is_string($uid) ? trim($uid) : NULL;
    }

    /**
     * Set the document title. A non-string argument clears the value (NULL).
     *
     * @param string $docTitle
     */
    public function setDocTitle($docTitle) {
        $this->docTitle = is_string($docTitle) ? trim($docTitle) : NULL;
    }

    /**
     * Set the document author. A non-string argument clears the value (NULL).
     *
     * @param string $docAuthor
     */
    public function setDocAuthor($docAuthor) {
        $this->docAuthor = is_string($docAuthor) ? trim($docAuthor) : NULL;
    }

    /**
     * Set the language code used for the xml:lang / lang attributes.
     *
     * A non-string argument selects "en".
     *
     * @param string $languageCode
     */
    public function setLanguageCode($languageCode) {
        $this->languageCode = is_string($languageCode) ? trim($languageCode) : "en";
    }

    /**
     * Set the writing direction used for the dir attribute.
     *
     * A non-string argument selects EPub::DIRECTION_LEFT_TO_RIGHT.
     *
     * @param string $writingDirection
     */
    public function setWritingDirection($writingDirection) {
        $this->writingDirection = is_string($writingDirection) ? trim($writingDirection) : EPub::DIRECTION_LEFT_TO_RIGHT;
    }

    /**
     * Replace the navigation map. Anything that is not a NavMap is ignored.
     *
     * @param NavMap $navMap
     */
    public function setNavMap($navMap) {
        if ($navMap instanceof NavMap) {
            $this->navMap = $navMap;
        }
    }

    /**
     * Add one chapter level.
     *
     * Subsequent chapters will be added to this level.
     *
     * When both $navTitle and $navClass are given, a new NavPoint is created
     * and attached to the current level first. Afterwards the current level
     * becomes the last level (if there is one).
     *
     * @param string $navTitle
     * @param string $navId
     * @param string $navClass
     * @param bool   $isNavHidden
     * @param string $writingDirection
     * @return NavPoint|bool the created NavPoint, or FALSE when none was created
     */
    public function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
        $navPoint = FALSE;
        if (isset($navTitle) && isset($navClass)) {
            $navPoint = new NavPoint($navTitle, NULL, $navId, $navClass, $isNavHidden, $writingDirection);
            $this->addNavPoint($navPoint);
        }
        if ($this->lastLevel !== NULL) {
            $this->currentLevel = $this->lastLevel;
        }
        return $navPoint;
    }

    /**
     * Step back one chapter level.
     *
     * Subsequent chapters will be added to this chapters parent level.
     */
    public function backLevel() {
        $this->lastLevel = $this->currentLevel;
        $this->currentLevel = $this->currentLevel->getParent();
    }

    /**
     * Step back to the root level.
     *
     * Subsequent chapters will be added to the root NavMap.
     */
    public function rootLevel() {
        $this->lastLevel = $this->currentLevel;
        $this->currentLevel = $this->navMap;
    }

    /**
     * Step back to the given level.
     * Useful for returning to a previous level from deep within the structure.
     * Values below 2 will have the same effect as rootLevel()
     *
     * @param int $newLevel
     */
    public function setCurrentLevel($newLevel) {
        if ($newLevel <= 1) {
            $this->rootLevel();
            return;
        }
        while ($this->currentLevel->getLevel() > $newLevel) {
            $this->backLevel();
        }
    }

    /**
     * Get current level count.
     * The indentation of the current structure point.
     *
     * @return int level reported by the current NavMap / NavPoint
     */
    public function getCurrentLevel() {
        return $this->currentLevel->getLevel();
    }

    /**
     * Add child NavPoints to current level.
     *
     * The value returned by the current level's addNavPoint() is remembered
     * as the last level.
     *
     * @param NavPoint $navPoint
     */
    public function addNavPoint($navPoint) {
        $this->lastLevel = $this->currentLevel->addNavPoint($navPoint);
    }

    /**
     * Get the navigation map holding the root of the chapter tree.
     *
     * @return NavMap
     */
    public function getNavMap() {
        return $this->navMap;
    }

    /**
     * Register an additional meta element for the NCX head.
     *
     * Both arguments are trimmed; the entry is skipped when either one is
     * not a string or is empty after trimming.
     *
     * @param string $name
     * @param string $content
     */
    public function addMetaEntry($name, $content) {
        $name = is_string($name) ? trim($name) : NULL;
        $content = is_string($content) ? trim($content) : NULL;

        if ($name != NULL && $content != NULL) {
            $this->meta[] = array($name => $content);
        }
    }

    /**
     * Render the NCX document.
     *
     * @return string the complete NCX XML
     */
    public function finalize() {
        // Must run before getNavLevels() is read below: the depth is
        // computed while the NavMap is being finalized.
        $nav = $this->navMap->finalize();

        $ncx = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
        if ($this->isEPubVersion2()) {
            $ncx .= "<!DOCTYPE ncx PUBLIC \"-//NISO//DTD ncx 2005-1//EN\"\n"
                . "   \"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd\">\n";
        }
        $ncx .= "<ncx xmlns=\"http://www.daisy.org/z3986/2005/ncx/\" version=\"2005-1\" xml:lang=\"" . $this->languageCode . "\" dir=\"" . $this->writingDirection . "\">\n"
            . "\t<head>\n"
            . "\t\t<meta name=\"dtb:uid\" content=\"" . $this->uid . "\" />\n"
            . "\t\t<meta name=\"dtb:depth\" content=\"" . $this->navMap->getNavLevels() . "\" />\n"
            . "\t\t<meta name=\"dtb:totalPageCount\" content=\"0\" />\n"
            . "\t\t<meta name=\"dtb:maxPageNumber\" content=\"0\" />\n";

        // Each stored entry is a single-pair array(name => content).
        // foreach replaces each(), which was removed in PHP 8.0.
        foreach ($this->meta as $metaEntry) {
            foreach ($metaEntry as $name => $content) {
                $ncx .= "\t\t<meta name=\"" . $name . "\" content=\"" . $content . "\" />\n";
            }
        }

        $ncx .= "\t</head>\n\n\t<docTitle>\n\t\t<text>"
            . $this->docTitle
            . "</text>\n\t</docTitle>\n\n\t<docAuthor>\n\t\t<text>"
            . $this->docAuthor
            . "</text>\n\t</docAuthor>\n\n"
            . $nav;

        return $ncx . "</ncx>\n";
    }

    /**
     * Render the ePub 3 XHTML navigation document.
     *
     * @param string $title       heading shown above the table of contents
     * @param string $cssFileName optional stylesheet href; omitted when NULL
     * @return string the complete XHTML document
     */
    public function finalizeEPub3($title = "Table of Contents", $cssFileName = NULL) {
        $end = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\"\n"
            . "      xmlns:epub=\"http://www.idpf.org/2007/ops\"\n"
            . "      xml:lang=\"" . $this->languageCode . "\" lang=\"" . $this->languageCode . "\" dir=\"" . $this->writingDirection . "\">\n"
            . "\t<head>\n"
            . "\t\t<title>" . $this->docTitle . "</title>\n"
            . "\t\t<meta http-equiv=\"default-style\" content=\"text/html; charset=utf-8\"/>\n";
        if ($cssFileName !== NULL) {
            $end .= "\t\t<link rel=\"stylesheet\" href=\"" . $cssFileName . "\" type=\"text/css\"/>\n";
        }
        $end .= "\t</head>\n"
            . "\t<body epub:type=\"frontmatter toc\">\n"
            . "\t\t<header>\n"
            . "\t\t\t<h1>" . $title . "</h1>\n"
            . "\t\t</header>\n"
            . $this->navMap->finalizeEPub3()
            . $this->finalizeEPub3Landmarks()
            . "\t</body>\n"
            . "</html>\n";

        return $end;
    }

    /**
     * Build the references for the ePub 2 toc.
     * These are merely reference pages added to the end of the navMap though.
     *
     * When at least one reference is registered, a sub level named after
     * $referencesTitle is opened at the root and one NavPoint per resolved
     * reference is added, with ids "ref-1", "ref-2", ...
     *
     * @return void
     */
    public function finalizeReferences() {
        if (!is_array($this->referencesList) || count($this->referencesList) === 0) {
            return;
        }

        $this->rootLevel();
        $this->subLevel($this->referencesTitle, $this->referencesId, $this->referencesClass);

        $refId = 1;
        foreach ($this->resolveReferences() as $reference) {
            $navPoint = new NavPoint($reference['name'], $reference['href'], "ref-" . $refId++);
            $this->addNavPoint($navPoint);
        }
    }

    /**
     * Build the landmarks for the ePub 3 toc.
     *
     * @return string the landmarks nav element, or "" when no reference resolves
     */
    public function finalizeEPub3Landmarks() {
        $references = $this->resolveReferences();
        if (count($references) === 0) {
            return "";
        }

        $lm = "\t\t\t<nav epub:type=\"landmarks\">\n"
            . "\t\t\t\t<h2"
            . ($this->writingDirection === EPub::DIRECTION_RIGHT_TO_LEFT ? " dir=\"rtl\"" : "")
            . ">" . $this->referencesTitle . "</h2>\n"
            . "\t\t\t\t<ol>\n";

        foreach ($references as $reference) {
            $lm .= "\t\t\t\t\t<li><a epub:type=\""
                . $reference['type']
                . "\" href=\"" . $reference['href'] . "\">"
                . $reference['name']
                . "</a></li>\n";
        }

        return $lm
            . "\t\t\t\t</ol>\n"
            . "\t\t\t</nav>\n";
    }

    /**
     * Collect the registered references in $referencesOrder order.
     *
     * Uses foreach rather than each(): each() no longer exists in PHP 8.0,
     * and it advanced the array's internal pointer without resetting it, so
     * only the first of several passes over $referencesOrder saw any entry.
     *
     * @return array list of array('type' => ..., 'href' => ..., 'name' => ...)
     */
    private function resolveReferences() {
        $resolved = array();
        if (!is_array($this->referencesList) || !is_array($this->referencesOrder)) {
            return $resolved;
        }

        foreach ($this->referencesOrder as $item => $descriptive) {
            if (!array_key_exists($item, $this->referencesList)) {
                continue;
            }
            // A custom name wins over the default descriptive name.
            $resolved[] = array(
                'type' => $item,
                'href' => $this->referencesList[$item],
                'name' => empty($this->referencesName[$item]) ? $descriptive : $this->referencesName[$item],
            );
        }
        return $resolved;
    }
}
378
/**
 * ePub NavMap class
 *
 * Root of the navigation tree: keeps the top-level NavPoints and renders
 * them as an ePub 2 navMap element or an ePub 3 toc nav element.
 */
class NavMap {
    const _VERSION = 3.00;

    private $navPoints = array();
    private $navLevels = 0;
    private $writingDirection = NULL;

    /**
     * Class constructor.
     *
     * @param string $writingDirection direction handed to child NavPoints that have none
     */
    function __construct($writingDirection = NULL) {
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset($this->navPoints, $this->navLevels, $this->writingDirection);
    }

    /**
     * Set the writing direction of this NavMap.
     *
     * The value is trimmed; anything that is not a string resets it to NULL.
     *
     * @param string $writingDirection
     */
    function setWritingDirection($writingDirection) {
        if (is_string($writingDirection)) {
            $this->writingDirection = trim($writingDirection);
        } else {
            $this->writingDirection = NULL;
        }
    }

    /**
     * Get the writing direction of this NavMap.
     *
     * @return string|null
     */
    function getWritingDirection() {
        return $this->writingDirection;
    }

    /**
     * Add a navPoint to the root of the NavMap.
     *
     * Only objects whose class is exactly "NavPoint" are accepted. An
     * accepted NavPoint gets this map as its parent and, if it has no
     * writing direction of its own, the map's writing direction.
     *
     * @param NavPoint $navPoint
     * @return NavPoint|NavMap the added NavPoint, or this NavMap when the argument was rejected
     */
    function addNavPoint($navPoint) {
        $accepted = is_object($navPoint) && get_class($navPoint) === "NavPoint";
        if (!$accepted) {
            return $this;
        }

        $navPoint->setParent($this);
        if ($navPoint->getWritingDirection() == NULL) {
            $navPoint->setWritingDirection($this->writingDirection);
        }
        $this->navPoints[] = $navPoint;

        return $navPoint;
    }

    /**
     * The final max depth for the "dtb:depth" meta attribute
     * Only available after finalize have been called.
     *
     * @return number the depth recorded by the last finalize call, plus one
     */
    function getNavLevels() {
        return 1 + $this->navLevels;
    }

    /**
     * The NavMap is always level 1.
     *
     * @return int
     */
    function getLevel() {
        return 1;
    }

    /**
     * The NavMap is the root, so it is its own parent.
     *
     * @return NavMap
     */
    function getParent() {
        return $this;
    }

    /**
     * Render the ePub 2 navMap element.
     *
     * Resets and recomputes the depth that getNavLevels() reports.
     *
     * @return string
     */
    function finalize() {
        $order = 0;
        $xml = "\t<navMap>\n";

        $this->navLevels = sizeof($this->navPoints) > 0 ? 1 : 0;
        foreach ($this->navPoints as $child) {
            // NavPoint::finalize is not visible from this chunk; presumably it
            // appends to $xml and advances $order by reference.
            $depth = $child->finalize($xml, $order, 0);
            if ($depth > $this->navLevels) {
                $this->navLevels = $depth;
            }
        }

        return $xml . "\t</navMap>\n";
    }

    /**
     * Render the ePub 3 toc nav element.
     *
     * Resets and recomputes the depth that getNavLevels() reports. The
     * ordered list wrapper is only emitted when there is at least one
     * NavPoint.
     *
     * @return string
     */
    function finalizeEPub3() {
        $order = 0;
        $this->navLevels = 0;

        $xml = "\t\t<nav epub:type=\"toc\" id=\"toc\">\n";
        if (sizeof($this->navPoints) === 0) {
            return $xml . "\t\t</nav>\n";
        }

        $this->navLevels = 1;
        $xml .= "\t\t\t<ol epub:type=\"list\">\n";
        foreach ($this->navPoints as $child) {
            // NavPoint::finalizeEPub3 is not visible from this chunk;
            // presumably it appends to $xml by reference.
            $depth = $child->finalizeEPub3($xml, $order, 0);
            if ($depth > $this->navLevels) {
                $this->navLevels = $depth;
            }
        }
        $xml .= "\t\t\t</ol>\n";

        return $xml . "\t\t</nav>\n";
    }
}
504
/**
 * ePub NavPoint class
 *
 * A single entry of the table of contents. NavPoints can be nested through
 * addNavPoint() and are rendered either as NCX <navPoint> elements (finalize)
 * or as ePub 3 <li> elements (finalizeEPub3).
 */
class NavPoint {
    const _VERSION = 3.00;

    private $label = NULL;
    private $contentSrc = NULL;
    private $id = NULL;
    private $navClass = NULL;
    private $isNavHidden = FALSE;
    private $navPoints = array();
    private $parent = NULL;
    // Declared explicitly: it used to be created on the fly by setWritingDirection(),
    // and dynamic properties are deprecated as of PHP 8.2.
    private $writingDirection = NULL;

    /**
     * Class constructor.
     *
     * Only the label is required here. If no id is given, one is generated
     * when the NavPoint is finalized.
     *
     * @param string $label
     * @param string $contentSrc
     * @param string $id
     * @param string $navClass
     * @param bool   $isNavHidden
     * @param string $writingDirection
     */
    public function __construct($label, $contentSrc = NULL, $id = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
        $this->setLabel($label);
        $this->setContentSrc($contentSrc);
        $this->setId($id);
        $this->setNavClass($navClass);
        $this->setNavHidden($isNavHidden);
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->label, $this->contentSrc, $this->id, $this->navClass);
        unset($this->isNavHidden, $this->navPoints, $this->parent, $this->writingDirection);
    }

    /**
     * Set the Text label for the NavPoint.
     *
     * The label is stored trimmed; a non-string value clears it to NULL.
     *
     * @param string $label
     */
    public function setLabel($label) {
        $this->label = is_string($label) ? trim($label) : NULL;
    }

    /**
     * Get the Text label for the NavPoint.
     *
     * @return string Label
     */
    public function getLabel() {
        return $this->label;
    }

    /**
     * Set the src reference for the NavPoint.
     *
     * The src is mandatory for ePub 2: finalize() emits no <navPoint> element
     * for an entry without one.
     *
     * @param string $contentSrc
     */
    public function setContentSrc($contentSrc) {
        $this->contentSrc = isset($contentSrc) && is_string($contentSrc) ? trim($contentSrc) : NULL;
    }

    /**
     * Get the src reference for the NavPoint.
     *
     * @return string content src url.
     */
    public function getContentSrc() {
        return $this->contentSrc;
    }

    /**
     * Set the parent for this NavPoint.
     *
     * Anything that is not exactly a NavPoint or NavMap instance is ignored.
     *
     * @param NavPoint|NavMap $parent
     */
    public function setParent($parent) {
        if ($parent != NULL && is_object($parent) &&
                (get_class($parent) === "NavPoint" || get_class($parent) === "NavMap")) {
            $this->parent = $parent;
        }
    }

    /**
     * Get the parent to this NavPoint.
     *
     * @return NavPoint|NavMap|null NULL when no parent has been set.
     */
    public function getParent() {
        return $this->parent;
    }

    /**
     * Get the current level. 1 = document root.
     *
     * @return int level
     */
    public function getLevel() {
        return $this->parent === NULL ? 1 : $this->parent->getLevel() + 1;
    }

    /**
     * Set the id for the NavPoint.
     *
     * The id must be unique. When left NULL, an id is generated during finalization.
     *
     * @param string $id
     */
    public function setId($id) {
        $this->id = is_string($id) ? trim($id) : NULL;
    }

    /**
     * Set the class to be used for this NavPoint.
     *
     * @param string $navClass
     */
    public function setNavClass($navClass) {
        $this->navClass = isset($navClass) && is_string($navClass) ? trim($navClass) : NULL;
    }

    /**
     * Mark this NavPoint as hidden.
     *
     * Only the boolean TRUE hides the entry; every other value (including 1)
     * leaves it visible. A hidden entry is skipped entirely by finalize().
     *
     * @param bool $isNavHidden
     */
    public function setNavHidden($isNavHidden) {
        $this->isNavHidden = $isNavHidden === TRUE;
    }

    /**
     * Set the writing direction to be used for this NavPoint.
     *
     * @param string $writingDirection
     */
    public function setWritingDirection($writingDirection) {
        $this->writingDirection = isset($writingDirection) && is_string($writingDirection) ? trim($writingDirection) : NULL;
    }

    /**
     * Get the writing direction of this NavPoint.
     *
     * @return string|null
     */
    public function getWritingDirection() {
        return $this->writingDirection;
    }

    /**
     * Add child NavPoints for multi level NavMaps.
     *
     * A child without its own writing direction inherits the one of this NavPoint.
     *
     * @param NavPoint $navPoint
     * @return NavPoint the added child, or $this when the argument was rejected.
     */
    public function addNavPoint($navPoint) {
        if ($navPoint != NULL && is_object($navPoint) && get_class($navPoint) === "NavPoint") {
            $navPoint->setParent($this);
            if ($navPoint->getWritingDirection() == NULL) {
                $navPoint->setWritingDirection($this->writingDirection);
            }
            $this->navPoints[] = $navPoint;
            return $navPoint;
        }
        return $this;
    }

    /**
     * Append the NCX <navPoint> markup for this entry and its children.
     *
     * @param string $nav       output buffer, appended to in place
     * @param int    $playOrder running play order counter, incremented for every emitted entry
     * @param int    $level     nesting level, used for indentation
     * @return int the deepest level reached by this entry or its descendants
     */
    public function finalize(&$nav = "", &$playOrder = 0, $level = 0) {
        $maxLevel = $level;
        $levelAdjust = 0;

        // A hidden entry contributes nothing, and neither do its children.
        if ($this->isNavHidden) {
            return $maxLevel;
        }

        if (isset($this->contentSrc)) {
            $playOrder++;

            if ($this->id == NULL) {
                $this->id = "navpoint-" . $playOrder;
            }
            $nav .= str_repeat("\t", $level) . "\t\t<navPoint id=\"" . $this->id . "\" playOrder=\"" . $playOrder . "\">\n"
                . str_repeat("\t", $level) . "\t\t\t<navLabel>\n"
                . str_repeat("\t", $level) . "\t\t\t\t<text>" . $this->label . "</text>\n"
                . str_repeat("\t", $level) . "\t\t\t</navLabel>\n"
                . str_repeat("\t", $level) . "\t\t\t<content src=\"" . $this->contentSrc . "\" />\n";
        } else {
            // No element is written for this entry; children are still passed one level deeper.
            $levelAdjust++;
        }

        if (sizeof($this->navPoints) > 0) {
            $maxLevel++;
            foreach ($this->navPoints as $navPoint) {
                $retLevel = $navPoint->finalize($nav, $playOrder, ($level + 1 + $levelAdjust));
                if ($retLevel > $maxLevel) {
                    $maxLevel = $retLevel;
                }
            }
        }

        if (isset($this->contentSrc)) {
            $nav .= str_repeat("\t", $level) . "\t\t</navPoint>\n";
        }

        return $maxLevel;
    }

    /**
     * Append the ePub 3 <li> markup for this entry and its children.
     *
     * @param string $nav            output buffer, appended to in place
     * @param int    $playOrder      counter used to build a fallback id
     * @param int    $level          nesting level, used for indentation
     * @param string $subLevelClass  optional class attribute for nested <ol> lists
     * @param bool   $subLevelHidden when truthy, nested <ol> lists get hidden="hidden"
     * @return int the deepest level reached by this entry or its descendants
     */
    public function finalizeEPub3(&$nav = "", &$playOrder = 0, $level = 0, $subLevelClass = NULL, $subLevelHidden = FALSE) {
        $maxLevel = $level;

        // NOTE(review): $playOrder is never incremented in this method, so every entry
        // that reaches this point without an id gets the same generated id. Left as is
        // because changing the numbering could collide with ids assigned by finalize().
        if ($this->id == NULL) {
            $this->id = "navpoint-" . $playOrder;
        }
        $indent = str_repeat("\t", $level) . "\t\t\t\t";

        $nav .= $indent . "<li id=\"" . $this->id . "\"";
        if (isset($this->writingDirection)) {
            $nav .= " dir=\"" . $this->writingDirection . "\"";
        }
        $nav .= ">\n";

        if (isset($this->contentSrc)) {
            $nav .= $indent . "\t<a href=\"" . $this->contentSrc . "\">" . $this->label . "</a>\n";
        } else {
            $nav .= $indent . "\t<span>" . $this->label . "</span>\n";
        }

        if (sizeof($this->navPoints) > 0) {
            $maxLevel++;

            $nav .= $indent . "\t<ol epub:type=\"list\"";
            if (isset($subLevelClass)) {
                $nav .= " class=\"" . $subLevelClass . "\"";
            }
            if ($subLevelHidden) {
                $nav .= " hidden=\"hidden\"";
            }
            $nav .= ">\n";

            foreach ($this->navPoints as $navPoint) {
                // Children are indented two levels deeper: one for the <ol>, one for the <li>.
                $retLevel = $navPoint->finalizeEPub3($nav, $playOrder, ($level + 2), $subLevelClass, $subLevelHidden);
                if ($retLevel > $maxLevel) {
                    $maxLevel = $retLevel;
                }
            }
            $nav .= $indent . "\t</ol>\n";
        }

        $nav .= $indent . "</li>\n";

        return $maxLevel;
    }
}
782?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.OPF.php b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php
new file mode 100644
index 00000000..803a2108
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php
@@ -0,0 +1,1226 @@
1<?php
2/**
3 * ePub OPF file structure
4 *
5 * @author A. Grandt <php@grandt.com>
6 * @copyright 2009-2014 A. Grandt
7 * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else.
8 * @version 3.20
9 */
class Opf {
    const _VERSION = 3.20;

    /* Core Media types.
     * These types are the only guaranteed mime types any ePub reader must understand.
     * Any other type must define a fallback whose fallback chain will end in one of these.
     */
    const TYPE_GIF = "image/gif";
    const TYPE_JPEG = "image/jpeg";
    const TYPE_PNG = "image/png";
    const TYPE_SVG = "image/svg+xml";
    const TYPE_XHTML = "application/xhtml+xml";
    const TYPE_DTBOOK = "application/x-dtbook+xml";
    const TYPE_CSS = "text/css";
    const TYPE_XML = "application/xml";
    const TYPE_OEB1_DOC = "text/x-oeb1-document"; // Deprecated
    const TYPE_OEB1_CSS = "text/x-oeb1-css"; // Deprecated
    const TYPE_NCX = "application/x-dtbncx+xml";

    private $bookVersion = EPub::BOOK_VERSION_EPUB2;
    private $ident = "BookId";

    public $date = NULL;
    public $metadata = NULL;
    public $manifest = NULL;
    public $spine = NULL;
    public $guide = NULL;

    /**
     * Class constructor.
     *
     * Stores the identifier and book version and creates empty metadata,
     * manifest, spine and guide sections.
     *
     * @param string $ident       id used for the package's unique-identifier attribute
     * @param string $bookVersion one of the EPub::BOOK_VERSION_* constants
     */
    public function __construct($ident = "BookId", $bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $this->setIdent($ident);
        $this->setVersion($bookVersion);

        $this->metadata = new Metadata();
        $this->manifest = new Manifest();
        $this->spine = new Spine();
        $this->guide = new Guide();
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->bookVersion, $this->ident, $this->date);
        unset($this->metadata, $this->manifest, $this->spine, $this->guide);
    }

    /**
     * Set the book version written to the package element.
     *
     * A non-string value falls back to EPub::BOOK_VERSION_EPUB2.
     *
     * @param string $bookVersion
     */
    public function setVersion($bookVersion) {
        if (is_string($bookVersion)) {
            $this->bookVersion = trim($bookVersion);
            return;
        }
        $this->bookVersion = EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Tell whether the stored book version equals EPub::BOOK_VERSION_EPUB2.
     *
     * @return bool
     */
    public function isEPubVersion2() {
        $isVersion2 = ($this->bookVersion === EPub::BOOK_VERSION_EPUB2);
        return $isVersion2;
    }

    /**
     * Set the id used for the package's unique-identifier attribute.
     *
     * A non-string value falls back to "BookId".
     *
     * @param string $ident
     */
    public function setIdent($ident = "BookId") {
        if (is_string($ident)) {
            $this->ident = trim($ident);
            return;
        }
        $this->ident = "BookId";
    }

    /**
     * Render the complete OPF document.
     *
     * The guide section is only included when it holds at least one reference.
     *
     * @return string
     */
    public function finalize() {
        $parts = array();
        $parts[] = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
        $parts[] = "<package xmlns=\"http://www.idpf.org/2007/opf\" unique-identifier=\"" . $this->ident . "\" version=\"" . $this->bookVersion . "\">\n";

        $parts[] = $this->metadata->finalize($this->bookVersion, $this->date);
        $parts[] = $this->manifest->finalize($this->bookVersion);
        $parts[] = $this->spine->finalize();

        if ($this->guide->length() > 0) {
            $parts[] = $this->guide->finalize();
        }

        $parts[] = "</package>\n";
        return implode("", $parts);
    }

    // Convenience functions:

    /**
     * Add the title, language and identifier Dublin Core entries.
     *
     * The identifier entry carries this package's ident as its id attribute
     * and the given scheme as an opf attribute.
     *
     * @param string $title
     * @param string $language
     * @param string $identifier
     * @param string $identifierScheme
     */
    public function initialize($title, $language, $identifier, $identifierScheme) {
        $titleDc = new DublinCore("title", $title);
        $this->metadata->addDublinCore($titleDc);

        $languageDc = new DublinCore("language", $language);
        $this->metadata->addDublinCore($languageDc);

        $identifierDc = new DublinCore("identifier", $identifier);
        $identifierDc->addAttr("id", $this->ident);
        $identifierDc->addOpfAttr("scheme", $identifierScheme);
        $this->metadata->addDublinCore($identifierDc);
    }

    /**
     * Add an item to the manifest.
     *
     * @param string $id
     * @param string $href
     * @param string $mediaType
     * @param string $properties
     */
    public function addItem($id, $href, $mediaType, $properties = NULL) {
        $item = new Item($id, $href, $mediaType, $properties);
        $this->manifest->addItem($item);
    }

    /**
     * Add an item reference to the spine.
     *
     * @param string $idref
     * @param bool   $linear
     */
    public function addItemRef($idref, $linear = TRUE) {
        $itemref = new Itemref($idref, $linear);
        $this->spine->addItemref($itemref);
    }

    /**
     * Add a reference to the guide.
     *
     * @param string $type
     * @param string $title
     * @param string $href
     */
    public function addReference($type, $title, $href) {
        $reference = new Reference($type, $title, $href);
        $this->guide->addReference($reference);
    }

    /**
     * Add a Dublin Core entry to the metadata.
     *
     * @param string $name
     * @param string $value
     */
    public function addDCMeta($name, $value) {
        $entry = new DublinCore($name, $value);
        $this->metadata->addDublinCore($entry);
    }

    /**
     * Add a plain <meta> name/content pair to the metadata.
     *
     * @param string $name
     * @param string $content
     */
    public function addMeta($name, $content) {
        $this->metadata->addMeta($name, $content);
    }

    /**
     * Add a dc:creator entry.
     *
     * @param string $name
     * @param string $fileAs
     * @param string $role Use the MarcCode constants
     */
    public function addCreator($name, $fileAs = NULL, $role = NULL) {
        $this->addPersonEntry(DublinCore::CREATOR, $name, $fileAs, $role);
    }

    /**
     * Add a dc:contributor entry.
     *
     * @param string $name
     * @param string $fileAs
     * @param string $role Use the MarcCode constants
     */
    public function addColaborator($name, $fileAs = NULL, $role = NULL) {
        $this->addPersonEntry(DublinCore::CONTRIBUTOR, $name, $fileAs, $role);
    }

    /**
     * Shared implementation of addCreator() and addColaborator().
     *
     * @param string $dcName Dublin Core element name
     * @param string $name
     * @param string $fileAs added as opf "file-as" attribute unless NULL
     * @param string $role   added as opf "role" attribute unless NULL
     */
    private function addPersonEntry($dcName, $name, $fileAs, $role) {
        $entry = new DublinCore($dcName, trim($name));

        if ($fileAs !== NULL) {
            $entry->addOpfAttr("file-as", trim($fileAs));
        }

        if ($role !== NULL) {
            $entry->addOpfAttr("role", trim($role));
        }

        $this->metadata->addDublinCore($entry);
    }
}
228
/**
 * ePub OPF Metadata structures
 *
 * Collects Dublin Core entries and plain <meta> name/content pairs and renders
 * them as the <metadata> section of the OPF document.
 */
class Metadata {
    const _VERSION = 3.00;

    private $dc = array();
    private $meta = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->dc, $this->meta);
    }

    /**
     * Add a Dublin Core entry. Anything that is not exactly a DublinCore instance is ignored.
     *
     * @param DublinCore $dc
     */
    public function addDublinCore($dc) {
        if ($dc != NULL && is_object($dc) && get_class($dc) === "DublinCore") {
            $this->dc[] = $dc;
        }
    }

    /**
     * Add a <meta name="..." content="..." /> entry.
     *
     * Both arguments must be strings; they are stored trimmed. The pair is
     * ignored otherwise. Previously a non-string name was still stored, under
     * an empty key, whenever any content was given.
     *
     * @param string $name
     * @param string $content
     */
    public function addMeta($name, $content) {
        if (!is_string($name) || !is_string($content)) {
            return;
        }
        $this->meta[] = array(trim($name) => trim($content));
    }

    /**
     * Render the <metadata> section.
     *
     * For any version other than EPub::BOOK_VERSION_EPUB2 a dcterms:modified
     * entry is written, using $date or the current time when $date is not set.
     *
     * @param string $bookVersion
     * @param int    $date Unix timestamp for dcterms:modified
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2, $date = NULL) {
        $metadata = "\t<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
        if ($bookVersion === EPub::BOOK_VERSION_EPUB2) {
            $metadata .= "\t\txmlns:opf=\"http://www.idpf.org/2007/opf\"\n\t\txmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
        } else {
            $metadata .= "\t\txmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
            if (!isset($date)) {
                $date = time();
            }
            $metadata .= "\t\t<meta property=\"dcterms:modified\">" . gmdate("Y-m-d\TH:i:s\Z", $date) . "</meta>\n";
        }

        foreach ($this->dc as $dc) {
            $metadata .= $dc->finalize($bookVersion);
        }

        // Each stored entry is a single name => content pair. Iterating with foreach
        // replaces each(), which was removed in PHP 8.0.
        foreach ($this->meta as $data) {
            foreach ($data as $name => $content) {
                $metadata .= "\t\t<meta name=\"" . $name . "\" content=\"" . $content . "\" />\n";
            }
        }

        return $metadata . "\t</metadata>\n";
    }
}
314
/**
 * ePub OPF Dublin Core (dc:) Metadata structures
 *
 * One <dc:NAME> element with its value, plain attributes and opf: attributes.
 */
class DublinCore {
    const _VERSION = 3.00;

    const CONTRIBUTOR = "contributor";
    const COVERAGE = "coverage";
    const CREATOR = "creator";
    const DATE = "date";
    const DESCRIPTION = "description";
    const FORMAT = "format";
    const IDENTIFIER = "identifier";
    const LANGUAGE = "language";
    const PUBLISHER = "publisher";
    const RELATION = "relation";
    const RIGHTS = "rights";
    const SOURCE = "source";
    const SUBJECT = "subject";
    const TITLE = "title";
    const TYPE = "type";

    private $dcName = NULL;
    private $dcValue = NULL;
    private $attr = array();
    private $opfAttr = array();

    /**
     * Class constructor.
     *
     * @param string $name  element name, e.g. one of the constants of this class
     * @param string $value element content
     */
    public function __construct($name, $value) {
        $this->setDc($name, $value);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->dcName, $this->dcValue, $this->attr, $this->opfAttr);
    }

    /**
     * Set the element name and value.
     *
     * The name must be a string and is stored trimmed. The value is cast to
     * string. The name is reset to NULL when no value is held afterwards.
     *
     * @param string $name
     * @param string $value
     */
    public function setDc($name, $value) {
        $this->dcName = is_string($name) ? trim($name) : NULL;
        if (isset($this->dcName)) {
            $this->dcValue = isset($value) ? (string)$value : NULL;
        }
        if (!isset($this->dcValue)) {
            $this->dcName = NULL;
        }
    }

    /**
     * Add a plain attribute to the element.
     *
     * Both arguments must be strings; they are stored trimmed. The pair is
     * ignored otherwise. Previously a non-string name still stored the value
     * under an empty key.
     *
     * @param string $attrName
     * @param string $attrValue
     */
    public function addAttr($attrName, $attrValue) {
        if (!is_string($attrName) || !is_string($attrValue)) {
            return;
        }
        $this->attr[trim($attrName)] = trim($attrValue);
    }

    /**
     * Add an attribute that is written with the "opf:" prefix.
     *
     * Both arguments must be strings; they are stored trimmed. The pair is
     * ignored otherwise.
     *
     * @param string $opfAttrName
     * @param string $opfAttrValue
     */
    public function addOpfAttr($opfAttrName, $opfAttrValue) {
        if (!is_string($opfAttrName) || !is_string($opfAttrValue)) {
            return;
        }
        $this->opfAttr[trim($opfAttrName)] = trim($opfAttrValue);
    }

    /**
     * Render the <dc:NAME> element.
     *
     * opf: attributes are only written for EPub::BOOK_VERSION_EPUB2.
     *
     * The attribute loops use foreach instead of each(): each() was removed in
     * PHP 8.0, and because it never rewound the array pointer, a second call
     * to this method used to lose all attributes.
     *
     * @param string $bookVersion
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $dc = "\t\t<dc:" . $this->dcName;

        foreach ($this->attr as $name => $content) {
            $dc .= " " . $name . "=\"" . $content . "\"";
        }

        if ($bookVersion === EPub::BOOK_VERSION_EPUB2) {
            foreach ($this->opfAttr as $name => $content) {
                $dc .= " opf:" . $name . "=\"" . $content . "\"";
            }
        }

        return $dc . ">" . $this->dcValue . "</dc:" . $this->dcName . ">\n";
    }
}
435
/**
 * ePub OPF Manifest structure
 *
 * An ordered list of Item objects rendered as the <manifest> section.
 */
class Manifest {
    const _VERSION = 3.00;

    private $items = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->items);
    }

    /**
     * Append an item. Anything that is not exactly an Item instance is ignored.
     *
     * @param Item $item
     */
    public function addItem($item) {
        if ($item == NULL || !is_object($item)) {
            return;
        }
        if (get_class($item) !== "Item") {
            return;
        }
        $this->items[] = $item;
    }

    /**
     * Render the <manifest> section, one line per item.
     *
     * @param string $bookVersion passed through to every item
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $rendered = array();
        foreach ($this->items as $entry) {
            $rendered[] = $entry->finalize($bookVersion);
        }
        return "\n\t<manifest>\n" . implode("", $rendered) . "\t</manifest>\n";
    }
}
486
/**
 * ePub OPF Item structure
 *
 * One <item> element of the manifest. Every setter stores a trimmed string,
 * or NULL when the given value is not a string.
 */
class Item {
    const _VERSION = 3.00;

    private $id = NULL;
    private $href = NULL;
    private $mediaType = NULL;
    private $properties = NULL;
    private $requiredNamespace = NULL;
    private $requiredModules = NULL;
    private $fallback = NULL;
    private $fallbackStyle = NULL;

    /**
     * Class constructor.
     *
     * @param string $id
     * @param string $href
     * @param string $mediaType
     * @param string $properties
     */
    public function __construct($id, $href, $mediaType, $properties = NULL) {
        $this->setId($id);
        $this->setHref($href);
        $this->setMediaType($mediaType);
        $this->setProperties($properties);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->id, $this->href, $this->mediaType);
        unset($this->properties, $this->requiredNamespace, $this->requiredModules, $this->fallback, $this->fallbackStyle);
    }

    /**
     * Trim a string value; anything that is not a string becomes NULL.
     *
     * @param mixed $value
     * @return string|null
     */
    private function clean($value) {
        if (!is_string($value)) {
            return NULL;
        }
        return trim($value);
    }

    /**
     * Set the id attribute.
     *
     * @param string $id
     */
    public function setId($id) {
        $this->id = $this->clean($id);
    }

    /**
     * Set the href attribute.
     *
     * @param string $href
     */
    public function setHref($href) {
        $this->href = $this->clean($href);
    }

    /**
     * Set the media-type attribute.
     *
     * @param string $mediaType
     */
    public function setMediaType($mediaType) {
        $this->mediaType = $this->clean($mediaType);
    }

    /**
     * Set the properties attribute (only written for EPub::BOOK_VERSION_EPUB3).
     *
     * @param string $properties
     */
    public function setProperties($properties) {
        $this->properties = $this->clean($properties);
    }

    /**
     * Set the required-namespace attribute.
     *
     * @param string $requiredNamespace
     */
    public function setRequiredNamespace($requiredNamespace) {
        $this->requiredNamespace = $this->clean($requiredNamespace);
    }

    /**
     * Set the required-modules attribute (only written together with required-namespace).
     *
     * @param string $requiredModules
     */
    public function setRequiredModules($requiredModules) {
        $this->requiredModules = $this->clean($requiredModules);
    }

    /**
     * Set the fallback attribute.
     *
     * @param string $fallback
     */
    public function setfallback($fallback) {
        $this->fallback = $this->clean($fallback);
    }

    /**
     * Set the fallback-style attribute.
     *
     * @param string $fallbackStyle
     */
    public function setFallbackStyle($fallbackStyle) {
        $this->fallbackStyle = $this->clean($fallbackStyle);
    }

    /**
     * Render the <item /> element.
     *
     * @param string $bookVersion
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $pieces = array(
            "\t\t<item id=\"" . $this->id . "\" href=\"" . $this->href . "\" media-type=\"" . $this->mediaType . "\" ",
        );

        if ($bookVersion === EPub::BOOK_VERSION_EPUB3 && isset($this->properties)) {
            $pieces[] = "properties=\"" . $this->properties . "\" ";
        }

        if (isset($this->requiredNamespace)) {
            $pieces[] = "\n\t\t\trequired-namespace=\"" . $this->requiredNamespace . "\" ";
            if (isset($this->requiredModules)) {
                $pieces[] = "required-modules=\"" . $this->requiredModules . "\" ";
            }
        }

        if (isset($this->fallback)) {
            $pieces[] = "\n\t\t\tfallback=\"" . $this->fallback . "\" ";
        }

        if (isset($this->fallbackStyle)) {
            $pieces[] = "\n\t\t\tfallback-style=\"" . $this->fallbackStyle . "\" ";
        }

        $pieces[] = "/>\n";
        return implode("", $pieces);
    }
}
629
/**
 * ePub OPF Spine structure
 *
 * Holds Itemref objects keyed by their idref and renders the <spine> section.
 */
class Spine {
    const _VERSION = 1.00;

    private $itemrefs = array();
    private $toc = NULL;

    /**
     * Class constructor.
     *
     * @param string $toc value of the spine's toc attribute
     */
    public function __construct($toc = "ncx") {
        $this->setToc($toc);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->itemrefs, $this->toc);
    }

    /**
     * Set the toc attribute. Stored trimmed; a non-string value clears it to NULL.
     *
     * @param string $toc
     */
    public function setToc($toc) {
        if (is_string($toc)) {
            $this->toc = trim($toc);
        } else {
            $this->toc = NULL;
        }
    }

    /**
     * Add an item reference.
     *
     * Ignored unless the argument is exactly an Itemref instance whose idref
     * has not been added before.
     *
     * @param Itemref $itemref
     */
    public function addItemref($itemref) {
        if ($itemref == NULL || !is_object($itemref)) {
            return;
        }
        if (get_class($itemref) !== "Itemref") {
            return;
        }
        if (isset($this->itemrefs[$itemref->getIdref()])) {
            return;
        }
        $this->itemrefs[$itemref->getIdref()] = $itemref;
    }

    /**
     * Render the <spine> section, one line per item reference.
     *
     * @return string
     */
    public function finalize() {
        $lines = array();
        foreach ($this->itemrefs as $entry) {
            $lines[] = $entry->finalize();
        }
        return "\n\t<spine toc=\"" . $this->toc . "\">\n" . implode("", $lines) . "\t</spine>\n";
    }
}
696
/**
 * ePub OPF ItemRef structure
 *
 * One <itemref> element of the spine.
 */
class Itemref {
    const _VERSION = 3.00;

    private $idref = NULL;
    private $linear = TRUE;

    /**
     * Class constructor.
     *
     * @param string $idref
     * @param bool   $linear
     */
    public function __construct($idref, $linear = TRUE) {
        $this->setIdref($idref);
        $this->setLinear($linear);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->idref, $this->linear);
    }

    /**
     * Set the idref. Stored trimmed; a non-string value clears it to NULL.
     *
     * @param string $idref
     */
    public function setIdref($idref) {
        if (is_string($idref)) {
            $this->idref = trim($idref);
        } else {
            $this->idref = NULL;
        }
    }

    /**
     * Get the idref.
     *
     * @return string $idref
     */
    public function getIdref() {
        return $this->idref;
    }

    /**
     * Set the linear flag. Only the boolean TRUE counts as linear.
     *
     * @param bool $linear
     */
    public function setLinear($linear = TRUE) {
        $this->linear = ($linear === TRUE);
    }

    /**
     * Render the <itemref /> element; linear="no" is added for non-linear entries.
     *
     * @return string
     */
    public function finalize() {
        $tail = " />\n";
        if ($this->linear == FALSE) {
            $tail = " linear=\"no\" />\n";
        }
        return "\t\t<itemref idref=\"" . $this->idref . "\"" . $tail;
    }
}
769
/**
 * ePub OPF Guide structure
 *
 * An ordered list of Reference objects rendered as the <guide> section.
 */
class Guide {
    const _VERSION = 3.00;

    private $references = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->references);
    }

    /**
     * Number of references held.
     *
     * @return int
     */
    public function length() {
        return count($this->references);
    }

    /**
     * Append a reference. Anything that is not exactly a Reference instance is ignored.
     *
     * @param Reference $reference
     */
    public function addReference($reference) {
        if ($reference == NULL || !is_object($reference)) {
            return;
        }
        if (get_class($reference) !== "Reference") {
            return;
        }
        $this->references[] = $reference;
    }

    /**
     * Render the <guide> section, or an empty string when there are no references.
     *
     * @return string
     */
    public function finalize() {
        if (count($this->references) <= 0) {
            return "";
        }

        $xml = "\n\t<guide>\n";
        foreach ($this->references as $entry) {
            $xml .= $entry->finalize();
        }
        return $xml . "\t</guide>\n";
    }
}
834
/**
 * Reference constants
 *
 * Also models a single <reference> element of the guide.
 */
class Reference {
    const _VERSION = 1.00;

    /* REFERENCE types are derived from the "Chicago Manual of Style"
     */

    /** Acknowledgements page */
    const ACKNOWLEDGEMENTS = "acknowledgements";

    /** Bibliography page */
    const BIBLIOGRAPHY = "bibliography";

    /** Colophon page */
    const COLOPHON = "colophon";

    /** Copyright page */
    const COPYRIGHT_PAGE = "copyright-page";

    /** Dedication */
    const DEDICATION = "dedication";

    /** Epigraph */
    const EPIGRAPH = "epigraph";

    /** Foreword */
    const FOREWORD = "foreword";

    /** Glossary page */
    const GLOSSARY = "glossary";

    /** back-of-book style index */
    const INDEX = "index";

    /** List of illustrations */
    const LIST_OF_ILLUSTRATIONS = "loi";

    /** List of tables */
    const LIST_OF_TABLES = "lot";

    /** Notes page */
    const NOTES = "notes";

    /** Preface page */
    const PREFACE = "preface";

    /** Table of contents */
    const TABLE_OF_CONTENTS = "toc";

    /** Page with possibly title, author, publisher, and other metadata */
    const TITLE_PAGE = "titlepage";

    /** First page of the book, ie. first page of the first chapter */
    const TEXT = "text";

    // ******************
    // ePub3 constants
    // ******************

    // Document partitions
    /** The publications cover(s), jacket information, etc. This is officially in ePub3, but works for ePub 2 as well */
    const COVER = "cover";

    /** Preliminary material to the content body, such as tables of contents, dedications, etc. */
    const FRONTMATTER = "frontmatter";

    /** The main (body) content of a document. */
    const BODYMATTER = "bodymatter";

    /** Ancillary material occurring after the document body, such as indices, appendices, etc. */
    const BACKMATTER = "backmatter";


    private $type = NULL;
    private $title = NULL;
    private $href = NULL;

    /**
     * Class constructor.
     *
     * @param string $type
     * @param string $title
     * @param string $href
     */
    public function __construct($type, $title, $href) {
        $this->setType($type);
        $this->setTitle($title);
        $this->setHref($href);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->type, $this->title, $this->href);
    }

    /**
     * Trim a string value; anything that is not a string becomes NULL.
     *
     * @param mixed $value
     * @return string|null
     */
    private function clean($value) {
        if (!is_string($value)) {
            return NULL;
        }
        return trim($value);
    }

    /**
     * Set the reference type, e.g. one of the constants of this class.
     *
     * @param string $type
     */
    public function setType($type) {
        $this->type = $this->clean($type);
    }

    /**
     * Set the title attribute.
     *
     * @param string $title
     */
    public function setTitle($title) {
        $this->title = $this->clean($title);
    }

    /**
     * Set the href attribute.
     *
     * @param string $href
     */
    public function setHref($href) {
        $this->href = $this->clean($href);
    }

    /**
     * Render the <reference /> element.
     *
     * @return string
     */
    public function finalize() {
        $attributes = "type=\"" . $this->type . "\""
            . " title=\"" . $this->title . "\""
            . " href=\"" . $this->href . "\"";
        return "\t\t<reference " . $attributes . " />\n";
    }
}
976
977/**
978 * Common Marc codes.
979 * Ref: http://www.loc.gov/marc/relators/
980 */
981class MarcCode {
982 const _VERSION = 3.00;
983
984 /**
985 * Adapter
986 *
987 * Use for a person who
988 * 1) reworks a musical composition, usually for a different medium, or
989 * 2) rewrites novels or stories for motion pictures or other audiovisual medium.
990 */
991 const ADAPTER = "adp";
992
993 /**
994 * Annotator
995 *
996 * Use for a person who writes manuscript annotations on a printed item.
997 */
998 const ANNOTATOR = "ann";
999
1000 /**
1001 * Arranger
1002 *
1003 * Use for a person who transcribes a musical composition, usually for a different
1004 * medium from that of the original; in an arrangement the musical substance remains
1005 * essentially unchanged.
1006 */
1007 const ARRANGER = "arr";
1008
/**
 * Artist
 *
 * Use for a person (e.g., a painter) who conceives, and perhaps also implements,
 * an original graphic design or work of art, if specific codes (e.g., [egr],
 * [etr]) are not desired. For book illustrators, prefer Illustrator [ill].
 */
const ARTIST = "art";

/**
 * Associated name
 *
 * Use as a general relator for a name associated with or found in an item or
 * collection, or which cannot be determined to be that of a Former owner [fmo]
 * or other designated relator indicative of provenance.
 */
const ASSOCIATED_NAME = "asn";

/**
 * Author
 *
 * Use for a person or corporate body chiefly responsible for the intellectual
 * or artistic content of a work. This term may also be used when more than one
 * person or body bears such responsibility.
 */
const AUTHOR = "aut";

/**
 * Author in quotations or text extracts
 *
 * Use for a person whose work is largely quoted or extracted in a work to which
 * he or she did not contribute directly. Such quotations are found particularly
 * in exhibition catalogs, collections of photographs, etc.
 */
const AUTHOR_IN_QUOTES = "aqt";

/**
 * Author of afterword, colophon, etc.
 *
 * Use for a person or corporate body responsible for an afterword, postface,
 * colophon, etc. but who is not the chief author of a work.
 */
const AUTHOR_OF_AFTERWORD = "aft";

/**
 * Author of introduction, etc.
 *
 * Use for a person or corporate body responsible for an introduction, preface,
 * foreword, or other critical matter, but who is not the chief author.
 */
const AUTHOR_OF_INTRO = "aui";

/**
 * Bibliographic antecedent
 *
 * Use for the author responsible for a work upon which the work represented by
 * the catalog record is based. This can be appropriate for adaptations, sequels,
 * continuations, indexes, etc.
 */
const BIB_ANTECEDENT = "ant";

/**
 * Book producer
 *
 * Use for the person or firm responsible for the production of books and other
 * print media, if specific codes (e.g., [bkd], [egr], [tyd], [prt]) are not desired.
 */
const BOOK_PRODUCER = "bkp";

/**
 * Collaborator
 *
 * Use for a person or corporate body that takes a limited part in the elaboration
 * of a work of another author or that brings complements (e.g., appendices, notes)
 * to the work of another author.
 *
 * NOTE(review): the constant name is misspelled ("COLABORATOR"). It is a public
 * name, so renaming it would break any code that references it.
 */
const COLABORATOR = "clb";

/**
 * Commentator
 *
 * Use for a person who provides interpretation, analysis, or a discussion of the
 * subject matter on a recording, motion picture, or other audiovisual medium.
 *
 * NOTE(review): the sentence below looks like the description of a different
 * relator, Compiler [com], that was merged into this comment by mistake:
 * "Compiler [com] Use for a person who produces a work or publication by selecting
 * and putting together material from the works of various persons or bodies."
 */
const COMMENTATOR = "cmm";

/**
 * Designer
 *
 * Use for a person or organization responsible for design if specific codes (e.g.,
 * [bkd], [tyd]) are not desired.
 */
const DESIGNER = "dsr";

/**
 * Editor
 *
 * Use for a person who prepares for publication a work not primarily his/her own,
 * such as by elucidating text, adding introductory or other critical matter, or
 * technically directing an editorial staff.
 *
 * NOTE(review): the constant name carries a trailing "T" ("EDITORT"), which looks
 * like a typo for EDITOR. It is a public name, so renaming it would break any
 * code that references it.
 */
const EDITORT = "edt";

/**
 * Illustrator
 *
 * Use for the person who conceives, and perhaps also implements, a design or
 * illustration, usually to accompany a written text.
 */
const ILLUSTRATOR = "ill";

/**
 * Lyricist
 *
 * Use for the writer of the text of a song.
 */
const LYRICIST = "lyr";

/**
 * Metadata contact
 *
 * Use for the person or organization primarily responsible for compiling and
 * maintaining the original description of a metadata set (e.g., geospatial
 * metadata set).
 */
const METADATA_CONTACT = "mdc";

/**
 * Musician
 *
 * Use for the person who performs music or contributes to the musical content
 * of a work when it is not possible or desirable to identify the function more
 * precisely.
 */
const MUSICIAN = "mus";

/**
 * Narrator
 *
 * Use for the speaker who relates the particulars of an act, occurrence, or
 * course of events.
 */
const NARRATOR = "nrt";

/**
 * Other
 *
 * Use for relator codes from other lists which have no equivalent in the MARC
 * list or for terms which have not been assigned a code.
 */
const OTHER = "oth";

/**
 * Photographer
 *
 * Use for the person or organization responsible for taking photographs, whether
 * they are used in their original form or as reproductions.
 */
const PHOTOGRAPHER = "pht";

/**
 * Printer
 *
 * Use for the person or organization who prints texts, whether from type or plates.
 */
const PRINTER = "prt";

/**
 * Redactor
 *
 * Use for a person who writes or develops the framework for an item without
 * being intellectually responsible for its content.
 */
const REDACTOR = "red";

/**
 * Reviewer
 *
 * Use for a person or corporate body responsible for the review of a book, motion
 * picture, performance, etc.
 */
const REVIEWER = "rev";

/**
 * Sponsor
 *
 * Use for the person or agency that issued a contract, or under whose auspices
 * a work has been written, printed, published, etc.
 */
const SPONSOR = "spn";

/**
 * Thesis advisor
 *
 * Use for the person under whose supervision a degree candidate develops and
 * presents a thesis, memoir, or text of a dissertation.
 */
const THESIS_ADVISOR = "ths";

/**
 * Transcriber
 *
 * Use for a person who prepares a handwritten or typewritten copy from original
 * material, including from dictated or orally recorded material.
 */
const TRANSCRIBER = "trc";

/**
 * Translator
 *
 * Use for a person who renders a text from one language into another, or from
 * an older form of a language into the modern form.
 */
const TRANSLATOR = "trl";
1225}
1226?>
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php
new file mode 100644
index 00000000..d9b990b7
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/EPub.php
@@ -0,0 +1,2438 @@
1<?php
2/**
3 * Create an ePub compatible book file.
4 *
5 * Please note, once finalized a book can no longer have chapters of data added or changed.
6 *
7 * License: GNU LGPL, Attribution required for commercial implementations, requested for everything else.
8 *
9 * Thanks to: Adam Schmalhofer and Kirstyn Fox for invaluable input and for "nudging" me in the right direction :)
10 *
11 * @author A. Grandt <php@grandt.com>
12 * @copyright 2009-2014 A. Grandt
13 * @license GNU LGPL 2.1
14 * @version 3.20
15 * @link http://www.phpclasses.org/package/6115
16 * @link https://github.com/Grandt/PHPePub
17 * @uses Zip.php version 1.60; http://www.phpclasses.org/browse/package/6110.html or https://github.com/Grandt/PHPZip
18 */
19class EPub {
/** Version of this EPub class; written to the log by the constructor. */
const VERSION = 3.20;
/** Minimum Zip.php version; the constructor aborts when Zip::VERSION is lower. */
const REQ_ZIP_VERSION = 1.60;

// Labels for the kind of identifier a book carries.
const IDENTIFIER_UUID = 'UUID';
const IDENTIFIER_URI = 'URI';
const IDENTIFIER_ISBN = 'ISBN';

/** Ignore all external references, and do not process the file for these */
const EXTERNAL_REF_IGNORE = 0;
/** Process the file for external references and add them to the book */
const EXTERNAL_REF_ADD = 1;
/** Process the file for external references and add them to the book, but remove images, and img tags */
const EXTERNAL_REF_REMOVE_IMAGES = 2;
/** Process the file for external references and add them to the book, but replace images, and img tags with [image] */
const EXTERNAL_REF_REPLACE_IMAGES = 3;

// Writing directions accepted by the constructor.
const DIRECTION_LEFT_TO_RIGHT = "ltr";
const DIRECTION_RIGHT_TO_LEFT = "rtl";

// Book format versions accepted by the constructor.
const BOOK_VERSION_EPUB2 = "2.0";
const BOOK_VERSION_EPUB3 = "3.0";

// Book format version; set by the constructor and passed to the Ncx and Opf objects in initialize().
private $bookVersion = EPub::BOOK_VERSION_EPUB2;

// Debug flag; the constructor takes a parameter of the same name.
private $debugInside = FALSE;

// NOTE(review): presumably the upper bounds applied when images are processed; not read in this part of the file.
public $maxImageWidth = 768;
public $maxImageHeight = 1024;

// addChapter() splits a string chapter longer than this (measured with mb_strlen) when $autoSplit is TRUE.
public $splitDefaultSize = 250000;
/** Gifs can crash some early ADE based readers, and are disabled by default.
 * getImage will convert these if it can, unless this is set to TRUE.
 */
public $isGifImagesEnabled = FALSE;
public $isReferencesAddedToToc = TRUE;

// Zip archive the book is written into; created in initialize().
private $zip;

// Book metadata fields.
private $title = "";
private $language = "en";
private $identifier = "";
private $identifierType = "";
private $description = "";
private $author = "";
private $authorSortKey = "";
private $publisherName = "";
private $publisherURL = "";
private $date = 0;
private $rights = "";
private $coverage = "";
private $relation = "";
private $sourceURL = "";

// Incremented by addChapter(); used to build the "chapterN" ids.
private $chapterCount = 0;
// Opf and Ncx objects; created in initialize().
private $opf = NULL;
private $ncx = NULL;
// Once TRUE, the add* methods refuse further changes.
private $isFinalized = FALSE;
// Set to TRUE by setCoverImage() so the cover can only be added once.
private $isCoverImageSet = FALSE;
// Table-of-contents settings.
private $buildTOC = FALSE;
private $tocTitle = NULL;
private $tocFileName = NULL;
private $tocCSSClass = NULL;
private $tocAddReferences = FALSE;
private $tocCssFileName = NULL;

// Names of the files already added to the book, keyed by the normalized file name (see addFile()).
private $fileList = array();
// Both are set by the constructor and handed to the Ncx in initialize().
private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT;
private $languageCode = "en";

/**
 * Used for building the TOC.
 * If this list is overwritten it MUST contain at least "text" as an element.
 */
public $referencesOrder = NULL;

private $dateformat = 'Y-m-d\TH:i:s.000000P'; // ISO 8601 long
private $dateformatShort = 'Y-m-d'; // short date format to placate ePubChecker.
private $headerDateFormat = "D, d M Y H:i:s T";

// Capability flags; all five are filled in by initialize().
protected $isCurlInstalled;
protected $isGdInstalled;
protected $isExifInstalled;
protected $isFileGetContentsInstalled;
protected $isFileGetContentsExtInstalled;

// Path prefix inside the archive under which all book files are stored.
private $bookRoot = "OEBPS/";
// Server document root with a trailing "/"; set in initialize().
private $docRoot = NULL;
private $EPubMark = TRUE;
private $generator = "";

// Logger instance created by the constructor.
private $log = NULL;
// Passed to the Logger and checked by the constructor before it writes the version lines.
public $isLogging = TRUE;

// When TRUE, addChapter() and addReferencePage() run their content through encodeHtml() first.
public $encodeHTML = FALSE;
114
/**
 * Lookup table from lower-case file extension to MIME type, grouped by
 * application, audio, image, text and video types.
 *
 * Corrected entries: "tif" now maps to the registered type "image/tiff"
 * (was "image/tif") and "tsv" to "text/tab-separated-values" (the previous
 * value was misspelled "seperated").
 */
private $mimetypes = array(
    "js" => "application/x-javascript", "swf" => "application/x-shockwave-flash", "xht" => "application/xhtml+xml", "xhtml" => "application/xhtml+xml", "zip" => "application/zip",
    "aif" => "audio/x-aiff", "aifc" => "audio/x-aiff", "aiff" => "audio/x-aiff", "au" => "audio/basic", "kar" => "audio/midi", "m3u" => "audio/x-mpegurl", "mid" => "audio/midi", "midi" => "audio/midi", "mp2" => "audio/mpeg", "mp3" => "audio/mpeg", "mpga" => "audio/mpeg", "oga" => "audio/ogg", "ogg" => "audio/ogg", "ra" => "audio/x-realaudio", "ram" => "audio/x-pn-realaudio", "rm" => "audio/x-pn-realaudio", "rpm" => "audio/x-pn-realaudio-plugin", "snd" => "audio/basic", "wav" => "audio/x-wav",
    "bmp" => "image/bmp", "djv" => "image/vnd.djvu", "djvu" => "image/vnd.djvu", "gif" => "image/gif", "ief" => "image/ief", "jpe" => "image/jpeg", "jpeg" => "image/jpeg", "jpg" => "image/jpeg", "pbm" => "image/x-portable-bitmap", "pgm" => "image/x-portable-graymap", "png" => "image/png", "pnm" => "image/x-portable-anymap", "ppm" => "image/x-portable-pixmap", "ras" => "image/x-cmu-raster", "rgb" => "image/x-rgb", "tif" => "image/tiff", "tiff" => "image/tiff", "wbmp" => "image/vnd.wap.wbmp", "xbm" => "image/x-xbitmap", "xpm" => "image/x-xpixmap", "xwd" => "image/x-windowdump",
    "asc" => "text/plain", "css" => "text/css", "etx" => "text/x-setext", "htm" => "text/html", "html" => "text/html", "rtf" => "text/rtf", "rtx" => "text/richtext", "sgm" => "text/sgml", "sgml" => "text/sgml", "tsv" => "text/tab-separated-values", "txt" => "text/plain", "wml" => "text/vnd.wap.wml", "wmls" => "text/vnd.wap.wmlscript", "xml" => "text/xml", "xsl" => "text/xml",
    "avi" => "video/x-msvideo", "mov" => "video/quicktime", "movie" => "video/x-sgi-movie", "mp4" => "video/mp4", "mpe" => "video/mpeg", "mpeg" => "video/mpeg", "mpg" => "video/mpeg", "mxu" => "video/vnd.mpegurl", "ogv" => "video/ogg", "qt" => "video/quicktime", "webm" => "video/webm");
121
// These are the ONLY allowed types in that these are the ones ANY reader must support, any other MUST have the fallback attribute pointing to one of these.
private $coreMediaTypes = array("image/gif", "image/jpeg", "image/png", "image/svg+xml", "application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/css", "text/x-oeb1-css", "text/x-oeb1-document");

// The subset of media types that are document (content) types rather than images or style sheets.
private $opsContentTypes = array("application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/x-oeb1-document");

// NOTE(review): presumably the characters stripped or replaced when file names are sanitized; not read in this part of the file.
private $forbiddenCharacters = array("?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "%");

// Header and footer that wrapChapter() puts around chapter content.
// This is the EPUB 2 (XHTML 1.1) header; initialize() replaces it for other book versions.
private $htmlContentHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n<title></title>\n</head>\n<body>\n";
private $htmlContentFooter = "</body>\n</html>\n";
131
/**
 * Class constructor.
 *
 * Loads Zip.php and Logger.php, verifies that the Zip class is at least
 * version REQ_ZIP_VERSION (the script dies with an HTML message otherwise),
 * loads the remaining PHPePub helper files and finally calls initialize().
 *
 * @param string $bookVersion      EPub::BOOK_VERSION_EPUB2 (default) or EPub::BOOK_VERSION_EPUB3.
 * @param bool   $debugInside      When FALSE (default), error_reporting is lowered to E_ERROR | E_PARSE for the rest of the script.
 * @param string $languageCode     Language code handed to the Ncx created in initialize(). Default is "en".
 * @param string $writingDirection EPub::DIRECTION_LEFT_TO_RIGHT (default) or EPub::DIRECTION_RIGHT_TO_LEFT.
 */
function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $debugInside = FALSE, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) {
    include_once("Zip.php");
    include_once("Logger.php");

    if (!$debugInside) {
        error_reporting(E_ERROR | E_PARSE);
    }

    // Check the Zip dependency BEFORE anything reads Zip::VERSION. Previously the
    // logging below touched the constant first, so a missing or too-old Zip class
    // produced a fatal error instead of this explanatory message.
    if (!defined("Zip::VERSION") || Zip::VERSION < self::REQ_ZIP_VERSION) {
        die("<p>EPub version " . self::VERSION . " requires Zip.php at version " . self::REQ_ZIP_VERSION . " or higher.<br />You can obtain the latest version from <a href=\"http://www.phpclasses.org/browse/package/6110.html\">http://www.phpclasses.org/browse/package/6110.html</a>.</p>");
    }

    $this->bookVersion = $bookVersion;
    // Keep the flag on the instance; it was accepted but never stored before.
    $this->debugInside = $debugInside;
    $this->writingDirection = $writingDirection;
    $this->languageCode = $languageCode;

    $this->log = new Logger("EPub", $this->isLogging);

    /* Prepare Logging. Just in case it's used. later */
    if ($this->isLogging) {
        $this->log->logLine("EPub class version....: " . self::VERSION);
        $this->log->logLine("EPub req. Zip version.: " . self::REQ_ZIP_VERSION);
        $this->log->logLine("Zip version...........: " . Zip::VERSION);
        $this->log->dumpInstalledModules();
    }

    include_once("EPubChapterSplitter.php");
    include_once("EPub.HtmlEntities.php");
    include_once("EPub.NCX.php");
    include_once("EPub.OPF.php");

    $this->initialize();
}
170
/**
 * Class destructor.
 *
 * Releases every property of the instance. The properties are enumerated at
 * run time instead of through a hand-maintained list, so the destructor can
 * no longer fall out of sync with the class definition (the previous list
 * had already missed $debugInside).
 *
 * @return void
 */
function __destruct() {
    // get_object_vars() called from inside the class also returns the private
    // and protected properties of this instance.
    foreach (array_keys(get_object_vars($this)) as $property) {
        unset($this->$property);
    }
}
194
/**
 * Initialize defaults.
 *
 * Sets the default reference order, detects the available PHP capabilities,
 * creates the Zip archive with the "mimetype" entry and
 * META-INF/container.xml, switches the chapter header when the book is not
 * EPUB 2, and creates the Ncx and Opf objects.
 *
 * The container XML is held in a local variable; earlier it was parked in an
 * undeclared $this->content property, which creates a dynamic property on
 * the object.
 */
private function initialize() {
    $this->referencesOrder = array(
        Reference::COVER => "Cover Page",
        Reference::TITLE_PAGE => "Title Page",
        Reference::ACKNOWLEDGEMENTS => "Acknowledgements",
        Reference::BIBLIOGRAPHY => "Bibliography",
        Reference::COLOPHON => "Colophon",
        Reference::COPYRIGHT_PAGE => "Copyright",
        Reference::DEDICATION => "Dedication",
        Reference::EPIGRAPH => "Epigraph",
        Reference::FOREWORD => "Foreword",
        Reference::TABLE_OF_CONTENTS => "Table of Contents",
        Reference::NOTES => "Notes",
        Reference::PREFACE => "Preface",
        Reference::TEXT => "First Page",
        Reference::LIST_OF_ILLUSTRATIONS => "List of Illustrations",
        Reference::LIST_OF_TABLES => "List of Tables",
        Reference::GLOSSARY => "Glossary",
        Reference::INDEX => "Index");

    $this->docRoot = filter_input(INPUT_SERVER, "DOCUMENT_ROOT") . "/";

    $this->isCurlInstalled = extension_loaded('curl') && function_exists('curl_version');
    $this->isGdInstalled = extension_loaded('gd') && function_exists('gd_info');
    $this->isExifInstalled = extension_loaded('exif') && function_exists('exif_imagetype');
    $this->isFileGetContentsInstalled = function_exists('file_get_contents');
    $this->isFileGetContentsExtInstalled = $this->isFileGetContentsInstalled && ini_get('allow_url_fopen');

    // The "mimetype" entry is added first, with the extra field switched off
    // only for that one entry.
    $this->zip = new Zip();
    $this->zip->setExtraField(FALSE);
    $this->zip->addFile("application/epub+zip", "mimetype");
    $this->zip->setExtraField(TRUE);
    $this->zip->addDirectory("META-INF");

    $containerXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\n\t<rootfiles>\n\t\t<rootfile full-path=\"" . $this->bookRoot . "book.opf\" media-type=\"application/oebps-package+xml\" />\n\t</rootfiles>\n</container>\n";
    $this->zip->addFile($containerXml, "META-INF/container.xml", 0, NULL, FALSE);

    if (!$this->isEPubVersion2()) {
        // Books that are not EPUB 2 get a header without the XHTML 1.1 doctype.
        $this->htmlContentHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            . "<head>"
            . "<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n"
            . "<title></title>\n"
            . "</head>\n"
            . "<body>\n";
    }

    $this->ncx = new Ncx(NULL, NULL, NULL, $this->languageCode, $this->writingDirection);
    $this->opf = new Opf();
    $this->ncx->setVersion($this->bookVersion);
    $this->opf->setVersion($this->bookVersion);
    $this->opf->addItem("ncx", "book.ncx", Ncx::MIMETYPE);
    $this->chapterCount = 0;
}
253
/**
 * Add dynamically generated data as a file to the book.
 *
 * The file is stored below the book root in the archive, recorded in the
 * file list and registered in the OPF manifest. Data whose mime type starts
 * with "image/" is stored uncompressed.
 *
 * @param string $fileName Filename to use for the file, must be unique for the book.
 * @param string $fileId   Unique identifier for the file.
 * @param string $fileData File data
 * @param string $mimetype file mime type
 * @return bool $success FALSE if the book is finalized or the file name is already in use.
 */
function addFile($fileName, $fileId, $fileData, $mimetype) {
    if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    $fileName = $this->normalizeFileName($fileName);

    // The file list is keyed by the NORMALIZED name, so the uniqueness check has
    // to be repeated after normalizing; otherwise two spellings of the same path
    // would both be added.
    if (array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    $compress = (strpos($mimetype, "image/") !== 0);

    $this->zip->addFile($fileData, $this->bookRoot . $fileName, 0, NULL, $compress);
    $this->fileList[$fileName] = $fileName;
    $this->opf->addItem($fileId, $fileName, $mimetype);
    return TRUE;
}
277
/**
 * Add a large file directly from the filesystem to the book.
 *
 * The file is handed to the archive by path instead of being passed as data.
 * It is only recorded in the file list and the OPF manifest when the archive
 * reports success.
 *
 * @param string $fileName Filename to use for the file, must be unique for the book.
 * @param string $fileId   Unique identifier for the file.
 * @param string $filePath File path
 * @param string $mimetype file mime type
 * @return bool $success FALSE if the book is finalized, the file name is already in use, or the archive rejected the file.
 */
function addLargeFile($fileName, $fileId, $filePath, $mimetype) {
    if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }
    $fileName = $this->normalizeFileName($fileName);

    // The file list is keyed by the NORMALIZED name, so check uniqueness again
    // now that the name has been normalized.
    if (array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    if (!$this->zip->addLargeFile($filePath, $this->bookRoot . $fileName)) {
        return FALSE;
    }

    $this->fileList[$fileName] = $fileName;
    $this->opf->addItem($fileId, $fileName, $mimetype);
    return TRUE;
}
300
/**
 * Add a CSS file to the book.
 *
 * The file is registered under the id "css_" . $fileId with mime type "text/css".
 *
 * @param string $fileName           Filename to use for the CSS file, must be unique for the book.
 * @param string $fileId             Unique identifier for the file.
 * @param string $fileData           CSS data
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for <code>processCSSExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir            Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 *
 * @return bool $success FALSE if the book is finalized, the file name is already in use, or the file could not be added.
 */
function addCSSFile($fileName, $fileId, $fileData, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);

    if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
        // Turn the directory part of the CSS file name into the same number of
        // "../" segments, i.e. the relative path from the CSS file back to the
        // book root.
        $cssDir = pathinfo($fileName);
        $cssDir = preg_replace('#^[/\.]+#i', "", $cssDir["dirname"] . "/");
        if (!empty($cssDir)) {
            $cssDir = preg_replace('#[^/]+/#i', "../", $cssDir);
        }

        // NOTE(review): $fileData is presumably taken by reference and rewritten
        // in place, since no return value is used here.
        $this->processCSSExternalReferences($fileData, $externalReferences, $baseDir, $cssDir);
    }

    // Report what addFile() actually did instead of an unconditional TRUE.
    return $this->addFile($fileName, "css_" . $fileId, $fileData, "text/css");
}
333
/**
 * Add a chapter to the book, as a chapter should not exceed 250kB, you can pass an array with multiple parts as $chapterData.
 * These will still only show up as a single chapter in the book TOC.
 *
 * Handled cases:
 *  - non-empty string: stored as one file with the id "chapter<N>";
 *  - array: every element is stored as "<name>_<part>.<extension>" and the
 *    navigation point refers to the first part;
 *  - no data and a file name containing "#": only a navigation point is added;
 *  - no data and the file name "TOC.xhtml": the "toc" item is added to the
 *    spine together with a navigation point.
 *
 * @param string $chapterName        Name of the chapter, will be used in the TOC
 * @param string $fileName           Filename to use for the chapter, must be unique for the book.
 * @param mixed  $chapterData        Chapter text in XHTML, or an array of valid XHTML parts for the chapter. A file should NOT exceed 250kB.
 * @param bool   $autoSplit          Should the chapter be split if it exceeds the default split size? Default=FALSE, only used if $chapterData is a string.
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for <code>processChapterExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir            Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 * @return mixed $success FALSE if the addition failed (book finalized, or none of the cases above applied), else the new NavPoint.
 */
function addChapter($chapterName, $fileName, $chapterData = NULL, $autoSplit = FALSE, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);
    $fileName = $this->sanitizeFileName($fileName);

    // Returned unchanged when none of the branches below applies; previously the
    // variable was undefined in that case.
    $navPoint = FALSE;

    $chapter = $chapterData;
    if ($autoSplit && is_string($chapterData) && mb_strlen($chapterData) > $this->splitDefaultSize) {
        $splitter = new EPubChapterSplitter();

        $chapterArray = $splitter->splitChapter($chapterData);
        // Only switch to the multi-part path when the splitter produced more than one part.
        if (count($chapterArray) > 1) {
            $chapter = $chapterArray;
        }
    }

    if (!empty($chapter) && is_string($chapter)) {
        if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
            $htmlDirInfo = pathinfo($fileName);
            $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/");
            $this->processChapterExternalReferences($chapter, $externalReferences, $baseDir, $htmlDir);
        }

        if ($this->encodeHTML === TRUE) {
            $chapter = $this->encodeHtml($chapter);
        }

        $this->chapterCount++;
        $this->addFile($fileName, "chapter" . $this->chapterCount, $chapter, "application/xhtml+xml");
        $this->opf->addItemRef("chapter" . $this->chapterCount);

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (is_array($chapter)) {
        $fileNameParts = pathinfo($fileName);
        // pathinfo() omits the "extension" key for names without a dot.
        $extension = isset($fileNameParts['extension']) ? $fileNameParts['extension'] : "";
        $name = $fileNameParts['filename'];

        $partCount = 0;
        $this->chapterCount++;

        // foreach replaces the former each()/list() loop; each() no longer
        // exists in PHP 8.
        foreach ($chapter as $v) {
            if ($this->encodeHTML === TRUE) {
                $v = $this->encodeHtml($v);
            }

            if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
                $this->processChapterExternalReferences($v, $externalReferences, $baseDir);
            }
            $partCount++;
            $partName = $name . "_" . $partCount;
            $this->addFile($partName . "." . $extension, $partName, $v, "application/xhtml+xml");
            $this->opf->addItemRef($partName);
        }

        // The navigation entry always points at the first part.
        $partName = $name . "_1." . $extension;
        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $partName, $partName);
        $this->ncx->addNavPoint($navPoint);

        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (!isset($chapterData) && strpos($fileName, "#") > 0) {
        // Link into an existing file (file name carries a fragment): nothing is
        // added to the archive or the spine, only a navigation point.
        $this->chapterCount++;

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (!isset($chapterData) && $fileName == "TOC.xhtml") {
        $this->chapterCount++;
        $this->opf->addItemRef("toc");

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    }
    return $navPoint;
}
429
/**
 * Open a new chapter level.
 *
 * Chapters added afterwards are placed on this level. The title is passed
 * through decodeHtmlEntities() before it is handed to the Ncx.
 *
 * @param string $navTitle
 * @param string $navId
 * @param string $navClass
 * @param bool   $isNavHidden      Default is FALSE.
 * @param string $writingDirection
 * @return NavPoint The new NavPoint for that level.
 */
function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
    $decodedTitle = $this->decodeHtmlEntities($navTitle);
    $levelNavPoint = $this->ncx->subLevel($decodedTitle, $navId, $navClass, $isNavHidden, $writingDirection);

    return $levelNavPoint;
}
445
/**
 * Step back one chapter level.
 *
 * Subsequent chapters will be added to this chapter's parent level.
 * The call is forwarded unchanged to the Ncx object.
 */
function backLevel() {
    $this->ncx->backLevel();
}
454
/**
 * Step back to the root level.
 *
 * Subsequent chapters will be added to the root NavMap.
 * The call is forwarded unchanged to the Ncx object.
 */
function rootLevel() {
    $this->ncx->rootLevel();
}
463
/**
 * Step back to the given level.
 * Useful for returning to a previous level from deep within the structure.
 * Values below 2 will have the same effect as rootLevel()
 *
 * The call is forwarded unchanged to the Ncx object.
 *
 * @param int $newLevel
 */
function setCurrentLevel($newLevel) {
    $this->ncx->setCurrentLevel($newLevel);
}
474
/**
 * Report the current level count, i.e. the indentation of the current
 * structure point.
 *
 * @return mixed The value reported by the Ncx object (presumably an int; confirm against Ncx::getCurrentLevel()).
 */
function getCurrentLevel() {
    $currentLevel = $this->ncx->getCurrentLevel();

    return $currentLevel;
}
484
/**
 * Surround chapter content with the configured HTML header and footer.
 *
 * Header, content and footer are joined with a single newline between them.
 *
 * @param string $content Chapter content.
 * @return string The header, the content and the footer as one string.
 */
private function wrapChapter($content) {
    $sections = array($this->htmlContentHeader, $content, $this->htmlContentFooter);

    return implode("\n", $sections);
}
494
/**
 * Reference pages are usually one or two pages for items such as Table of Contents, reference lists, Author notes or Acknowledgements.
 * These do not show up in the regular navigation list, as they are supposed to be short.
 *
 * The page is stored under the id "ref_" . $reference. Unless it is a Table
 * of Contents reference that has already been registered, the page is also
 * added to the spine and to the OPF and NCX reference lists.
 *
 * @param string $pageName           Name of the page, will be used in the TOC
 * @param string $fileName           Filename to use for the page, must be unique for the book.
 * @param string $pageData           Page content in XHTML. File should NOT exceed 250kB.
 * @param string $reference          Reference key
 * @param int    $externalReferences How to handle external references. See documentation for <code>processChapterExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir            Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 * @return bool FALSE if the book is finalized, otherwise TRUE. Note that TRUE is also returned when $pageData is empty or not a string and nothing was added.
 */
function addReferencePage($pageName, $fileName, $pageData, $reference, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);

    if (!empty($pageData) && is_string($pageData)) {
        if ($this->encodeHTML === TRUE) {
            $pageData = $this->encodeHtml($pageData);
        }

        // A call to wrapChapter($pageData) used to sit here with its return value
        // thrown away, so it never had any effect and has been removed. Do NOT
        // "repair" it by assigning the result: setCoverImage() passes a complete
        // XHTML document, which would then be wrapped a second time.

        if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
            $htmlDirInfo = pathinfo($fileName);
            $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/");
            $this->processChapterExternalReferences($pageData, $externalReferences, $baseDir, $htmlDir);
        }

        $this->addFile($fileName, "ref_" . $reference, $pageData, "application/xhtml+xml");

        // A Table of Contents reference is only registered once.
        if ($reference !== Reference::TABLE_OF_CONTENTS || !isset($this->ncx->referencesList[$reference])) {
            $this->opf->addItemRef("ref_" . $reference, FALSE);
            $this->opf->addReference($reference, $pageName, $fileName);

            $this->ncx->referencesList[$reference] = $fileName;
            $this->ncx->referencesName[$reference] = $pageName;
        }
    }
    return TRUE;
}
543
/**
 * Add custom metadata to the book.
 *
 * It is up to the builder to make sure there are no collisions. Metadata are just key value pairs.
 * The call is ignored once the book has been finalized, in line with
 * addDublinCoreMetadata() and the other methods that change the book.
 *
 * @param string $name
 * @param string $content
 */
function addCustomMetadata($name, $content) {
    if ($this->isFinalized) {
        return;
    }

    $this->opf->addMeta($name, $content);
}
555
/**
 * Add DublinCore metadata to the book.
 *
 * Use the DublinCore constants included in EPub, ie DublinCore::DATE.
 * The value is passed through decodeHtmlEntities() before it is stored.
 * Nothing happens once the book has been finalized.
 *
 * @param string $dublinCoreConstant name of the DublinCore element
 * @param string $value
 */
function addDublinCoreMetadata($dublinCoreConstant, $value) {
    if (!$this->isFinalized) {
        $decodedValue = $this->decodeHtmlEntities($value);
        $this->opf->addDCMeta($dublinCoreConstant, $decodedValue);
    }
}
571
/**
 * Add a cover image to the book.
 * If the $imageData is not set, the function assumes the $fileName is the path to the image file.
 *
 * The styling and structure of the generated XHTML is heavily inspired by the XHTML generated by Calibre.
 *
 * Three files are added: "Styles/CoverPage.css", the image below "images/"
 * and the reference page "CoverPage.xhtml".
 *
 * @param string $fileName  Filename to use for the image, must be unique for the book.
 * @param string $imageData Binary image data
 * @param string $mimetype  Image mimetype, such as "image/jpeg" or "image/png".
 * @param string $bookTitle Text placed on the cover page next to the image. It is inserted as-is, without escaping. Defaults to "" so the optional parameters before it stay optional.
 * @return bool $success FALSE if the book is finalized or a cover has already been set.
 */
function setCoverImage($fileName, $imageData = NULL, $mimetype = NULL, $bookTitle = "") {
    // The guard has to look for the file that is actually added at the end of
    // this method ("CoverPage.xhtml"); it used to look for "CoverPage.html".
    if ($this->isFinalized || $this->isCoverImageSet || array_key_exists("CoverPage.xhtml", $this->fileList)) {
        return FALSE;
    }

    if ($imageData == NULL) {
        // assume $fileName is the valid file path.
        if (!file_exists($fileName)) {
            // Attempt to locate the file using the doc root.
            $rp = realpath($this->docRoot . "/" . $fileName);

            if ($rp !== FALSE) {
                // only assign the docroot path if it actually exists there.
                $fileName = $rp;
            }
        }
        $image = $this->getImage($fileName);
        $imageData = $image['image'];
        $mimetype = $image['mime'];
        // getImage() may have converted the image, so take over its extension.
        $fileName = preg_replace("#\.[^\.]+$#", "." . $image['ext'], $fileName);
    }

    $path = pathinfo($fileName);
    $imgPath = "images/" . $path["basename"];

    if (empty($mimetype) && file_exists($fileName)) {
        // getimagesize() returns FALSE for unreadable or non-image files; in
        // that case fall through to the extension based guess below.
        $imageInfo = getimagesize($fileName);
        if ($imageInfo !== FALSE) {
            $mimetype = image_type_to_mime_type($imageInfo[2]);
        }
    }
    if (empty($mimetype)) {
        $ext = isset($path['extension']) ? strtolower($path['extension']) : "";
        if ($ext == "jpg") {
            $ext = "jpeg";
        }
        $mimetype = "image/" . $ext;
    }

    $coverPage = "";

    if ($this->isEPubVersion2()) {
        $coverPage = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            . "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n"
            . " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" xml:lang=\"en\">\n"
            . "\t<head>\n"
            . "\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>\n"
            . "\t\t<title>Cover Image</title>\n"
            . "\t\t<link type=\"text/css\" rel=\"stylesheet\" href=\"Styles/CoverPage.css\" />\n"
            . "\t</head>\n"
            . "\t<body>\n"
            . "\t" . $bookTitle . "\n"
            . "\t\t<div>\n"
            . "\t\t\t<img src=\"" . $imgPath . "\" alt=\"Cover image\" style=\"height: 100%\"/>\n"
            . "\t\t</div>\n"
            . "\t</body>\n"
            . "</html>\n";
    } else {
        $coverPage = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            . "<head>"
            . "\t<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n"
            . "\t\t<title>Cover Image</title>\n"
            . "\t\t<link type=\"text/css\" rel=\"stylesheet\" href=\"Styles/CoverPage.css\" />\n"
            . "\t</head>\n"
            . "\t<body>\n"
            . "\t\t<section epub:type=\"cover\">\n"
            . "\t" . $bookTitle . "\n"
            . "\t\t\t<img src=\"" . $imgPath . "\" alt=\"Cover image\" style=\"height: 30%\"/>\n"
            . "\t\t</section>\n"
            . "\t</body>\n"
            . "</html>\n";
    }
    $coverPageCss = "@page, body, div, img {\n"
        . "\tpadding: 0pt;\n"
        . "\tmargin:0pt;\n"
        . "}\n\nbody {\n"
        . "\ttext-align: center;\n"
        . "}\n";

    $this->addCSSFile("Styles/CoverPage.css", "CoverPageCss", $coverPageCss);
    $this->addFile($imgPath, "CoverImage", $imageData, $mimetype);
    $this->addReferencePage("CoverPage", "CoverPage.xhtml", $coverPage, "cover");
    $this->isCoverImageSet = TRUE;
    return TRUE;
}
669
/**
 * Process external references from a HTML to the book. The chapter itself is not stored.
 * The HTML is scanned for &lt;link..., &lt;style..., &lt;img and &lt;source tags.
 * Embedded CSS styles and links will also be processed.
 * Script tags are not processed, as scripting should be avoided in e-books.
 *
 * EPub keeps track of added files, and duplicate files referenced across multiple
 * chapters are only added once.
 *
 * If $doc is a string, it is assumed to be the content of an HTML file and is
 * replaced by a re-serialised XHTML document; otherwise it is assumed to be a
 * DOMDocument and is modified in place.
 *
 * Basedir is the root dir the HTML is supposed to "live" in, used to resolve
 * relative references such as <code>&lt;img src="../images/image.png"/&gt;</code>
 *
 * $externalReferences determines how the function will handle external references.
 *
 * @param mixed  &$doc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized, $externalReferences == EXTERNAL_REF_IGNORE,
 *              or a string $doc produced no html element when parsed).
 */
protected function processChapterExternalReferences(&$doc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    // Turn every "dir/" segment of $htmlDir into "../" to get back to the archive root.
    $backPath = preg_replace('#[^/]+/#i', "../", $htmlDir);
    $isDocAString = is_string($doc);
    $xmlDoc = NULL;

    if ($isDocAString) {
        $xmlDoc = new DOMDocument();
        // Real-world HTML is rarely clean; parser warnings are deliberately suppressed.
        @$xmlDoc->loadHTML($doc);
    } else {
        $xmlDoc = $doc;
    }

    $this->processChapterStyles($xmlDoc, $externalReferences, $baseDir, $htmlDir);
    $this->processChapterLinks($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);
    $this->processChapterImages($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);
    $this->processChapterSources($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);

    if ($isDocAString) {
        $htmlNode = $xmlDoc->getElementsByTagName("html")->item(0);
        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);

        if ($htmlNode === NULL) {
            // Nothing usable was parsed (e.g. an empty string); leave $doc untouched
            // instead of dereferencing a missing root element.
            return FALSE;
        }

        // Carry over every attribute of the original <html> element except xmlns, which
        // is written explicitly below. Values are escaped because nodeValue is the
        // decoded text and may contain quotes or ampersands.
        $htmlNS = "";
        foreach ($htmlNode->attributes as $attribute) {
            if ($attribute->nodeName != "xmlns") {
                $htmlNS .= " " . $attribute->nodeName . "=\"" . htmlspecialchars($attribute->nodeValue, ENT_QUOTES, "UTF-8") . "\"";
            }
        }

        $xml = new DOMDocument('1.0', "utf-8");
        $xml->preserveWhiteSpace = FALSE;
        $xml->formatOutput = TRUE;

        $xml2Doc = new DOMDocument('1.0', "utf-8");
        $xml2Doc->loadXML("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\"$htmlNS>\n</html>\n");
        $html = $xml2Doc->getElementsByTagName("html")->item(0);

        // The HTML parser does not always produce a head or body element; import
        // only the parts that exist.
        if ($headNode !== NULL) {
            $html->appendChild($xml2Doc->importNode($headNode, TRUE));
        }
        if ($bodyNode !== NULL) {
            $html->appendChild($xml2Doc->importNode($bodyNode, TRUE));
        }

        // force pretty printing and correct formatting, should not be needed, but it is.
        $xml->loadXML($xml2Doc->saveXML());
        $doc = $xml->saveXML();

        if (!$this->isEPubVersion2()) {
            // Only the ePub 2 output keeps the XHTML 1.1 doctype.
            $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
        }
    }
    return TRUE;
}
754
/**
 * Scan a CSS text for url(...) references and bring the referenced images into the book.
 *
 * Depending on $externalReferences each url(...) occurrence is either stripped from the
 * CSS, or resolved through resolveImage() and rewritten to point at the copy inside
 * the archive.
 *
 * @param string &$cssFile (referenced) The CSS text; rewritten in place.
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $cssDir The path of the CSS file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processCSSExternalReferences(&$cssFile, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $cssDir = "") {
    if ($externalReferences === EPub::EXTERNAL_REF_IGNORE || $this->isFinalized) {
        return FALSE;
    }

    // Each "dir/" segment of $cssDir becomes "../", giving the way back to the archive root.
    $backPath = preg_replace('#[^/]+/#i', "../", $cssDir);

    $matches = null;
    preg_match_all('#url\s*\([\'\"\s]*(.+?)[\'\"\s]*\)#im', $cssFile, $matches, PREG_SET_ORDER);

    $stripImages = ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES
        || $externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES);

    foreach ($matches as $match) {
        // $match[0] is the whole url(...) expression, $match[1] the bare reference.
        if ($stripImages) {
            $cssFile = str_replace($match[0], "", $cssFile);
            continue;
        }

        $source = $match[1];
        $internalSrc = pathinfo($source, PATHINFO_BASENAME);
        $internalPath = "";
        $isSourceExternal = FALSE;

        $resolved = $this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $cssDir, $backPath);
        if ($resolved) {
            $cssFile = str_replace($match[0], "url('" . $backPath . $internalPath . "')", $cssFile);
        } elseif ($isSourceExternal) {
            // External image is missing
            $cssFile = str_replace($match[0], "", $cssFile);
        }
        // A local image that could not be found is left alone: assume it's been generated.
    }
    return TRUE;
}
798
/**
 * Process the inline style elements of a DOMDocument.
 *
 * The text of every style element is stripped of CDATA wrappers, run through
 * processCSSExternalReferences() and written back into the element.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int         $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string      $baseDir Default is "", meaning it is pointing to the document root.
 * @param string      $htmlDir The path to the parent HTML file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterStyles(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") {
    if ($externalReferences === EPub::EXTERNAL_REF_IGNORE || $this->isFinalized) {
        return FALSE;
    }

    // Opening and closing CDATA markers, including any comment characters wrapped around them.
    $cdataWrappers = array(
        '#[/\*\s]*\<\!\[CDATA\[[\s\*/]*#im',
        '#[/\*\s]*\]\]\>[\s\*/]*#im',
    );

    $styleNodes = $xmlDoc->getElementsByTagName("style");
    for ($i = 0, $total = $styleNodes->length; $i < $total; $i++) {
        $node = $styleNodes->item($i);

        $css = preg_replace($cdataWrappers, "", $node->nodeValue);

        $this->processCSSExternalReferences($css, $externalReferences, $baseDir, $htmlDir);
        $node->nodeValue = "\n" . trim($css) . "\n";
    }
    return TRUE;
}
827
/**
 * Process link tags in a DOMDocument. Linked files will be loaded into the archive.
 * Links of type text/css are passed on as CSS files and their href is rewritten to
 * point to the copy in the archive; other types are added as plain files.
 *
 * Link elements without a usable href are skipped.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int         $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string      $baseDir Default is "", meaning it is pointing to the document root.
 * @param string      $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string      $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterLinks(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }
    // process link tags.
    $links = $xmlDoc->getElementsByTagName("link");
    $linkCount = $links->length;
    for ($linkIdx = 0; $linkIdx < $linkCount; $linkIdx++) {
        $link = $links->item($linkIdx);

        // A link without href has nothing to fetch; dereferencing the missing
        // attribute node would be a fatal error.
        $hrefNode = $link->attributes->getNamedItem("href");
        if ($hrefNode === NULL || $hrefNode->nodeValue === "") {
            continue;
        }
        $source = $hrefNode->nodeValue;
        $sourceData = NULL;

        $internalSrc = pathinfo($source, PATHINFO_BASENAME);

        if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
            $urlinfo = parse_url($source);
            // parse_url() may fail or omit the path (e.g. "http://host").
            $urlPath = (is_array($urlinfo) && isset($urlinfo['path'])) ? $urlinfo['path'] : "";

            $baseDirPos = strpos($urlPath, $baseDir . "/");
            if ($baseDirPos !== FALSE) {
                // Keep the part of the URL path that follows $baseDir as the internal name.
                $internalSrc = substr($urlPath, $baseDirPos + strlen($baseDir) + 1);
            }

            // getFileContents() is a method of this class (resolveMedia() calls it through
            // $this); the bare function call used previously is a fatal "undefined function".
            // NOTE(review): assumes that without the second argument the method returns
            // the content itself rather than a temp file path - confirm.
            @$sourceData = $this->getFileContents($source);
        } else if (strpos($source, "/") === 0) {
            @$sourceData = file_get_contents($this->docRoot . $source);
        } else {
            @$sourceData = file_get_contents($this->docRoot . $baseDir . "/" . $source);
        }

        if (!empty($sourceData)) {
            if (!array_key_exists($internalSrc, $this->fileList)) {
                // The type attribute is optional; fall back to text/plain when it is absent or empty.
                $typeNode = $link->attributes->getNamedItem("type");
                $mime = ($typeNode !== NULL) ? $typeNode->nodeValue : "";
                if (empty($mime)) {
                    $mime = "text/plain";
                }
                if ($mime == "text/css") {
                    // The CSS references are resolved here, so addCSSFile is told to ignore them.
                    $this->processCSSExternalReferences($sourceData, $externalReferences, $baseDir, $htmlDir);
                    $this->addCSSFile($internalSrc, $internalSrc, $sourceData, EPub::EXTERNAL_REF_IGNORE, $baseDir);
                    $link->setAttribute("href", $backPath . $internalSrc);
                } else {
                    $this->addFile($internalSrc, $internalSrc, $sourceData, $mime);
                }
                $this->fileList[$internalSrc] = $source;
            } else {
                // Already in the archive: just point the link at the existing copy.
                $link->setAttribute("href", $backPath . $internalSrc);
            }
        } // else do nothing, if the link is local, and missing, assume it's been generated.
    }
    return TRUE;
}
890
/**
 * Process img tags in a DOMDocument.
 * $externalReferences determines what happens to the images: they are removed, replaced
 * by their alt text, or resolved into the archive with the img src rewritten accordingly.
 *
 * Images without a src attribute are left untouched when images are being added.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int         $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string      $baseDir Default is "", meaning it is pointing to the document root.
 * @param string      $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string      $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterImages(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }
    // process img tags.
    // Removals/replacements are queued and applied after the scan, so the live
    // node list is not modified while it is being indexed.
    $postProcDomElememts = array();
    $images = $xmlDoc->getElementsByTagName("img");
    $itemCount = $images->length;

    for ($idx = 0; $idx < $itemCount; $idx++) {
        $img = $images->item($idx);

        if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) {
            $postProcDomElememts[] = $img;
        } else if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) {
            $altNode = $img->attributes->getNamedItem("alt");
            $alt = "image";
            if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) {
                $alt = $altNode->nodeValue;
            }
            // nodeValue is decoded text; escape it before embedding it in markup so that
            // characters such as "<" or "&" in the alt text cannot break the fragment.
            $postProcDomElememts[] = array($img, $this->createDomFragment($xmlDoc, "<em>[" . htmlspecialchars($alt, ENT_QUOTES, "UTF-8") . "]</em>"));
        } else {
            $srcNode = $img->attributes->getNamedItem("src");
            if ($srcNode === NULL) {
                // No src attribute: nothing to resolve.
                continue;
            }
            $source = $srcNode->nodeValue;

            // parse_url() may fail or return no path component.
            $parsedSource = parse_url($source);
            $sourcePath = (is_array($parsedSource) && isset($parsedSource['path'])) ? $parsedSource['path'] : "";
            $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($sourcePath, PATHINFO_BASENAME)));
            $internalPath = "";
            $isSourceExternal = FALSE;

            if ($this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) {
                $img->setAttribute("src", $backPath . $internalPath);
            } else if ($isSourceExternal) {
                $postProcDomElememts[] = $img; // External image is missing
            } // else do nothing, if the image is local, and missing, assume it's been generated.
        }
    }

    foreach ($postProcDomElememts as $target) {
        if (is_array($target)) {
            // array(original node, replacement fragment)
            $target[0]->parentNode->replaceChild($target[1], $target[0]);
        } else {
            $target->parentNode->removeChild($target);
        }
    }
    return TRUE;
}
949
/**
 * Process source tags in a DOMDocument.
 * $externalReferences determines what happens to the referenced media: the source elements
 * are removed, replaced by a text placeholder, or resolved into the archive with their
 * src rewritten accordingly. For books that are not ePub 3 the elements are always removed.
 *
 * Source elements without a src attribute are left untouched when media is being added.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int         $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string      $baseDir Default is "", meaning it is pointing to the document root.
 * @param string      $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string      $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterSources(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    if ($this->bookVersion !== EPub::BOOK_VERSION_EPUB3) {
        // ePub 2 does not support multimedia formats, and they must be removed.
        $externalReferences = EPub::EXTERNAL_REF_REMOVE_IMAGES;
    }

    // Removals/replacements are queued and applied after the scan, so the live
    // node list is not modified while it is being indexed.
    $postProcDomElememts = array();
    $images = $xmlDoc->getElementsByTagName("source");
    $itemCount = $images->length;
    for ($idx = 0; $idx < $itemCount; $idx++) {
        $img = $images->item($idx);
        if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) {
            $postProcDomElememts[] = $img;
        } else if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) {
            $altNode = $img->attributes->getNamedItem("alt");
            $alt = "image";
            if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) {
                $alt = $altNode->nodeValue;
            }
            // nodeValue is decoded text; escape it before embedding it in markup.
            $postProcDomElememts[] = array($img, $this->createDomFragment($xmlDoc, "[" . htmlspecialchars($alt, ENT_QUOTES, "UTF-8") . "]"));
        } else {
            $srcNode = $img->attributes->getNamedItem("src");
            if ($srcNode === NULL) {
                // No src attribute: nothing to resolve.
                continue;
            }
            $source = $srcNode->nodeValue;

            // parse_url() may fail or return no path component.
            $parsedSource = parse_url($source);
            $sourcePath = (is_array($parsedSource) && isset($parsedSource['path'])) ? $parsedSource['path'] : "";
            $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($sourcePath, PATHINFO_BASENAME)));
            $internalPath = "";
            $isSourceExternal = FALSE;

            if ($this->resolveMedia($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) {
                $img->setAttribute("src", $backPath . $internalPath);
            } else if ($isSourceExternal) {
                $postProcDomElememts[] = $img; // External media is missing
            } // else do nothing, if the media is local, and missing, assume it's been generated.
        }
    }

    // Apply the queued changes. This step was missing, which left every collected
    // element in the document (including the ones ePub 2 requires to be removed).
    foreach ($postProcDomElememts as $target) {
        if (is_array($target)) {
            // array(original node, replacement fragment)
            $target[0]->parentNode->replaceChild($target[1], $target[0]);
        } else {
            $target->parentNode->removeChild($target);
        }
    }
    return TRUE;
}
1002
/**
 * Resolve an image src, determine its target location and add it to the book.
 *
 * @param string $source Image Source link.
 * @param string &$internalPath (referenced) Return value, will be set to the target path and name in the book.
 * @param string &$internalSrc (referenced) Return value, will be set to the target name in the book.
 * @param bool   &$isSourceExternal (referenced) Return value, will be set to TRUE if the image originated from a full URL.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir. Not used by this method.
 *
 * @return bool TRUE if the image was loaded (and added to the book unless it was already listed),
 *              FALSE if the book is finalized or the image could not be loaded.
 */
protected function resolveImage($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $imageData = NULL;

    if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
        // Full URL: the image lives outside the document root.
        $isSourceExternal = TRUE;

        $urlinfo = parse_url($source);
        if (!is_array($urlinfo) || !isset($urlinfo["scheme"], $urlinfo["host"])) {
            // Malformed URL; report it as a missing external image.
            return FALSE;
        }
        $urlPath = isset($urlinfo['path']) ? $urlinfo['path'] : "";

        $baseDirPos = strpos($urlPath, $baseDir . "/");
        if ($baseDirPos !== FALSE) {
            // Keep the part of the URL path that follows $baseDir as the internal name.
            $internalSrc = $this->sanitizeFileName(urldecode(substr($urlPath, $baseDirPos + strlen($baseDir) + 1)));
        }
        $internalPath = $urlinfo["scheme"] . "/" . $urlinfo["host"] . "/" . pathinfo($urlPath, PATHINFO_DIRNAME);
        $imageData = $this->getImage($source);
    } else if (strpos($source, "/") === 0) {
        // Absolute path: try it as given, then relative to the document root.
        $internalPath = pathinfo($source, PATHINFO_DIRNAME);

        $path = $source;
        if (!file_exists($path)) {
            $path = $this->docRoot . $path;
        }

        $imageData = $this->getImage($path);
    } else {
        // Relative path: resolve against $baseDir, then against the document root.
        $internalPath = $htmlDir . "/" . preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME));

        $path = $baseDir . "/" . $source;
        if (!file_exists($path)) {
            $path = $this->docRoot . $path;
        }

        $imageData = $this->getImage($path);
    }
    if ($imageData !== FALSE) {
        // Make the stored name carry the extension reported for the loaded image.
        $iSrcInfo = pathinfo($internalSrc);
        $currentExt = isset($iSrcInfo['extension']) ? $iSrcInfo['extension'] : "";
        if (!empty($imageData['ext']) && $imageData['ext'] != $currentExt) {
            $internalSrc = $iSrcInfo['filename'] . "." . $imageData['ext'];
        }
        $internalPath = Zip::getRelativePath("images/" . $internalPath . "/" . $internalSrc);
        if (!array_key_exists($internalPath, $this->fileList)) {
            $this->addFile($internalPath, "i_" . $internalSrc, $imageData['image'], $imageData['mime']);
            $this->fileList[$internalPath] = $source;
        }
        return TRUE;
    }
    return FALSE;
}
1063
/**
 * Resolve a media src, determine its target location and add it to the book.
 *
 * @param string $source Source link.
 * @param string &$internalPath (referenced) Return value, will be set to the target path and name in the book.
 * @param string &$internalSrc (referenced) Return value, will be set to the target name in the book.
 * @param bool   &$isSourceExternal (referenced) Return value, will be set to TRUE if the media originated from a full URL.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir. Not used by this method.
 *
 * @return bool FALSE if the book is finalized or an external source could not be fetched, TRUE otherwise.
 */
protected function resolveMedia($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $mediaPath = NULL;
    // Path of the downloaded temp file, set only for external sources.
    $tmpFile = NULL;

    if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
        // Full URL: the media lives outside the document root.
        $isSourceExternal = TRUE;

        $urlinfo = parse_url($source);
        if (!is_array($urlinfo) || !isset($urlinfo["scheme"], $urlinfo["host"])) {
            // Malformed URL; report it as a missing external source.
            return FALSE;
        }
        $urlPath = isset($urlinfo['path']) ? $urlinfo['path'] : "";

        $baseDirPos = strpos($urlPath, $baseDir . "/");
        if ($baseDirPos !== FALSE) {
            // Keep the part of the URL path that follows $baseDir as the internal name.
            $internalSrc = substr($urlPath, $baseDirPos + strlen($baseDir) + 1);
        }
        $internalPath = $urlinfo["scheme"] . "/" . $urlinfo["host"] . "/" . pathinfo($urlPath, PATHINFO_DIRNAME);
        // The result is used as a file path below and unlinked afterwards, i.e. with the
        // second argument set the content is delivered as a temp file (FALSE on failure).
        $mediaPath = $this->getFileContents($source, true);
        $tmpFile = $mediaPath;
    } else if (strpos($source, "/") === 0) {
        // Absolute path: try it as given, then relative to the document root.
        $internalPath = pathinfo($source, PATHINFO_DIRNAME);

        $mediaPath = $source;
        if (!file_exists($mediaPath)) {
            $mediaPath = $this->docRoot . $mediaPath;
        }
    } else {
        // Relative path: resolve against $baseDir, then against the document root.
        $internalPath = $htmlDir . "/" . preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME));

        $mediaPath = $baseDir . "/" . $source;
        if (!file_exists($mediaPath)) {
            $mediaPath = $this->docRoot . $mediaPath;
        }
    }

    if ($mediaPath !== FALSE) {
        $mime = $this->getMime($source);
        $internalPath = Zip::getRelativePath("media/" . $internalPath . "/" . $internalSrc);

        if (!array_key_exists($internalPath, $this->fileList) &&
                $this->addLargeFile($internalPath, "m_" . $internalSrc, $mediaPath, $mime)) {
            $this->fileList[$internalPath] = $source;
        }
        if ($tmpFile !== NULL) {
            // Clean up the downloaded temp copy.
            unlink($tmpFile);
        }
        return TRUE;
    }
    return FALSE;
}
1123
/**
 * Report how many chapters the book currently counts.
 *
 * @access public
 * @return int The value of the chapter counter.
 */
function getChapterCount() {
    $count = $this->chapterCount;
    return $count;
}
1133
/**
 * Set the book title (mandatory).
 *
 * The title is used for the dc:title metadata parameter in the OPF file as well as
 * the DocTitle attribute in the NCX file.
 *
 * @param string $title
 * @access public
 * @return bool TRUE when the title was stored, FALSE when the book is already finalized.
 */
function setTitle($title) {
    $accepted = !$this->isFinalized;
    if ($accepted) {
        $this->title = $title;
    }
    return $accepted;
}
1150
/**
 * Return the book title as stored by setTitle().
 *
 * @access public
 * @return string The book title.
 */
function getTitle() {
    $title = $this->title;
    return $title;
}
1160
/**
 * Set the book language (mandatory).
 *
 * Use the RFC3066 language codes, such as "en", "da", "fr" etc. Only values that are
 * exactly two characters long are accepted.
 *
 * Used for the dc:language metadata parameter in the OPF file.
 *
 * @param string $language
 * @access public
 * @return bool TRUE when the language was stored, FALSE when the book is finalized
 *              or the code is not two characters long.
 */
function setLanguage($language) {
    if (!$this->isFinalized && mb_strlen($language) == 2) {
        $this->language = $language;
        return TRUE;
    }
    return FALSE;
}
1180
/**
 * Return the book language code.
 *
 * @access public
 * @return string The language code.
 */
function getLanguage() {
    $language = $this->language;
    return $language;
}
1190
/**
 * Set the unique book identifier (mandatory).
 * Use the URI, or ISBN if available.
 *
 * The identifier is an unambiguous reference to the resource within a given context;
 * recommended best practice is a string conforming to a formal identification system.
 *
 * Used for the dc:identifier metadata parameter in the OPF file, as well
 * as dtb:uid in the NCX file.
 *
 * The identifier type must be one of:
 *   EPub::IDENTIFIER_URI
 *   EPub::IDENTIFIER_ISBN
 *   EPub::IDENTIFIER_UUID
 *
 * @param string $identifier
 * @param string $identifierType
 * @access public
 * @return bool TRUE when stored, FALSE when the book is finalized or the type is not one of the above.
 */
function setIdentifier($identifier, $identifierType) {
    $allowedTypes = array(EPub::IDENTIFIER_URI, EPub::IDENTIFIER_ISBN, EPub::IDENTIFIER_UUID);

    // Strict membership test: the type has to be identical to one of the constants.
    if ($this->isFinalized || !in_array($identifierType, $allowedTypes, TRUE)) {
        return FALSE;
    }

    $this->identifierType = $identifierType;
    $this->identifier = $identifier;
    return TRUE;
}
1221
/**
 * Return the book identifier.
 *
 * @access public
 * @return string The identifier.
 */
function getIdentifier() {
    $identifier = $this->identifier;
    return $identifier;
}
1231
/**
 * Return the type of the book identifier.
 *
 * @access public
 * @return string The identifier type.
 */
function getIdentifierType() {
    $identifierType = $this->identifierType;
    return $identifierType;
}
1241
/**
 * Set the book description (optional).
 *
 * An account of the resource. A description may include but is not limited to:
 * an abstract, a table of contents, a graphical representation, or a free-text
 * account of the resource.
 *
 * Intended for the description metadata of the OPF file.
 * NOTE(review): confirm the exact element against the code that writes the OPF metadata.
 *
 * @param string $description
 * @access public
 * @return bool TRUE when the description was stored, FALSE when the book is already finalized.
 */
function setDescription($description) {
    if (!$this->isFinalized) {
        $this->description = $description;
        return TRUE;
    }
    return FALSE;
}
1264
/**
 * Return the book description.
 *
 * @access public
 * @return string The description.
 */
function getDescription() {
    $description = $this->description;
    return $description;
}
1274
/**
 * Set the book author or creator (optional).
 *
 * $author is the plain "Firstnames Lastname" form; $authorSortKey says how the name is
 * to be sorted, usually "Lastname, First names".
 *
 * The creator is the entity primarily responsible for making the resource, e.g. a
 * person, an organization, or a service.
 *
 * Used for the dc:creator metadata parameter in the OPF file and the docAuthor
 * attribute in the NCX file. The sort key is used for the opf:file-as attribute
 * in dc:creator.
 *
 * @param string $author
 * @param string $authorSortKey
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized.
 */
function setAuthor($author, $authorSortKey) {
    $accepted = !$this->isFinalized;
    if ($accepted) {
        $this->authorSortKey = $authorSortKey;
        $this->author = $author;
    }
    return $accepted;
}
1303
/**
 * Return the book author.
 *
 * @access public
 * @return string The author.
 */
function getAuthor() {
    $author = $this->author;
    return $author;
}
1313
/**
 * Set the publisher information (optional).
 *
 * The publisher is the entity responsible for making the resource available,
 * e.g. a person, an organization, or a service.
 *
 * Used for the dc:publisher and dc:relation metadata parameters in the OPF file.
 *
 * @param string $publisherName
 * @param string $publisherURL
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized.
 */
function setPublisher($publisherName, $publisherURL) {
    if (!$this->isFinalized) {
        $this->publisherURL = $publisherURL;
        $this->publisherName = $publisherName;
        return TRUE;
    }
    return FALSE;
}
1337
/**
 * Return the name of the book publisher.
 *
 * @access public
 * @return string The publisher name.
 */
function getPublisherName() {
    $publisherName = $this->publisherName;
    return $publisherName;
}
1347
/**
 * Return the URL of the book publisher.
 *
 * @access public
 * @return string The publisher URL.
 */
function getPublisherURL() {
    $publisherURL = $this->publisherURL;
    return $publisherURL;
}
1357
/**
 * Set the release date (optional). If left blank, the time of the finalization
 * will be used.
 *
 * A point or period of time associated with an event in the lifecycle of the
 * resource. Recommended best practice is to use an encoding scheme such as the
 * W3CDTF profile of ISO 8601 [W3CDTF].
 *
 * Used for the dc:date metadata parameter in the OPF file; the timestamp is stored
 * both on the book and on its OPF object.
 *
 * @param long $timestamp
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized.
 */
function setDate($timestamp) {
    $accepted = !$this->isFinalized;
    if ($accepted) {
        $this->date = $timestamp;
        $this->opf->date = $timestamp;
    }
    return $accepted;
}
1383
/**
 * Return the book date as stored by setDate().
 *
 * @access public
 * @return long The date value.
 */
function getDate() {
    $date = $this->date;
    return $date;
}
1393
/**
 * Set the book (copy)rights text (optional).
 *
 * Information about rights held in and over the resource. Typically this is a
 * statement about various property rights associated with the resource,
 * including intellectual property rights.
 *
 * Used for the dc:rights metadata parameter in the OPF file.
 *
 * @param string $rightsText
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized.
 */
function setRights($rightsText) {
    if (!$this->isFinalized) {
        $this->rights = $rightsText;
        return TRUE;
    }
    return FALSE;
}
1416
/**
 * Return the book rights text.
 *
 * @access public
 * @return string The rights text.
 */
function getRights() {
    $rights = $this->rights;
    return $rights;
}
1426
/**
 * Add a book subject.
 *
 * The topic of the resource, typically represented using keywords, key phrases,
 * or classification codes. Recommended best practice is to use a controlled
 * vocabulary. To describe the spatial or temporal topic of the resource, use
 * the Coverage element.
 *
 * The subject is run through decodeHtmlEntities() and handed to the OPF object as a
 * DublinCore::SUBJECT entry. Nothing happens once the book is finalized.
 *
 * @param string $subject
 */
function setSubject($subject) {
    if (!$this->isFinalized) {
        $decodedSubject = $this->decodeHtmlEntities($subject);
        $this->opf->addDCMeta(DublinCore::SUBJECT, $decodedSubject);
    }
}
1445
/**
 * Set the book source URL (optional).
 *
 * A related resource from which the described resource is derived, in whole or in
 * part. Recommended best practice is to identify the related resource by means of
 * a string conforming to a formal identification system.
 *
 * Used for the dc:source metadata parameter in the OPF file.
 *
 * @param string $sourceURL
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized.
 */
function setSourceURL($sourceURL) {
    $accepted = !$this->isFinalized;
    if ($accepted) {
        $this->sourceURL = $sourceURL;
    }
    return $accepted;
}
1468
/**
 * Return the book source URL.
 *
 * @access public
 * @return string The source URL.
 */
function getSourceURL() {
    $sourceURL = $this->sourceURL;
    return $sourceURL;
}
1478
1479 /**
1480 * Coverage, optional.
1481 *
1482 * The spatial or temporal topic of the resource, the spatial applicability
1483 * of the resource, or the jurisdiction under which the resource is relevant.
1484 *
1485 * Spatial topic and spatial applicability may be a named place or a location
1486 * specified by its geographic coordinates. Temporal topic may be a named
1487 * period, date, or date range. A jurisdiction may be a named administrative
1488 * entity or a geographic place to which the resource applies. Recommended
1489 * best practice is to use a controlled vocabulary such as the Thesaurus of
1490 * Geographic Names [TGN]. Where appropriate, named places or time periods
1491 * can be used in preference to numeric identifiers such as sets of
1492 * coordinates or date ranges.
1493 *
1494 * Used for the dc:coverage metadata parameter in the OPF file
1495 *
1496 * Same as ->addDublinCoreMetadata(DublinCore::COVERAGE, $coverage);
1497 *
1498 * @param string $coverage
1499 * @access public
1500 * @return bool $success
1501 */
function setCoverage($coverage) {
    // The value is stored only while the book has not been finalized;
    // the return value reports whether it was stored.
    $isEditable = !$this->isFinalized;
    if ($isEditable) {
        $this->coverage = $coverage;
    }
    return $isEditable;
}
1509
1510 /**
1511 * Get Book coverage.
1512 *
1513 * @access public
1514 * @return $coverage
1515 */
function getCoverage() {
    // Plain accessor for the stored coverage value.
    $currentCoverage = $this->coverage;
    return $currentCoverage;
}
1519
1520 /**
1521 * Set book Relation.
1522 *
1523 * A related resource.
1524 *
1525 * Recommended best practice is to identify the related resource by means
1526 * of a string conforming to a formal identification system.
1527 *
1528 * @param string $relation
1529 */
function setRelation($relation) {
    // Silently ignored once the book has been finalized.
    if (!$this->isFinalized) {
        $this->relation = $relation;
    }
}
1536
1537 /**
1538 * Get the book relation.
1539 *
1540 * @return string The relation.
1541 */
function getRelation() {
    // Plain accessor for the stored relation value.
    $currentRelation = $this->relation;
    return $currentRelation;
}
1545
1546 /**
1547 * Set book Generator.
1548 *
1549 * The generator is a meta tag added to the ncx file, it is not visible
1550 * from within the book, but is a kind of electronic watermark.
1551 *
1552 * @param string $generator
1553 */
function setGenerator($generator) {
    // Silently ignored once the book has been finalized.
    if (!$this->isFinalized) {
        $this->generator = $generator;
    }
}
1560
1561 /**
 * Get the book generator.
1563 *
1564 * @return string The generator identity string.
1565 */
function getGenerator() {
    // Plain accessor for the stored generator identity string.
    $currentGenerator = $this->generator;
    return $currentGenerator;
}
1569
1570 /**
 * Set ePub date format to the short yyyy-mm-dd form, for compliance with
1572 * a bug in EpubCheck, prior to its version 1.1.
1573 *
1574 * The latest version of ePubCheck can be obtained here:
1575 * http://code.google.com/p/epubcheck/
1576 *
1577 * @access public
1578 * @return bool $success
1579 */
function setShortDateFormat() {
    // Switching the format is refused once the book has been finalized.
    $isEditable = !$this->isFinalized;
    if ($isEditable) {
        // Use the short date format for the date written by finalize().
        $this->dateformat = $this->dateformatShort;
    }
    return $isEditable;
}
1587
1588 /**
1589 * @Deprecated
1590 */
function setIgnoreEmptyBuffer($ignoreEmptyBuffer = TRUE) {
    // Deprecated entry point: calling it terminates the script with a message.
    exit("Function was deprecated, functionality is no longer needed.");
}
1594
1595 /**
1596 * Set the references title for the ePub 3 landmarks section
1597 *
1598 * @param string $referencesTitle
1599 * @param string $referencesId
1600 * @param string $referencesClass
1601 * @return bool
1602 */
function setReferencesTitle($referencesTitle = "Guide", $referencesId = "", $referencesClass = "references") {
    if ($this->isFinalized) {
        return FALSE;
    }

    // Each value is trimmed when it is a string; anything else falls back
    // to the built-in default for that field.
    $title = "Guide";
    if (is_string($referencesTitle)) {
        $title = trim($referencesTitle);
    }
    $this->ncx->referencesTitle = $title;

    $id = "references";
    if (is_string($referencesId)) {
        $id = trim($referencesId);
    }
    $this->ncx->referencesId = $id;

    $class = "references";
    if (is_string($referencesClass)) {
        $class = trim($referencesClass);
    }
    $this->ncx->referencesClass = $class;

    return TRUE;
}
1612
1613 /**
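 * Choose whether finalize() asks the NCX to finalize the references
 * (it calls finalizeReferences() on the NCX only when this flag is set).
 *
 * @param bool $isReferencesAddedToToc Only a strict TRUE enables the flag.
 * @return bool FALSE if the book is already finalized, TRUE otherwise.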
1617 */
function setisReferencesAddedToToc($isReferencesAddedToToc = TRUE) {
    $isEditable = !$this->isFinalized;
    if ($isEditable) {
        // Only a strict boolean TRUE switches the flag on.
        $this->isReferencesAddedToToc = (TRUE === $isReferencesAddedToToc);
    }
    return $isEditable;
}
1625
1626 /**
1627 * Get Book status.
1628 *
1629 * @access public
1630 * @return bool
1631 */
function isFinalized() {
    // Reports the stored finalized flag (set to TRUE by finalize()).
    $finalizedFlag = $this->isFinalized;
    return $finalizedFlag;
}
1635
1636 /**
1637 * Build the Table of Contents. This is not strictly necessary, as most eReaders will build it from the navigation structure in the .ncx file.
1638 *
1639 * @param string $cssFileName Include a link to this css file in the TOC html.
1640 * @param string $tocCSSClass The TOC is a <div>, if you need special formatting, you can add a css class for that div. Default is "toc".
1641 * @param string $title Title of the Table of contents. Default is "Table of Contents". Use this for ie. languages other than English.
1642 * @param bool $addReferences include reference pages in the TOC, using the $referencesOrder array to determine the order of the pages in the TOC. Default is TRUE.
 * @param bool $addToIndex Add the TOC to the NCX index at the current level/position. Default is FALSE
 * @param string $tocFileName Change the default name of the TOC file. The default is "TOC.xhtml"
1645 */
function buildTOC($cssFileName = NULL, $tocCSSClass = "toc", $title = "Table of Contents", $addReferences = TRUE, $addToIndex = FALSE, $tocFileName = "TOC.xhtml") {
    // Registers the Table of Contents page; the page itself is rendered
    // later by finalizeTOC(). Returns FALSE when the book is already
    // finalized, TRUE once the TOC has been registered.
    if ($this->isFinalized) {
        return FALSE;
    }
    $this->buildTOC = TRUE;
    $this->tocTitle = $title;
    $this->tocFileName = $this->normalizeFileName($tocFileName);
    if (!empty($cssFileName)) {
        // Property names are case-sensitive: finalizeTOC() reads
        // tocCssFileName, so the value must be stored under that exact name
        // or the stylesheet link is never written.
        $this->tocCssFileName = $this->normalizeFileName($cssFileName);
    }
    $this->tocCSSClass = $tocCSSClass;
    $this->tocAddReferences = $addReferences;

    $this->opf->addItemRef("ref_" . Reference::TABLE_OF_CONTENTS, FALSE);
    $this->opf->addReference(Reference::TABLE_OF_CONTENTS, $title, $this->tocFileName);

    if ($addToIndex) {
        // Place the TOC in the NCX navigation at the current position.
        $navPoint = new NavPoint($this->decodeHtmlEntities($title), $this->tocFileName, "ref_" . Reference::TABLE_OF_CONTENTS);
        $this->ncx->addNavPoint($navPoint);
    } else {
        // Otherwise only record it among the NCX references.
        $this->ncx->referencesList[Reference::TABLE_OF_CONTENTS] = $this->tocFileName;
        $this->ncx->referencesName[Reference::TABLE_OF_CONTENTS] = $title;
    }

    return TRUE;
}
1670
private function finalizeTOC() {
    // Renders the Table of Contents page requested through buildTOC() and
    // adds it to the book as a reference page. Returns FALSE when no TOC
    // was requested.
    if (!$this->buildTOC) {
        return FALSE;
    }

    if (empty($this->tocTitle)) {
        $this->tocTitle = "Table of Contents";
    }

    $tocData = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";

    if ($this->isEPubVersion2()) {
        $tocData .= "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n"
            . " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"
            . "<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n";
    } else {
        $tocData .= "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            . "<head>\n<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n";
    }

    if (!empty($this->tocCssFileName)) {
        $tocData .= "<link rel=\"stylesheet\" type=\"text/css\" href=\"" . $this->tocCssFileName . "\" />\n";
    }

    $tocData .= "<title>" . $this->tocTitle . "</title>\n"
        . "</head>\n"
        . "<body>\n"
        . "<h3>" . $this->tocTitle . "</h3>\n<div";

    if (!empty($this->tocCSSClass)) {
        $tocData .= " class=\"" . $this->tocCSSClass . "\"";
    }
    $tocData .= ">\n";

    // foreach replaces the former each() loops: each() no longer exists in
    // PHP 8, and it started from the array's internal pointer, which
    // finalize() has already moved past the first chapter.
    foreach ($this->referencesOrder as $item => $descriptive) {
        if ($item === "text") {
            foreach ($this->ncx->chapterList as $chapterName => $navPoint) {
                $fileName = $navPoint->getContentSrc();
                // Indentation depth is relative to level 2; clamp so that
                // str_repeat() never receives a negative count.
                $level = max(0, $navPoint->getLevel() - 2);
                $tocData .= "\t<p>" . str_repeat(" &#160; &#160; &#160;", $level) . "<a href=\"" . $this->sanitizeFileName($fileName) . "\">" . $chapterName . "</a></p>\n";
            }
        } else if ($this->tocAddReferences === TRUE) {
            if (array_key_exists($item, $this->ncx->referencesList)) {
                $tocData .= "\t<p><a href=\"" . $this->ncx->referencesList[$item] . "\">" . $descriptive . "</a></p>\n";
            } else if ($item === "toc") {
                $tocData .= "\t<p><a href=\"TOC.xhtml\">" . $this->tocTitle . "</a></p>\n";
            } else if ($item === "cover" && $this->isCoverImageSet) {
                $tocData .= "\t<p><a href=\"CoverPage.xhtml\">" . $descriptive . "</a></p>\n";
            }
        }
    }
    $tocData .= "</div>\n</body>\n</html>\n";

    $this->addReferencePage($this->tocTitle, $this->tocFileName, $tocData, Reference::TABLE_OF_CONTENTS);
}
1728
1729 /**
1730 * @return bool
1731 */
function isEPubVersion2() {
    // Strict comparison against the ePub 2 version constant.
    $isVersion2 = (EPub::BOOK_VERSION_EPUB2 === $this->bookVersion);
    return $isVersion2;
}
1735
1736 /**
1737 * @param string $cssFileName
1738 * @param string $title
1739 * @return string
1740 */
function buildEPub3TOC($cssFileName = NULL, $title = "Table of Contents") {
    // Hand the current reference order and the cleaned book title to the
    // NCX, then let it produce the ePub 3 navigation document.
    $ncx = $this->ncx;
    $ncx->referencesOrder = $this->referencesOrder;

    $docTitle = $this->decodeHtmlEntities($this->title);
    $ncx->setDocTitle($docTitle);

    return $ncx->finalizeEPub3($title, $cssFileName);
}
1746
1747 /**
1748 * @param string $fileName
1749 * @param string $tocData
1750 * @return bool
1751 */
function addEPub3TOC($fileName, $tocData) {
    // Refused for ePub 2 books, finalized books, and file names that are
    // already present in the book.
    if ($this->isEPubVersion2()) {
        return FALSE;
    }
    if ($this->isFinalized) {
        return FALSE;
    }
    if (array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    // Normalize the path and drop any leading "/" and "." characters.
    $relativeName = Zip::getRelativePath($fileName);
    $cleanName = preg_replace('#^[/\.]+#i', "", $relativeName);

    $this->zip->addFile($tocData, $this->bookRoot . $cleanName);

    $this->fileList[$cleanName] = $cleanName;
    $this->opf->addItem("toc", $cleanName, "application/xhtml+xml", "nav");

    return TRUE;
}
1765
1766 /**
1767 * Check for mandatory parameters and finalize the e-book.
1768 * Once finalized, the book is locked for further additions.
1769 *
1770 * @return bool $success
1771 */
function finalize() {
    // Refuse to finalize twice, or without chapters, a title and a language.
    if ($this->isFinalized || $this->chapterCount == 0 || empty($this->title) || empty($this->language)) {
        return FALSE;
    }

    // Fill in defaults for the metadata the caller did not provide.
    if (empty($this->identifier) || empty($this->identifierType)) {
        $this->setIdentifier($this->createUUID(4), EPub::IDENTIFIER_UUID);
    }

    if ($this->date == 0) {
        $this->date = time();
    }

    if (empty($this->sourceURL)) {
        $this->sourceURL = $this->getCurrentPageURL();
    }

    if (empty($this->publisherURL)) {
        // The default belongs in publisherURL; this used to overwrite
        // sourceURL, discarding the source URL (even one set by the caller)
        // whenever no publisher URL was configured.
        $this->publisherURL = $this->getCurrentServerURL();
    }

    // Generate OPF data:
    $this->opf->setIdent("BookId");
    $this->opf->initialize($this->title, $this->language, $this->identifier, $this->identifierType);

    $DCdate = new DublinCore(DublinCore::DATE, gmdate($this->dateformat, $this->date));
    $DCdate->addOpfAttr("event", "publication");
    $this->opf->metadata->addDublinCore($DCdate);

    if (!empty($this->description)) {
        $this->opf->addDCMeta(DublinCore::DESCRIPTION, $this->decodeHtmlEntities($this->description));
    }

    if (!empty($this->publisherName)) {
        $this->opf->addDCMeta(DublinCore::PUBLISHER, $this->decodeHtmlEntities($this->publisherName));
    }

    if (!empty($this->publisherURL)) {
        $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->publisherURL));
    }

    if (!empty($this->author)) {
        $author = $this->decodeHtmlEntities($this->author);
        $this->opf->addCreator($author, $this->decodeHtmlEntities($this->authorSortKey), MarcCode::AUTHOR);
        $this->ncx->setDocAuthor($author);
    }

    if (!empty($this->rights)) {
        $this->opf->addDCMeta(DublinCore::RIGHTS, $this->decodeHtmlEntities($this->rights));
    }

    if (!empty($this->coverage)) {
        $this->opf->addDCMeta(DublinCore::COVERAGE, $this->decodeHtmlEntities($this->coverage));
    }

    if (!empty($this->sourceURL)) {
        $this->opf->addDCMeta(DublinCore::SOURCE, $this->sourceURL);
    }

    if (!empty($this->relation)) {
        $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->relation));
    }

    if ($this->isCoverImageSet) {
        $this->opf->addMeta("cover", "coverImage");
    }

    if (!empty($this->generator)) {
        $gen = $this->decodeHtmlEntities($this->generator);
        $this->opf->addMeta("generator", $gen);
        $this->ncx->addMetaEntry("dtb:generator", $gen);
    }

    if ($this->EPubMark) {
        $this->opf->addMeta("generator", "EPub (Version " . self::VERSION . ") by A. Grandt, http://www.phpclasses.org/package/6115");
    }

    // The first chapter becomes the "text" reference. reset()/key() replace
    // the former each() call (removed in PHP 8) and leave the array pointer
    // on the first chapter instead of moving it past that chapter.
    $firstChapterNavPoint = reset($this->ncx->chapterList);
    if ($firstChapterNavPoint !== FALSE) {
        $firstChapterName = key($this->ncx->chapterList);
        $firstChapterFileName = $firstChapterNavPoint->getContentSrc();
        $this->opf->addReference(Reference::TEXT, $this->decodeHtmlEntities($firstChapterName), $firstChapterFileName);
    }

    $this->ncx->setUid($this->identifier);

    $this->ncx->setDocTitle($this->decodeHtmlEntities($this->title));

    $this->ncx->referencesOrder = $this->referencesOrder;
    if ($this->isReferencesAddedToToc) {
        $this->ncx->finalizeReferences();
    }

    $this->finalizeTOC();

    if (!$this->isEPubVersion2()) {
        $this->addEPub3TOC("epub3toc.xhtml", $this->buildEPub3TOC());
    }

    $opfFinal = $this->fixEncoding($this->opf->finalize());
    $ncxFinal = $this->fixEncoding($this->ncx->finalize());

    // Write both package files, converting when strict UTF-8 detection
    // does not report UTF-8.
    if (mb_detect_encoding($opfFinal, 'UTF-8', true) === "UTF-8") {
        $this->zip->addFile($opfFinal, $this->bookRoot."book.opf");
    } else {
        $this->zip->addFile(mb_convert_encoding($opfFinal, "UTF-8"), $this->bookRoot."book.opf");
    }

    if (mb_detect_encoding($ncxFinal, 'UTF-8', true) === "UTF-8") {
        $this->zip->addFile($ncxFinal, $this->bookRoot."book.ncx");
    } else {
        $this->zip->addFile(mb_convert_encoding($ncxFinal, "UTF-8"), $this->bookRoot."book.ncx");
    }

    // Release the builders and mark the book as finalized.
    $this->opf = NULL;
    $this->ncx = NULL;

    $this->isFinalized = TRUE;
    return TRUE;
}
1890
1891 /**
1892 * Ensure the encoded string is a valid UTF-8 string.
1893 *
 * Note that mb_detect_encoding on the returned string will still return ASCII if the entire string is comprised of characters in the 1-127 range.
1895 *
1896 * @link: http://snippetdb.com/php/convert-string-to-utf-8-for-mysql
1897 * @param string $in_str
1898 * @return string converted string.
1899 */
function fixEncoding($in_str) {
    // Strings that are already valid UTF-8 (pure ASCII included) are
    // returned untouched. The check no longer relies on mb_detect_encoding(),
    // whose answer depends on the configured detect order.
    if (mb_check_encoding($in_str, "UTF-8")) {
        return $in_str;
    }

    // Anything else is treated as ISO-8859-1, exactly as utf8_encode() did;
    // mb_convert_encoding() replaces that function, deprecated since PHP 8.2.
    return mb_convert_encoding($in_str, "UTF-8", "ISO-8859-1");
}
1907
1908 /**
1909 * Return the finalized book.
1910 *
1911 * @return string with the book in binary form.
1912 */
function getBook() {
    // Finalize on demand so the archive is complete before it is read.
    $needsFinalizing = !$this->isFinalized;
    if ($needsFinalizing) {
        $this->finalize();
    }

    $zipData = $this->zip->getZipData();
    return $zipData;
}
1920
1921 /**
1922 * Remove disallowed characters from string to get a nearly safe filename
1923 *
1924 * @param string $fileName
1925 * @return mixed|string
1926 */
function sanitizeFileName($fileName) {
    // 1. drop every forbidden character,
    // 2. collapse runs of whitespace and dashes into a single dash,
    // 3. strip leading/trailing dots, dashes and underscores.
    $withoutForbidden = str_replace($this->forbiddenCharacters, '', $fileName);
    return trim(preg_replace('/[\s-]+/', '-', $withoutForbidden), '.-_');
}
1933
1934 /**
1935 * Cleanup the filepath, and remove leading . and / characters.
1936 *
1937 * Sometimes, when a path is generated from multiple fragments,
1938 * you can get something like "../data/html/../images/image.jpeg"
1939 * ePub files don't work well with that, this will normalize that
1940 * example path to "data/images/image.jpeg"
1941 *
1942 * @param string $fileName
1943 * @return string normalized filename
1944 */
function normalizeFileName($fileName) {
    // Resolve the path through Zip::getRelativePath() first, then remove any
    // leading "/" and "." characters from the result.
    $relativePath = Zip::getRelativePath($fileName);
    $normalized = preg_replace('#^[/\.]+#i', "", $relativePath);
    return $normalized;
}
1948
1949 /**
1950 * Save the ePub file to local disk.
1951 *
1952 * @param string $fileName
1953 * @param string $baseDir If empty baseDir is absolute to server path, if omitted it's relative to script path
1954 * @return The sent file name if successfull, FALSE if it failed.
1955 */
function saveBook($fileName, $baseDir = '.') {
    // Make the requested name safe before it is used on disk.
    $fileName = $this->sanitizeFileName($fileName);

    // The book has to be finalized before it can be written.
    if (!$this->isFinalized) {
        $this->finalize();
    }

    // Append the extension unless the name already ends in ".epub"
    // (checked case-insensitively).
    $hasEpubSuffix = (stripos(strrev($fileName), "bupe.") === 0);
    if (!$hasEpubSuffix) {
        $fileName .= ".epub";
    }

    $targetPath = $baseDir.'/'.$fileName;
    $handle = fopen($targetPath, "w");

    // Without a writable handle nothing is saved and FALSE is returned.
    if (!$handle) {
        return FALSE;
    }

    fputs($handle, $this->getBook());
    fclose($handle);

    // On success the final file name (not a boolean) is returned.
    return $fileName;
}
1984
1985 /**
1986 * Return the finalized book size.
1987 *
1988 * @return string
1989 */
function getBookSize() {
    // Finalize on demand so the reported size covers the complete archive.
    $needsFinalizing = !$this->isFinalized;
    if ($needsFinalizing) {
        $this->finalize();
    }

    $archiveSize = $this->zip->getArchiveSize();
    return $archiveSize;
}
1997
1998 /**
1999 * Send the book as a zip download
2000 *
 * Sending will fail if the output buffer is in use. The former override,
 * setIgnoreEmptyBuffer(TRUE), is deprecated and now terminates the script,
 * so it must not be called.
2004 *
2005 * @param string $fileName The name of the book without the .epub at the end.
2006 * @return The sent file name if successfull, FALSE if it failed.
2007 */
function sendBook($fileName) {
    // The book has to be finalized before it can be sent.
    if (!$this->isFinalized) {
        $this->finalize();
    }

    // Append the extension unless the name already ends in ".epub"
    // (checked case-insensitively).
    $hasEpubSuffix = (stripos(strrev($fileName), "bupe.") === 0);
    if (!$hasEpubSuffix) {
        $fileName .= ".epub";
    }

    // Only a strict TRUE from the zip layer counts as a successful send.
    $wasSent = $this->zip->sendZip($fileName, "application/epub+zip");
    if ($wasSent !== TRUE) {
        return FALSE;
    }
    return $fileName;
}
2022
2023 /**
2024 * Generates an UUID.
2025 *
2026 * Default version (4) will generate a random UUID, version 3 will URL based UUID.
2027 *
2028 * Added for convinience
2029 *
2030 * @param int $bookVersion UUID version to retrieve, See lib.uuid.manual.html for details.
2031 * @param string $url
2032 * @return string The formatted uuid
2033 */
function createUUID($bookVersion = 4, $url = NULL) {
    // The UUID library is loaded lazily, at most once per request.
    include_once("lib.uuid.php");

    $uuid = UUID::mint($bookVersion, $url, UUID::nsURL);
    return $uuid;
}
2038
2039 /**
2040 * Get the url of the current page.
2041 * Example use: Default Source URL
2042 *
 * @return string Page URL.
2044 */
function getCurrentPageURL() {
    // Server URL followed by the REQUEST_URI of the current request.
    $requestUri = filter_input(INPUT_SERVER, "REQUEST_URI");
    return $this->getCurrentServerURL() . $requestUri;
}
2049
2050 /**
2051 * Get the url of the server.
2052 * Example use: Default Publisher URL
2053 *
 * @return string Server URL.
2055 */
function getCurrentServerURL() {
    $https = filter_input(INPUT_SERVER, "HTTPS");
    $port = filter_input(INPUT_SERVER, "SERVER_PORT");

    // HTTPS is a non-empty value when the request was made over TLS; some
    // servers report "off" for plain HTTP and the casing of "on" varies.
    $isSecure = is_string($https) && $https !== "" && strtolower($https) !== "off";

    $scheme = $isSecure ? "https" : "http";
    $defaultPort = $isSecure ? "443" : "80";

    $serverURL = $scheme . "://" . filter_input(INPUT_SERVER, "SERVER_NAME");

    // Add the port only when one is known and it is not the default for the
    // scheme; this avoids "https://host:443/" and a dangling ":" when the
    // port is unavailable.
    if (!empty($port) && (string)$port !== $defaultPort) {
        $serverURL .= ":" . $port;
    }

    return $serverURL . '/';
}
2070
2071 /**
2072 * Try to determine the mimetype of the file path.
2073 *
2074 * @param string $source Path
2075 * @return string mimetype, or FALSE.
2076 */
function getMime($source) {
    // Look the file extension up in the mimetypes table.
    $extension = pathinfo($source, PATHINFO_EXTENSION);

    // Unknown or missing extensions yield FALSE, as the documentation
    // promises, instead of an undefined-index notice and NULL.
    if (!isset($this->mimetypes[$extension])) {
        return FALSE;
    }
    return $this->mimetypes[$extension];
}
2080
2081 /**
2082 * Get an image from a file or url, return it resized if the image exceeds the $maxImageWidth or $maxImageHeight directives.
2083 *
2084 * The return value is an array.
2085 * ['width'] is the width of the image.
2086 * ['height'] is the height of the image.
2087 * ['mime'] is the mime type of the image. Resized images are always in jpeg format.
2088 * ['image'] is the image data.
2089 * ['ext'] is the extension of the image file.
2090 *
2091 * @param string $source path or url to file.
 * @return array|bool The image array described above, or FALSE if the image could not be loaded.
2093 */
function getImage($source) {
    $mime = "application/octet-stream";
    $ext = "";

    $image = $this->getFileContents($source);
    if ($image === FALSE || strlen($image) == 0) {
        return FALSE;
    }

    // Decode once; the same GD image is reused for measuring and resizing
    // and is released on every exit path.
    $imageFile = imagecreatefromstring($image);
    if ($imageFile === false) {
        return FALSE;
    }

    $width = imagesx($imageFile);
    $height = imagesy($imageFile);
    if ($width <= 0 || $height <= 0) {
        imagedestroy($imageFile);
        return FALSE;
    }

    // Mime type detection, most reliable method first: exif, then the file
    // signature, then the extension found in the URL.
    if ($this->isExifInstalled) {
        $type = @exif_imagetype($source);
        if ($type !== FALSE) {
            $mime = image_type_to_mime_type($type);
        }
    }
    if ($mime === "application/octet-stream") {
        $mime = $this->image_file_type_from_binary($image);
    }
    if ($mime === "application/octet-stream") {
        $mime = $this->getMimeTypeFromUrl($source);
    }

    $ratio = 1;

    if ($this->isGdInstalled) {
        // Scale down so the image fits within both configured maximums.
        if ($width > $this->maxImageWidth) {
            $ratio = $this->maxImageWidth/$width;
        }
        if ($height*$ratio > $this->maxImageHeight) {
            $ratio = $this->maxImageHeight/$height;
        }

        if ($ratio < 1 || empty($mime) || ($this->isGifImagesEnabled !== FALSE && $mime == "image/gif")) {
            // GD expects integer sizes of at least one pixel.
            $targetWidth = max(1, (int)($width*$ratio));
            $targetHeight = max(1, (int)($height*$ratio));
            $image_p = imagecreatetruecolor($targetWidth, $targetHeight);

            if ($mime == "image/png") {
                // Keep the alpha channel when re-encoding PNG images.
                imagealphablending($image_p, false);
                imagesavealpha($image_p, true);
                imagealphablending($imageFile, true);

                imagecopyresampled($image_p, $imageFile, 0, 0, 0, 0, $targetWidth, $targetHeight, $width, $height);
                ob_start();
                imagepng($image_p, NULL, 9);
                $image = ob_get_contents();
                ob_end_clean();

                $ext = "png";
            } else {
                // Everything that is not PNG is re-encoded as JPEG.
                imagecopyresampled($image_p, $imageFile, 0, 0, 0, 0, $targetWidth, $targetHeight, $width, $height);
                ob_start();
                imagejpeg($image_p, NULL, 80);
                $image = ob_get_contents();
                ob_end_clean();

                $mime = "image/jpeg";
                $ext = "jpg";
            }
            imagedestroy($image_p);
        }
    }

    imagedestroy($imageFile);

    if ($ext === "") {
        static $mimeToExt = array (
            'image/jpeg' => 'jpg',
            'image/gif' => 'gif',
            'image/png' => 'png'
        );

        if (isset($mimeToExt[$mime])) {
            $ext = $mimeToExt[$mime];
        }
    }

    $rv = array();
    $rv['width'] = $width*$ratio;
    $rv['height'] = $height*$ratio;
    $rv['mime'] = $mime;
    $rv['image'] = $image;
    $rv['ext'] = $ext;

    return $rv;
}
2190
2191 /**
2192 * Get file contents, using curl if available, else file_get_contents
2193 *
2194 * @param string $source
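 * @param bool $toTempFile When TRUE and curl is used, download to a temp file and return its path.
 * @return string|bool The file data (or the temp file path), FALSE on failure.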
2196 */
function getFileContents($source, $toTempFile = FALSE) {
    // Returns the fetched data, or (curl path with $toTempFile only) the
    // path of a temp file holding it; FALSE on failure.
    $isExternal = preg_match('#^(http|ftp)s?://#i', $source) == 1;

    if ($isExternal && $this->isCurlInstalled) {
        $ch = curl_init();
        $outFile = NULL;
        $fp = NULL;

        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_URL, str_replace(" ","%20",$source));
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_BUFFERSIZE, 4096);

        if ($toTempFile) {
            $outFile = tempnam(sys_get_temp_dir(), "EPub_v" . EPub::VERSION . "_");
            $fp = ($outFile !== FALSE) ? fopen($outFile, "w+b") : FALSE;
            if ($fp === FALSE) {
                // No usable temp file: give up without leaving one behind.
                curl_close($ch);
                if ($outFile !== FALSE && is_file($outFile)) {
                    unlink($outFile);
                }
                return FALSE;
            }
            curl_setopt($ch, CURLOPT_FILE, $fp);
        }

        $res = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);

        if ($fp !== NULL) {
            fclose($fp);
        }

        if ($info['http_code'] == 200 && $res != false) {
            return $toTempFile ? $outFile : $res;
        }

        // Failed download: remove the temp file so it does not pile up in
        // the system temp directory.
        if ($outFile !== NULL && is_file($outFile)) {
            unlink($outFile);
        }
        return FALSE;
    }

    // Fallback without curl: local paths always, external URLs only when
    // file_get_contents is allowed to open them.
    if ($this->isFileGetContentsInstalled && (!$isExternal || $this->isFileGetContentsExtInstalled)) {
        @$data = file_get_contents($source);
        return $data;
    }
    return FALSE;
}
2245
2246 /**
2247 * get mime type from image data
2248 *
2249 * By fireweasel found on http://stackoverflow.com/questions/2207095/get-image-mimetype-from-resource-in-php-gd
2250 * @staticvar array $type
2251 * @param object $binary
2252 * @return string
2253 */
function image_file_type_from_binary($binary) {
    // Mime types keyed by the capture group that recognises their signature.
    static $mimeByGroup = array (
        1 => 'image/jpeg',
        2 => 'image/gif',
        3 => 'image/png',
        4 => 'image/x-windows-bmp',
        5 => 'image/tiff',
        6 => 'image/x-ilbm',
    );

    $groups = array();
    $found = preg_match(
        '/\A(?:(\xff\xd8\xff)|(GIF8[79]a)|(\x89PNG\x0d\x0a)|(BM)|(\x49\x49(?:\x2a\x00|\x00\x4a))|(FORM.{4}ILBM))/',
        $binary, $groups);

    if (!$found) {
        return 'application/octet-stream';
    }

    // preg_match drops trailing unmatched groups, so the index of the last
    // entry is the number of the alternative that matched.
    $matchedGroup = count($groups) - 1;
    return $mimeByGroup[$matchedGroup];
}
2271
2272 /**
2273 * @param string $source URL Source
2274 * @return string MimeType
2275 */
function getMimeTypeFromUrl($source) {
    // Ignore everything from the last "?" onwards.
    $path = $source;
    $queryPos = strrpos($path, "?");
    if ($queryPos !== FALSE) {
        $path = substr($path, 0, $queryPos);
    }

    // The extension is whatever follows the last "." of what remains.
    $dotPos = strrpos($path, ".");
    if ($dotPos === FALSE) {
        return "application/octet-stream";
    }

    $extension = strtolower(substr($path, $dotPos + 1));
    return $this->getMimeTypeFromExtension($extension);
}
2295
2296 /**
2297 * @param string $ext Extension
2298 * @return string MimeType
2299 */
function getMimeTypeFromExtension($ext) {
    // Known extensions grouped by mime type, checked in this order.
    static $extensionsByMime = array (
        'image/jpeg' => array("jpg", "jpe", "jpeg"),
        'image/gif' => array("gif"),
        'image/png' => array("png"),
        'image/x-windows-bmp' => array("bmp"),
        'image/tiff' => array("tif", "tiff", "cpt"),
        'image/x-ilbm' => array("lbm", "ilbm"),
    );

    foreach ($extensionsByMime as $mimeType => $extensions) {
        // Loose comparison on purpose: it mirrors the semantics of switch.
        if (in_array($ext, $extensions)) {
            return $mimeType;
        }
    }

    return "application/octet-stream";
}
2323
2324 /**
 * Encode html code to use html entities, safeguarding it from potential character encoding problems
2326 * This function is a bit different from the vanilla htmlentities function in that it does not encode html tags.
2327 *
2328 * The regexp is taken from the PHP Manual discussion, it was written by user "busbyjon".
2329 * http://www.php.net/manual/en/function.htmlentities.php#90111
2330 *
2331 * @param string $string string to encode.
2332 */
public function encodeHtml($string) {
    // Translate characters through the html_encoding_characters table; no
    // further entity processing is applied.
    $encoded = strtr($string, $this->html_encoding_characters);
    return $encoded;
}
2340
2341 /**
2342 * Helper function to create a DOM fragment with given markup.
2343 *
2344 * @author Adam Schmalhofer
2345 *
2346 * @param DOMDocument $dom
2347 * @param string $markup
2348 * @return DOMNode fragment in a node.
2349 */
protected function createDomFragment($dom, $markup) {
    // Create an empty fragment on the document and fill it from the markup.
    $fragment = $dom->createDocumentFragment();
    $fragment->appendXML($markup);

    return $fragment;
}
2355
2356 /**
2357 * Retrieve an array of file names currently added to the book.
2358 * $key is the filename used in the book
2359 * $value is the original filename, will be the same as $key for most entries
2360 *
2361 * @return array file list
2362 */
function getFileList() {
    // Plain accessor for the list of files added to the book.
    $files = $this->fileList;
    return $files;
}
2366
2367 /**
2368 * @deprecated Use Zip::getRelativePath($relPath) instead.
2369 */
function relPath($relPath) {
    // Deprecated entry point: calling it terminates the script with a message.
    exit("Function was deprecated, use Zip::getRelativePath(\$relPath); instead");
}
2373
2374 /**
2375 * Set default chapter target size.
2376 * Default is 250000 bytes, and minimum is 10240 bytes.
2377 *
2378 * @param int $size segment size in bytes
2379 * @return void
2380 */
function setSplitSize($size) {
    // Values below 10240 bytes are raised to that minimum; making the file
    // smaller than 10k is not a good idea.
    $this->splitDefaultSize = ($size < 10240) ? 10240 : (int)$size;
}
2387
2388 /**
2389 * Get the chapter target size.
2390 *
2391 * @return $size
2392 */
function getSplitSize() {
    // Plain accessor for the stored chapter target size.
    $currentSize = $this->splitDefaultSize;
    return $currentSize;
}
2396
2397 /**
2398 * Remove all non essential html tags and entities.
2399 *
2400 * @global type $htmlEntities
2401 * @param string $string
2402 * @return string with the stripped entities.
2403 */
function decodeHtmlEntities($string) {
    global $htmlEntities;

    // Markup pass, applied in order: <br> becomes a newline, closing </p>
    // and </div> become a blank line, every remaining tag is removed.
    $markupPatterns = array(
        '~\s*<br\s*/*\s*>\s*~i',
        '~\s*</(p|div)\s*>\s*~i',
        '~<[^>]*>~',
    );
    $markupReplacements = array("\n", "\n\n", '');
    $text = preg_replace($markupPatterns, $markupReplacements, $string);

    // Translate entities through the global lookup table.
    $text = strtr($text, $htmlEntities);

    // Escape every ampersand, then undo the double escaping this causes for
    // "&amp;" and for numeric character references.
    $text = str_replace(array('&', '&amp;amp;'), array('&amp;', '&amp;'), $text);
    $text = preg_replace('~&amp;(#x*[a-fA-F0-9]+;)~', '&\1', $text);

    // Finally escape any angle brackets that are left.
    return str_replace(array('<', '>'), array('&lt;', '&gt;'), $text);
}
2421
2422 /**
2423 * Simply remove all HTML tags, brute force and no finesse.
2424 *
2425 * @param string $string html
2426 * @return string
2427 */
function html2text($string) {
    // Anything between "<" and the next ">" is treated as a tag and removed.
    $withoutTags = preg_replace('~<[^>]*>~', '', $string);
    return $withoutTags;
}
2431
2432 /**
2433 * @return string
2434 */
function getLog() {
    // Delegates to the logger held by the book.
    $logOutput = $this->log->getLog();
    return $logOutput;
}
2438}
diff --git a/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
new file mode 100644
index 00000000..1d44f238
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php
@@ -0,0 +1,201 @@
1<?php
2/**
3 * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
4 * What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
5 * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
6 *
7 * @author A. Grandt <php@grandt.com>
8 * @copyright 2009-2014 A. Grandt
9 * @license GNU LGPL 2.1
10 * @link http://www.phpclasses.org/package/6115
11 * @link https://github.com/Grandt/PHPePub
12 * @version 3.20
13 */
class EPubChapterSplitter {
    const VERSION = 3.20;

    private $splitDefaultSize = 250000;
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Select the EPub version the generated parts are written for.
     *
     * @param string $bookVersion Version identifier; it is trimmed before being stored.
     *                            Any non-string value falls back to EPub::BOOK_VERSION_EPUB2.
     */
    function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * @param int $size segment size in bytes
     * @return void
     */
    function setSplitSize($size) {
        // Clamp the converted integer, not the raw argument: on PHP 8 a
        // non-numeric string compares as a string and would bypass the
        // minimum, leaving a split size of 0.
        // Making the file smaller than 10k is not a good idea.
        $this->splitDefaultSize = max(10240, (int)$size);
    }

    /**
     * Get the chapter target size.
     *
     * @return int target size in bytes
     */
    function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PHP PECL Regular Expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, the matching will be for lines in the HTML starting with the string given
     *
     * NOTE(review): the code locates "<html", the head element and the body
     * element in $chapter, so it presumably expects a complete (X)HTML
     * document - confirm against callers.
     *
     * @param String $chapter XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
     * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts. The input is returned unchanged as the single
     *               element when it already fits the split size (size mode only) or when
     *               the parsed document has no body content to split.
     */
    function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $chapterData = array();
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            // Plain text: match it right after the opening tag of a node.
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        @$xmlDoc->loadHTML($chapter);

        $head = $xmlDoc->getElementsByTagName("head");
        $body = $xmlDoc->getElementsByTagName("body");

        // Nothing to walk: without this guard the loop below would call
        // methods on null.
        $bodyNode = $body->item(0);
        if ($bodyNode === null || $bodyNode->firstChild === null) {
            return array($chapter);
        }

        // Skeleton for every part: everything up to and including the
        // original <html ...> tag, closed again, with an XML declaration.
        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = stripos($chapter, ">", $htmlPos);
        $newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
        if (strpos(trim($newXML), "<?xml ") === FALSE) {
            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
        }
        $headerLength = strlen($newXML);

        $files = array();
        $chapterNames = array();
        $domDepth = 0;
        $domPath = array();        // original ancestors we descended into
        $domClonedPath = array();  // shallow clones of those ancestors

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $curParent = $curFile;
        $curSize = 0;

        $bodyLen = strlen($xmlDoc->saveXML($bodyNode));
        $headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;

        $partSize = $this->splitDefaultSize - $headLen;

        if ($bodyLen > $partSize) {
            // Spread the body evenly over the number of parts needed.
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $node = $bodyNode->firstChild;

        do {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                // Too big for one part: step into the node and remember the
                // way back.
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $domDepth++;

                $node = $node->firstChild;
            }

            $node2 = $node->nextSibling;

            if ($node != null && $node->nodeName != "#text") {
                $doSplit = false;
                if ($splitOnSearchString) {
                    $doSplit = preg_match($searchString, $nodeData) == 1;
                    if ($doSplit) {
                        $chapterNames[] = trim($nodeData);
                    }
                }

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;
                    if ($domDepth > 0) {
                        // Re-create the chain of open ancestors in the new
                        // part. foreach replaces each(), which no longer
                        // exists in PHP 8.
                        foreach ($domClonedPath as $clonedAncestor) {
                            $newParent = $clonedAncestor->cloneNode(false);
                            $curParent->appendChild($newParent);
                            $curParent = $newParent;
                        }
                    }
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }
                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $node2;
            while ($node == null && $domDepth > 0) {
                // End of a level: climb back up and continue with the
                // ancestor's next sibling.
                $domDepth--;
                $node = end($domPath)->nextSibling;
                array_pop($domPath);
                array_pop($domClonedPath);
                $curParent = $curParent->parentNode;
            }
        } while ($node != null);

        // xmlEncoding can be null; the DOMDocument constructor expects a string.
        $encoding = (string)$xmlDoc->xmlEncoding;

        $xml = new DOMDocument('1.0', $encoding);
        $xml->lookupPrefix("http://www.w3.org/1999/xhtml");
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        for ($idx = 0; $idx < count($files); $idx++) {
            $xml2Doc = new DOMDocument('1.0', $encoding);
            $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);
            $html->appendChild($xml2Doc->importNode($head->item(0), true));
            $body = $xml2Doc->createElement("body");
            $html->appendChild($body);
            $body->appendChild($xml2Doc->importNode($files[$idx], true));

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());

            $doc = $xml->saveXML();

            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
            }

            $chapterData[$splitOnSearchString ? $chapterNames[$idx] : $idx] = $doc;
        }

        return $chapterData;
    }
}
201?>
diff --git a/inc/3rdparty/libraries/PHPePub/Logger.php b/inc/3rdparty/libraries/PHPePub/Logger.php
new file mode 100644
index 00000000..314019cb
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/Logger.php
@@ -0,0 +1,92 @@
1<?php
2/**
3 * Simple log line aggregator.
4 *
5 * @author A. Grandt <php@grandt.com>
6 * @copyright 2012-2013 A. Grandt
7 * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else.
8 * @version 1.00
9 */
class Logger {
    const VERSION = 1.00;

    private $log = "";
    private $tStart;
    private $tLast;
    private $name = NULL;
    private $isLogging = FALSE;
    private $isDebugging = FALSE;

    /**
     * Create the logger and, when logging is enabled, write the log header.
     *
     * @param string|null $name      Label put in front of every line (followed by " : "); NULL for none.
     * @param bool        $isLogging Collect log lines? When FALSE every logging call is a no-op.
     */
    function __construct($name = NULL, $isLogging = FALSE) {
        $this->name = ($name === NULL) ? "" : $name . " : ";
        $this->isLogging = $isLogging;
        $this->start();
    }

    /**
     * Drop the collected log text when the object goes away.
     *
     * @return void
     */
    function __destruct() {
        unset($this->log);
    }

    /**
     * Reset the timers and begin a new log with a header and a "Start" line.
     * Does nothing unless logging is enabled.
     */
    function start() {
        if (!$this->isLogging) {
            return;
        }

        $now = gettimeofday();
        $this->tStart = $now;
        $this->tLast = $now;

        $startedAt = gmdate("D, d M Y H:i:s T", $now['sec']);
        $this->log = "<h1>Log: " . $this->name . "</h1>\n<pre>Started: " . $startedAt . "\n &#916; Start ; &#916; Last ;";
        $this->logLine("Start");
    }

    /**
     * Log one Yes/No line per optional PHP facility (curl, gd, exif,
     * file_get_contents and file_get_contents with allow_url_fopen).
     */
    function dumpInstalledModules() {
        if (!$this->isLogging) {
            return;
        }

        $hasFileGetContents = function_exists('file_get_contents');
        $checks = array(
            "isCurlInstalled...............: " => extension_loaded('curl') && function_exists('curl_version'),
            "isGdInstalled.................: " => extension_loaded('gd') && function_exists('gd_info'),
            "isExifInstalled...............: " => extension_loaded('exif') && function_exists('exif_imagetype'),
            "isFileGetContentsInstalled....: " => $hasFileGetContents,
            "isFileGetContentsExtInstalled.: " => $hasFileGetContents && ini_get('allow_url_fopen'),
        );

        foreach ($checks as $label => $isAvailable) {
            $this->logLine($label . ($isAvailable ? "Yes" : "No"));
        }
    }

    /**
     * Append a line to the log, prefixed with the seconds elapsed since the
     * start and since the previous line. Echoes the line as well when the
     * debugging flag is set. Does nothing unless logging is enabled.
     *
     * @param string $line text to log
     */
    function logLine($line) {
        if (!$this->isLogging) {
            return;
        }

        $now = gettimeofday();
        $nowSeconds = $this->toSeconds($now);
        $sinceStart = $nowSeconds - $this->toSeconds($this->tStart);
        $sinceLast = $nowSeconds - $this->toSeconds($this->tLast);

        $entry = sprintf("\n+%08.04f; +%08.04f; ", $sinceStart, $sinceLast) . $this->name . $line;
        $this->log .= $entry;
        $this->tLast = $now;

        if ($this->isDebugging) {
            echo "<pre>" . $entry . "\n</pre>\n";
        }
    }

    /**
     * @return string everything logged so far
     */
    function getLog() {
        return $this->log;
    }

    /**
     * Turn a gettimeofday() array into seconds, with the microsecond part
     * truncated to four decimals.
     *
     * @param array $timeOfDay array with 'sec' and 'usec' keys
     * @return float
     */
    private function toSeconds($timeOfDay) {
        return $timeOfDay['sec'] + (((int)($timeOfDay['usec']/100))/10000);
    }
}
92?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/PHPePub/Zip.php b/inc/3rdparty/libraries/PHPePub/Zip.php
new file mode 100644
index 00000000..01e03566
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/Zip.php
@@ -0,0 +1,818 @@
1<?php
2/**
3 * Class to create and manage a Zip file.
4 *
5 * Initially inspired by CreateZipFile by Rochak Chauhan www.rochakchauhan.com (http://www.phpclasses.org/browse/package/2322.html)
6 * and
7 * http://www.pkware.com/documents/casestudies/APPNOTE.TXT Zip file specification.
8 *
9 * License: GNU LGPL, Attribution required for commercial implementations, requested for everything else.
10 *
11 * @author A. Grandt <php@grandt.com>
12 * @copyright 2009-2014 A. Grandt
13 * @license GNU LGPL 2.1
14 * @link http://www.phpclasses.org/package/6110
15 * @link https://github.com/Grandt/PHPZip
16 * @version 1.60
17 */
18class Zip {
19 const VERSION = 1.60;
20
21 const ZIP_LOCAL_FILE_HEADER = "\x50\x4b\x03\x04"; // Local file header signature
22 const ZIP_CENTRAL_FILE_HEADER = "\x50\x4b\x01\x02"; // Central file header signature
23 const ZIP_END_OF_CENTRAL_DIRECTORY = "\x50\x4b\x05\x06\x00\x00\x00\x00"; //end of Central directory record
24
25 const EXT_FILE_ATTR_DIR = 010173200020; // Permission 755 drwxr-xr-x = (((S_IFDIR | 0755) << 16) | S_DOS_D);
26 const EXT_FILE_ATTR_FILE = 020151000040; // Permission 644 -rw-r--r-- = (((S_IFREG | 0644) << 16) | S_DOS_A);
27
28 const ATTR_VERSION_TO_EXTRACT = "\x14\x00"; // Version needed to extract
29 const ATTR_MADE_BY_VERSION = "\x1E\x03"; // Made By Version
30
31 // Unix file types
32 const S_IFIFO = 0010000; // named pipe (fifo)
33 const S_IFCHR = 0020000; // character special
34 const S_IFDIR = 0040000; // directory
35 const S_IFBLK = 0060000; // block special
36 const S_IFREG = 0100000; // regular
37 const S_IFLNK = 0120000; // symbolic link
38 const S_IFSOCK = 0140000; // socket
39
40 // setuid/setgid/sticky bits, the same as for chmod:
41
42 const S_ISUID = 0004000; // set user id on execution
43 const S_ISGID = 0002000; // set group id on execution
44 const S_ISTXT = 0001000; // sticky bit
45
46 // And of course, the other 12 bits are for the permissions, the same as for chmod:
47 // When adding these up, you can also just write the permissions as a single octal number
48 // ie. 0755. The leading 0 specifies octal notation.
49 const S_IRWXU = 0000700; // RWX mask for owner
50 const S_IRUSR = 0000400; // R for owner
51 const S_IWUSR = 0000200; // W for owner
52 const S_IXUSR = 0000100; // X for owner
53 const S_IRWXG = 0000070; // RWX mask for group
54 const S_IRGRP = 0000040; // R for group
55 const S_IWGRP = 0000020; // W for group
56 const S_IXGRP = 0000010; // X for group
57 const S_IRWXO = 0000007; // RWX mask for other
58 const S_IROTH = 0000004; // R for other
59 const S_IWOTH = 0000002; // W for other
60 const S_IXOTH = 0000001; // X for other
61 const S_ISVTX = 0001000; // save swapped text even after use
62
63 // Filetype, sticky and permissions are added up, and shifted 16 bits left BEFORE adding the DOS flags.
64
65 // DOS file type flags, we really only use the S_DOS_D flag.
66
67 const S_DOS_A = 0000040; // DOS flag for Archive
68 const S_DOS_D = 0000020; // DOS flag for Directory
69 const S_DOS_V = 0000010; // DOS flag for Volume
70 const S_DOS_S = 0000004; // DOS flag for System
71 const S_DOS_H = 0000002; // DOS flag for Hidden
72 const S_DOS_R = 0000001; // DOS flag for Read Only
73
74 private $zipMemoryThreshold = 1048576; // Autocreate tempfile if the zip data exceeds 1048576 bytes (1 MB)
75
76 private $zipData = NULL;
77 private $zipFile = NULL;
78 private $zipComment = NULL;
79 private $cdRec = array(); // central directory
80 private $offset = 0;
81 private $isFinalized = FALSE;
82 private $addExtraField = TRUE;
83
84 private $streamChunkSize = 65536;
85 private $streamFilePath = NULL;
86 private $streamTimestamp = NULL;
87 private $streamFileComment = NULL;
88 private $streamFile = NULL;
89 private $streamData = NULL;
90 private $streamFileLength = 0;
91 private $streamExtFileAttr = null;
92
/**
 * Constructor. Chooses where the archive data is collected: in a temporary
 * file, or in a string held in memory.
 *
 * @param boolean $useZipFile Write temp zip data to tempFile? Default FALSE
 */
function __construct($useZipFile = FALSE) {
    if (!$useZipFile) {
        $this->zipData = "";
        return;
    }

    $this->zipFile = tmpfile();
}
105
function __destruct() {
    // Close the backing file, if one is open, and let go of the in-memory data.
    $handle = $this->zipFile;
    if (is_resource($handle)) {
        fclose($handle);
    }

    $this->zipData = NULL;
}
112
/**
 * Switch the extra fields on the Zip directory records on or off.
 * They carry Unix time codes needed for compatibility with the default Mac
 * zip archive tool, do no harm elsewhere, and add 26 bytes per file.
 *
 * @param bool $setExtraField Exactly TRUE (default) enables the extra fields, anything else disables them.
 */
function setExtraField($setExtraField = TRUE) {
    $enabled = FALSE;
    if ($setExtraField === TRUE) {
        $enabled = TRUE;
    }

    $this->addExtraField = $enabled;
}
122
/**
 * Set Zip archive comment.
 *
 * @param string $newComment New comment. NULL to clear.
 * @return bool FALSE (and nothing is changed) once the archive is finalized, TRUE otherwise.
 */
public function setComment($newComment = NULL) {
    $stillOpen = !$this->isFinalized;
    if ($stillOpen) {
        $this->zipComment = $newComment;
    }

    return $stillOpen;
}
137
/**
 * Set zip file to write zip data to.
 * This will cause all present and future data written to this class to be written to this file.
 * This can be used at any time, even after the Zip Archive have been finalized. Any previous file will be closed.
 * Warning: If the given file already exists, it will be overwritten.
 *
 * @param string $fileName
 * @return bool TRUE on success; FALSE when the file could not be created, in which
 *              case the archive keeps using its current storage.
 */
public function setZipFile($fileName) {
    if (is_file($fileName)) {
        unlink($fileName);
    }

    $fd = fopen($fileName, "x+b");
    if ($fd === FALSE) {
        // Previously the failure was ignored, the writes below were attempted
        // on FALSE and TRUE was returned regardless.
        return FALSE;
    }

    if (is_resource($this->zipFile)) {
        // Copy what has been written so far from the old file to the new one.
        rewind($this->zipFile);
        while (!feof($this->zipFile)) {
            fwrite($fd, fread($this->zipFile, $this->streamChunkSize));
        }

        fclose($this->zipFile);
    } else {
        fwrite($fd, (string)$this->zipData);
        $this->zipData = NULL;
    }
    $this->zipFile = $fd;

    return TRUE;
}
167
/**
 * Add an empty directory entry to the zip archive.
 * Basically this is only used if an empty directory is added.
 *
 * Backslashes in the path are turned into forward slashes and trailing
 * slashes are removed before the entry is written with a single "/" appended.
 *
 * @param string $directoryPath Directory Path and name to be added to the archive.
 * @param int    $timestamp     (Optional) Timestamp for the added directory, if omitted or set to 0, the current time will be used.
 * @param string $fileComment   (Optional) Comment to be added to the archive for this directory. To use fileComment, timestamp must be given.
 * @param int    $extFileAttr   (Optional) The external file reference, use generateExtAttr to generate this.
 * @return bool FALSE if the archive is finalized or the normalized path is empty, TRUE otherwise.
 */
public function addDirectory($directoryPath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_DIR) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $normalized = rtrim(str_replace("\\", "/", $directoryPath), "/");
    if (strlen($normalized) == 0) {
        return FALSE;
    }

    $this->buildZipEntry($normalized . '/', $fileComment, "\x00\x00", "\x00\x00", $timestamp, "\x00\x00\x00\x00", 0, 0, $extFileAttr);

    return TRUE;
}
191
/**
 * Add a file to the archive at the specified location and file name.
 *
 * @param string $data        File data. A stream resource is also accepted; it is passed on to
 *                            addLargeFile() and $compress is not consulted in that case.
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used.
 * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given.
 * @param bool   $compress    (Optional) Compress file, if set to FALSE the file will only be stored. Default TRUE.
 * @param int    $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this.
 * @return bool $success FALSE if the archive is finalized; for stream input the result of addLargeFile().
 */
public function addFile($data, $filePath, $timestamp = 0, $fileComment = NULL, $compress = TRUE, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    if (is_resource($data) && get_resource_type($data) == "stream") {
        // Report what addLargeFile() actually did; this branch used to
        // return FALSE even though the stream had been added.
        return $this->addLargeFile($data, $filePath, $timestamp, $fileComment, $extFileAttr);
    }

    $gzData = "";
    $gzType = "\x08\x00"; // Compression type 8 = deflate
    $gpFlags = "\x00\x00"; // General Purpose bit flags for compression type 8 it is: 0=Normal, 1=Maximum, 2=Fast, 3=super fast compression.
    $dataLength = strlen($data);
    $fileCRC32 = pack("V", crc32($data));

    if ($compress) {
        $gzTmp = gzcompress($data);
        // gzcompress wraps the deflate stream in a 2 byte header and a
        // 4 byte trailing checksum, neither of which belongs in a zip entry.
        $gzData = substr(substr($gzTmp, 0, strlen($gzTmp) - 4), 2);
        $gzLength = strlen($gzData);
    } else {
        $gzLength = $dataLength;
    }

    if ($gzLength >= $dataLength) {
        // Compression did not help (or was not requested): store the data as is.
        $gzLength = $dataLength;
        $gzData = $data;
        $gzType = "\x00\x00"; // Compression type 0 = stored
        $gpFlags = "\x00\x00"; // Compression type 0 = stored
    }

    // Move the in-memory archive out via zipflush() before it grows past the threshold.
    if (!is_resource($this->zipFile) && ($this->offset + $gzLength) > $this->zipMemoryThreshold) {
        $this->zipflush();
    }

    $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr);

    $this->zipwrite($gzData);

    return TRUE;
}
245
/**
 * Add the content to a directory.
 *
 * @author Adam Schmalhofer <Adam.Schmalhofer@gmx.de>
 * @author A. Grandt
 *
 * @param string $realPath       Path on the file system.
 * @param string $zipPath        Filepath and name to be used in the archive.
 * @param bool   $recursive      Add content recursively, default is TRUE. When FALSE, sub
 *                               directories are added as empty directory entries only.
 * @param bool   $followSymlinks Follow and add symbolic links, if they are accessible, default is TRUE.
 * @param array &$addedFiles     Reference to the added files, this is used to prevent duplicates, default is an empty array.
 *                               If you start the function by passing an array, the array will be populated with the realPath
 *                               and zipPath key/value pairs added to the archive by the function.
 * @param bool   $overrideFilePermissions Force the use of the file/dir permissions set in the $extDirAttr
 *                               and $extFileAttr parameters.
 * @param int    $extDirAttr     Permissions for directories.
 * @param int    $extFileAttr    Permissions for files.
 */
public function addDirectoryContent($realPath, $zipPath, $recursive = TRUE, $followSymlinks = TRUE, &$addedFiles = array(),
            $overrideFilePermissions = FALSE, $extDirAttr = self::EXT_FILE_ATTR_DIR, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if (file_exists($realPath) && !isset($addedFiles[realpath($realPath)])) {
        if (is_dir($realPath)) {
            if ($overrideFilePermissions) {
                $this->addDirectory($zipPath, 0, null, $extDirAttr);
            } else {
                $this->addDirectory($zipPath, 0, null, self::getFileExtAttr($realPath));
            }
        }

        // Remember the path so it is not processed a second time.
        $addedFiles[realpath($realPath)] = $zipPath;

        $iter = new DirectoryIterator($realPath);
        foreach ($iter as $file) {
            if ($file->isDot()) {
                continue;
            }
            $newRealPath = $file->getPathname();
            $newZipPath = self::pathJoin($zipPath, $file->getFilename());

            if (file_exists($newRealPath) && ($followSymlinks === TRUE || !is_link($newRealPath))) {
                if ($file->isFile()) {
                    $addedFiles[realpath($newRealPath)] = $newZipPath;
                    if ($overrideFilePermissions) {
                        $this->addLargeFile($newRealPath, $newZipPath, 0, null, $extFileAttr);
                    } else {
                        $this->addLargeFile($newRealPath, $newZipPath, 0, null, self::getFileExtAttr($newRealPath));
                    }
                } else if ($recursive === TRUE) {
                    $this->addDirectoryContent($newRealPath, $newZipPath, $recursive, $followSymlinks, $addedFiles, $overrideFilePermissions, $extDirAttr, $extFileAttr);
                } else {
                    // Not recursing: record the sub directory itself. This
                    // used to pass $zipPath, which re-added the parent
                    // directory instead of the entry being visited.
                    if ($overrideFilePermissions) {
                        $this->addDirectory($newZipPath, 0, null, $extDirAttr);
                    } else {
                        $this->addDirectory($newZipPath, 0, null, self::getFileExtAttr($newRealPath));
                    }
                }
            }
        }
    }
}
306
/**
 * Add a file to the archive at the specified location and file name.
 *
 * @param string $dataFile    File name/path, or an open stream resource to read the data from.
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used.
 * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given.
 * @param int    $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this.
 * @return bool $success FALSE if the archive is finalized or $dataFile is neither an existing file nor a stream.
 */
public function addLargeFile($dataFile, $filePath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    if (is_string($dataFile) && is_file($dataFile)) {
        $this->processFile($dataFile, $filePath, $timestamp, $fileComment, $extFileAttr);
        return TRUE;
    }

    if (is_resource($dataFile) && get_resource_type($dataFile) == "stream") {
        $this->openStream($filePath, $timestamp, $fileComment, $extFileAttr);

        // Feed the stream through in chunks.
        while (!feof($dataFile)) {
            $this->addStreamData(fread($dataFile, $this->streamChunkSize));
        }
        // closeStream() takes no arguments; the one formerly passed was ignored.
        $this->closeStream();
        return TRUE;
    }

    // Nothing usable was given, so nothing was added. This case used to
    // report success.
    return FALSE;
}
335
/**
 * Create a stream to be used for large entries.
 * The data added through addStreamData() is collected in a temporary file;
 * a stream that is still open is closed first.
 *
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used.
 * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given.
 * @param int    $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this.
 * @return bool $success FALSE if the archive is finalized or the temporary file could not be opened.
 */
public function openStream($filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if (!function_exists('sys_get_temp_dir')) {
        die ("ERROR: Zip " . self::VERSION . " requires PHP version 5.2.1 or above if large files are used.");
    }

    if ($this->isFinalized) {
        return FALSE;
    }

    $this->zipflush();

    // streamFilePath is NULL while no stream is open; cast it because
    // passing NULL to strlen() is deprecated as of PHP 8.1.
    if (strlen((string)$this->streamFilePath) > 0) {
        $this->closeStream();
    }

    $tempFile = tempnam(sys_get_temp_dir(), 'Zip');
    $tempHandle = ($tempFile === FALSE) ? FALSE : fopen($tempFile, "wb");
    if ($tempHandle === FALSE) {
        // Without a writable temp file there is no stream; do not leave a
        // half-initialised one behind for addStreamData() to write to.
        if (is_string($tempFile) && is_file($tempFile)) {
            unlink($tempFile);
        }
        return FALSE;
    }

    $this->streamFile = $tempFile;
    $this->streamData = $tempHandle;
    $this->streamFilePath = $filePath;
    $this->streamTimestamp = $timestamp;
    $this->streamFileComment = $fileComment;
    $this->streamFileLength = 0;
    $this->streamExtFileAttr = $extFileAttr;

    return TRUE;
}
370
/**
 * Add data to the open stream.
 *
 * @param string $data
 * @return mixed length in bytes added or FALSE if the archive is finalized or there are no open stream.
 *               The script is terminated if fewer bytes than given could be written.
 */
public function addStreamData($data) {
    // streamFilePath is NULL while no stream is open; cast it because
    // passing NULL to strlen() is deprecated as of PHP 8.1.
    if ($this->isFinalized || strlen((string)$this->streamFilePath) == 0) {
        return FALSE;
    }

    $expected = strlen($data);
    $length = fwrite($this->streamData, $data, $expected);
    if ($length != $expected) {
        die ("<p>Length mismatch</p>\n");
    }
    $this->streamFileLength += $length;

    return $length;
}
390
/**
 * Close the current stream.
 * The collected temporary file is added to the archive as one entry and
 * then deleted, and the stream state is reset.
 *
 * @return bool $success FALSE if the archive is finalized or no stream is open.
 */
public function closeStream() {
    // streamFilePath is NULL while no stream is open; cast it because
    // passing NULL to strlen() is deprecated as of PHP 8.1.
    if ($this->isFinalized || strlen((string)$this->streamFilePath) == 0) {
        return FALSE;
    }

    fflush($this->streamData);
    fclose($this->streamData);

    $this->processFile($this->streamFile, $this->streamFilePath, $this->streamTimestamp, $this->streamFileComment, $this->streamExtFileAttr);

    $this->streamData = null;
    $this->streamFilePath = null;
    $this->streamTimestamp = null;
    $this->streamFileComment = null;
    $this->streamFileLength = 0;
    $this->streamExtFileAttr = null;

    // Windows is a little slow at times, so a millisecond later, we can unlink this.
    unlink($this->streamFile);

    $this->streamFile = null;

    return TRUE;
}
420
/**
 * Compress a file from disk and append it to the archive as one entry.
 *
 * The file is written into a throw-away, single-entry archive with
 * ZipArchive; the header fields and the compressed payload are then copied
 * from that temporary archive into this one.
 *
 * @param string $dataFile    Path of the file to add.
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the entry, passed on to buildZipEntry().
 * @param string $fileComment (Optional) Comment for the entry.
 * @param int    $extFileAttr (Optional) The external file attributes for the entry.
 * @return bool|null FALSE if the archive is finalized or the temporary archive could not
 *                   be created or read; no value otherwise.
 */
private function processFile($dataFile, $filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $tempzip = tempnam(sys_get_temp_dir(), 'ZipStream');
    if ($tempzip === FALSE) {
        return FALSE;
    }

    $zip = new ZipArchive;
    // tempnam() leaves an empty file behind. Opening an empty file as an
    // archive without flags is deprecated, so ask explicitly for a fresh
    // archive. A failure used to be ignored, after which header fields were
    // read from an empty file.
    if ($zip->open($tempzip, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== TRUE) {
        unlink($tempzip);
        return FALSE;
    }
    $zip->addFile($dataFile, 'file');
    $zip->close();

    $file_handle = fopen($tempzip, "rb");
    if ($file_handle === FALSE) {
        unlink($tempzip);
        return FALSE;
    }
    $stats = fstat($file_handle);
    // NOTE(review): 72 presumably is the central directory record for the
    // 4 character name plus the end of central directory record - confirm.
    $eof = $stats['size']-72;

    // Pick the values for our own entry out of the temporary archive's
    // local file header, starting at the general purpose flags.
    fseek($file_handle, 6);

    $gpFlags = fread($file_handle, 2);
    $gzType = fread($file_handle, 2);
    fread($file_handle, 4); // skipped; the entry gets its own timestamp
    $fileCRC32 = fread($file_handle, 4);
    $v = unpack("Vval", fread($file_handle, 4));
    $gzLength = $v['val'];
    $v = unpack("Vval", fread($file_handle, 4));
    $dataLength = $v['val'];

    $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr);

    // NOTE(review): 34 presumably is the 30 byte local header plus the
    // 4 character name 'file', i.e. the start of the payload - confirm.
    fseek($file_handle, 34);
    $pos = 34;

    // Copy the payload in chunks, stopping at $eof.
    while (!feof($file_handle) && $pos < $eof) {
        $datalen = $this->streamChunkSize;
        if ($pos + $this->streamChunkSize > $eof) {
            $datalen = $eof-$pos;
        }
        $data = fread($file_handle, $datalen);
        $pos += $datalen;

        $this->zipwrite($data);
    }

    fclose($file_handle);

    unlink($tempzip);
}
469
/**
 * Close the archive.
 * A closed archive can no longer have new files added to it.
 * A stream that is still open is closed first; then the central directory
 * and the end of central directory record, including the archive comment,
 * are written.
 *
 * @return bool $success FALSE if the archive was finalized already.
 */
public function finalize() {
    if ($this->isFinalized) {
        return FALSE;
    }

    // streamFilePath is NULL while no stream is open; cast it because
    // passing NULL to strlen() is deprecated as of PHP 8.1.
    if (strlen((string)$this->streamFilePath) > 0) {
        $this->closeStream();
    }
    $cd = implode("", $this->cdRec);

    // The number of entries is written twice in the end record.
    $cdRecSize = pack("v", count($this->cdRec));
    $cdRec = $cd . self::ZIP_END_OF_CENTRAL_DIRECTORY
        . $cdRecSize . $cdRecSize
        . pack("VV", strlen($cd), $this->offset);
    if (!empty($this->zipComment)) {
        $cdRec .= pack("v", strlen($this->zipComment)) . $this->zipComment;
    } else {
        $cdRec .= "\x00\x00";
    }

    $this->zipwrite($cdRec);

    $this->isFinalized = TRUE;
    $this->cdRec = NULL;

    return TRUE;
}
502
/**
 * Get the handle resource for the archive zip file.
 * If the zip hasn't been finalized yet, this will cause it to become finalized.
 * zipflush() is called before the handle is rewound and returned.
 *
 * @return resource zip file handle, positioned at the start
 */
public function getZipFile() {
    $this->isFinalized || $this->finalize();

    $this->zipflush();

    $handle = $this->zipFile;
    rewind($handle);

    return $handle;
}
520
/**
 * Get the zip file contents
 * If the zip hasn't been finalized yet, this will cause it to become finalized
 *
 * @return string zip data, taken from the backing file when there is one, otherwise from memory
 */
public function getZipData() {
    if (!$this->isFinalized) {
        $this->finalize();
    }

    if (is_resource($this->zipFile)) {
        rewind($this->zipFile);
        $info = fstat($this->zipFile);

        return fread($this->zipFile, $info['size']);
    }

    return $this->zipData;
}
539
/**
 * Send the archive as a zip download
 *
 * The script is terminated with an error message if headers have been sent
 * already or if the output buffer is not empty.
 *
 * @param String $fileName The name of the Zip archive, in ISO-8859-1 (or ASCII) encoding, ie. "archive.zip". Optional, defaults to NULL, which means that no ISO-8859-1 encoded file name will be specified.
 * @param String $contentType Content mime type. Optional, defaults to "application/zip".
 * @param String $utf8FileName The name of the Zip archive, in UTF-8 encoding. Optional, defaults to NULL, which means that no UTF-8 encoded file name will be specified.
 * @param bool $inline Use Content-Disposition with "inline" instead of "attachment". Optional, defaults to FALSE.
 * @return bool $success
 */
function sendZip($fileName = null, $contentType = "application/zip", $utf8FileName = null, $inline = false) {
    if (!$this->isFinalized) {
        $this->finalize();
    }

    $headerFile = null;
    $headerLine = null;
    if (headers_sent($headerFile, $headerLine)) {
        die("<p><strong>Error:</strong> Unable to send file $fileName. HTML Headers have already been sent from <strong>$headerFile</strong> in line <strong>$headerLine</strong></p>");
    }

    $buffered = ob_get_contents();
    if (!($buffered === FALSE || $buffered == '')) {
        die("\n<p><strong>Error:</strong> Unable to send file <strong>$fileName</strong>. Output buffer contains the following text (typically warnings or errors):<br>" . htmlentities($buffered) . "</p>");
    }

    // Output compression would make the Content-Length sent below wrong.
    if (ini_get('zlib.output_compression')) {
        ini_set('zlib.output_compression', 'Off');
    }

    header("Pragma: public");
    header("Last-Modified: " . gmdate("D, d M Y H:i:s T"));
    header("Expires: 0");
    header("Accept-Ranges: bytes");
    header("Connection: close");
    header("Content-Type: " . $contentType);

    // The disposition type for downloads is "attachment"; the "attached"
    // sent previously is not a defined type.
    $cd = "Content-Disposition: " . ($inline ? "inline" : "attachment");
    if ($fileName) {
        $cd .= '; filename="' . $fileName . '"';
    }
    if ($utf8FileName) {
        $cd .= "; filename*=UTF-8''" . rawurlencode($utf8FileName);
    }
    header($cd);
    header("Content-Length: ". $this->getArchiveSize());

    if (!is_resource($this->zipFile)) {
        echo $this->zipData;
    } else {
        rewind($this->zipFile);

        while (!feof($this->zipFile)) {
            echo fread($this->zipFile, $this->streamChunkSize);
        }
    }

    return TRUE;
}
597
/**
 * Return the current size of the archive
 *
 * @return int Size of the archive in bytes: the size of the backing file when there is one,
 *             otherwise the length of the data held in memory.
 */
public function getArchiveSize() {
    if (is_resource($this->zipFile)) {
        $info = fstat($this->zipFile);

        return $info['size'];
    }

    return strlen($this->zipData);
}
611
/**
 * Calculate the packed DOS date and time used in the zip entries.
 * The timestamp is broken down in UTC; the default timezone is switched for
 * the duration of the call and restored afterwards.
 *
 * @param int $timestamp Unix timestamp, 0 for the current time.
 * @return string 4 bytes (pack "V"): DOS date in the upper 16 bits, DOS time in the lower 16 bits;
 *                four zero bytes for years before 1980.
 */
private function getDosTime($timestamp = 0) {
    $unixTime = (int)$timestamp;

    $previousZone = @date_default_timezone_get();
    date_default_timezone_set('UTC');
    if ($unixTime == 0) {
        $parts = getdate();
    } else {
        $parts = getdate($unixTime);
    }
    date_default_timezone_set($previousZone);

    if ($parts["year"] < 1980) {
        return "\x00\x00\x00\x00";
    }

    $dosDate = $parts["mday"] + ($parts["mon"] << 5) + (($parts["year"]-1980) << 9);
    // Seconds are stored with a resolution of two seconds.
    $dosClock = ($parts["seconds"] >> 1) + ($parts["minutes"] << 5) + ($parts["hours"] << 11);

    return pack("V", ($dosDate << 16) | $dosClock);
}
630
631 /**
632 * Build the Zip file structures
633 *
634 * @param string $filePath
635 * @param string $fileComment
636 * @param string $gpFlags
637 * @param string $gzType
638 * @param int $timestamp
639 * @param string $fileCRC32
640 * @param int $gzLength
641 * @param int $dataLength
642 * @param int $extFileAttr Use self::EXT_FILE_ATTR_FILE for files, self::EXT_FILE_ATTR_DIR for Directories.
643 */
private function buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr) {
    // Zip entries always use forward slashes.
    $filePath = str_replace("\\", "/", $filePath);

    $hasComment = !empty($fileComment);
    $commentLength = $hasComment ? strlen($fileComment) : 0;

    // A zero (or non-numeric) timestamp means "now".
    $timestamp = (int)$timestamp;
    if ($timestamp == 0) {
        $timestamp = time();
    }

    $dosTime = $this->getDosTime($timestamp);
    $unixTime = pack("V", $timestamp);

    // Fixed 15-byte extra-field record starting with the ASCII tag "ux".
    $unixExtra = "\x75\x78\x0B\x00\x01\x04\xE8\x03\x00\x00\x04\x00\x00\x00\x00";

    // The general purpose flags must be exactly two bytes; otherwise start from zero.
    if (!isset($gpFlags) || strlen($gpFlags) != 2) {
        $gpFlags = "\x00\x00";
    }

    // If the name or the comment is UTF-8 that is not plain ASCII, set bit 11
    // in the general purpose flags.
    $nameIsUTF8 = mb_check_encoding($filePath, "UTF-8") && !mb_check_encoding($filePath, "ASCII");
    $commentIsUTF8 = $hasComment && mb_check_encoding($fileComment, "UTF-8") && !mb_check_encoding($fileComment, "ASCII");
    if ($nameIsUTF8 || $commentIsUTF8) {
        $unpacked = unpack("vflags", $gpFlags);
        $bits = isset($unpacked['flags']) ? $unpacked['flags'] : 0;
        $gpFlags = pack("v", $bits | (1 << 11));
    }

    // Fields shared verbatim by the local header and the central directory record.
    $header = $gpFlags . $gzType . $dosTime . $fileCRC32
        . pack("VVv", $gzLength, $dataLength, strlen($filePath)); // File name length

    $withExtra = $this->addExtraField;

    // Local file header, written straight to the archive.
    $zipEntry = self::ZIP_LOCAL_FILE_HEADER
        . self::ATTR_VERSION_TO_EXTRACT
        . $header
        . pack("v", ($withExtra ? 28 : 0)) // Extra field length
        . $filePath; // FileName
    if ($withExtra) {
        // "UT" record (13 bytes, timestamp written twice) + "ux" record (15 bytes) = 28 bytes.
        $zipEntry .= "\x55\x54\x09\x00\x03" . $unixTime . $unixTime . $unixExtra;
    }
    $this->zipwrite($zipEntry);

    // Central directory record, kept in memory until the archive is finalized.
    $cdEntry = self::ZIP_CENTRAL_FILE_HEADER
        . self::ATTR_MADE_BY_VERSION
        . ($dataLength === 0 ? "\x0A\x00" : self::ATTR_VERSION_TO_EXTRACT)
        . $header
        . pack("v", ($withExtra ? 24 : 0)) // Extra field length
        . pack("v", $commentLength) // File comment length
        . "\x00\x00" // Disk number start
        . "\x00\x00" // internal file attributes
        . pack("V", $extFileAttr) // External file attributes
        . pack("V", $this->offset) // Relative offset of local header
        . $filePath; // FileName
    if ($withExtra) {
        // "UT" record (9 bytes, single timestamp) + "ux" record (15 bytes) = 24 bytes.
        $cdEntry .= "\x55\x54\x05\x00\x03" . $unixTime . $unixExtra;
    }
    if ($hasComment) {
        $cdEntry .= $fileComment; // Comment
    }

    $this->cdRec[] = $cdEntry;
    // Advance past this local header plus the (compressed) data the caller writes after it.
    $this->offset += strlen($zipEntry) + $gzLength;
}
706
/**
 * Append raw bytes to the archive.
 *
 * Writes to the file handle (and flushes it) when $this->zipFile is an open
 * resource; otherwise appends to the in-memory $this->zipData string.
 *
 * @param string $data
 */
private function zipwrite($data) {
    if (is_resource($this->zipFile)) {
        fwrite($this->zipFile, $data);
        fflush($this->zipFile);

        return;
    }

    $this->zipData .= $data;
}
715
/**
 * Move the in-memory archive into a temporary file.
 *
 * Does nothing when $this->zipFile already is an open resource. If the
 * temporary file cannot be created, the archive is left in memory untouched
 * (previously the data was discarded in that case), so zipwrite() keeps
 * appending to $this->zipData.
 */
private function zipflush() {
    if (is_resource($this->zipFile)) {
        return;
    }

    $handle = tmpfile();
    if ($handle === FALSE) {
        // tmpfile() failed: keep the data in memory rather than losing it.
        return;
    }

    $this->zipFile = $handle;
    fwrite($this->zipFile, $this->zipData);
    $this->zipData = NULL;
}
723
724 /**
725 * Join $file to $dir path, and clean up any excess slashes.
726 *
727 * @param string $dir
728 * @param string $file
729 */
public static function pathJoin($dir, $file) {
    // Compare against the empty string rather than using empty(): empty("0")
    // is TRUE in PHP, which would glue a directory or file literally named
    // "0" onto its neighbour without a separator.
    if ((string)$dir === '' || (string)$file === '') {
        return self::getRelativePath($dir . $file);
    }
    return self::getRelativePath($dir . '/' . $file);
}

/**
 * Clean up a path, removing unnecessary elements such as /./, // or redundant ../ segments.
 * If the path starts with a "/", it is deemed an absolute path and any /../ in the beginning is stripped off.
 * The returned path will not end in a "/".
 *
 * Sometimes, when a path is generated from multiple fragments,
 * you can get something like "../data/html/../images/image.jpeg"
 * This will normalize that example path to "../data/images/image.jpeg"
 *
 * A first segment that is exactly a drive designator ("c:") is upper-cased and
 * used as the root. A first segment named "0" is an ordinary relative segment.
 *
 * @param string $path The path to clean up
 * @return string the clean path
 */
public static function getRelativePath($path) {
    // Normalize separators, then collapse "//" and "/./" into a single "/".
    $path = preg_replace("#/+\.?/+#", "/", str_replace("\\", "/", $path));
    // Drop leading "./" prefixes and any trailing "/" before splitting.
    $dirs = explode("/", rtrim(preg_replace('#^(?:\./)+#', '', $path), '/'));

    $offset = 0;    // number of segments currently kept
    $sub = 0;       // number of "../" that escape above the starting directory
    $subOffset = 0; // running balance of ".." segments versus named segments
    $root = "";

    if ($dirs[0] === '') {
        // Leading "/" (or an empty path): absolute path.
        // Strict comparison so that a segment named "0" is not mistaken for empty.
        $root = "/";
        $dirs = array_splice($dirs, 1); // array_splice returns the removed tail, i.e. everything after the root
    } else if (preg_match('#^[A-Za-z]:$#D', $dirs[0])) {
        // Drive designator; anchored so names merely containing "x:" are left alone.
        $root = strtoupper($dirs[0]) . "/";
        $dirs = array_splice($dirs, 1);
    }

    $newDirs = array();
    foreach ($dirs as $dir) {
        if ($dir !== "..") {
            $subOffset--;
            $newDirs[++$offset] = $dir;
        } else {
            $subOffset++;
            if (--$offset < 0) {
                // Tried to go above the first kept segment.
                $offset = 0;
                if ($subOffset > $sub) {
                    $sub++;
                }
            }
        }
    }

    // Relative paths keep their leading "../" steps; rooted paths simply drop them.
    if (empty($root)) {
        $root = str_repeat("../", $sub);
    }
    return $root . implode("/", array_slice($newDirs, 0, $offset));
}
787
788 /**
789 * Create the file permissions for a file or directory, for use in the extFileAttr parameters.
790 *
791 * @param int $owner Unix permissions for owner (octal from 00 to 07)
792 * @param int $group Unix permissions for group (octal from 00 to 07)
793 * @param int $other Unix permissions for others (octal from 00 to 07)
794 * @param bool $isFile
795 * @return int External file attributes field.
796 */
public static function generateExtAttr($owner = 07, $group = 05, $other = 05, $isFile = true) {
    // Pick the file-type bits and the matching DOS attribute for files vs. directories.
    if ($isFile) {
        $typeBits = self::S_IFREG;
        $dosAttr = self::S_DOS_A;
    } else {
        $typeBits = self::S_IFDIR;
        $dosAttr = self::S_DOS_D;
    }

    // Each permission triplet is masked to three bits and placed at owner/group/other position.
    $mode = $typeBits | (($owner & 07) << 6) | (($group & 07) << 3) | ($other & 07);

    // Mode goes in the upper 16 bits, the DOS attribute in the lower bits.
    return ($mode << 16) | $dosAttr;
}
803
804 /**
805 * Get the file permissions for a file or directory, for use in the extFileAttr parameters.
806 *
807 * @param string $filename
808 * @return external ref field, or FALSE if the file is not found.
809 */
public static function getFileExtAttr($filename) {
    // Nothing to report for a path that does not exist.
    if (!file_exists($filename)) {
        return FALSE;
    }

    // fileperms() result goes in the upper 16 bits, the DOS attribute in the lower bits.
    $mode = fileperms($filename) << 16;
    $dosAttr = is_dir($filename) ? self::S_DOS_D : self::S_DOS_A;

    return $mode | $dosAttr;
}
817}
818?>
diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt
new file mode 100644
index 00000000..9424a83e
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt
@@ -0,0 +1,31 @@
1 DrUUID RFC4122 library for PHP5
2 by J. King (http://jkingweb.ca/)
3 Licensed under MIT license
4
5 See http://jkingweb.ca/code/php/lib.uuid/
6 for documentation
7
8 Last revised 2010-02-15
9
10Copyright (c) 2009 J. King
11
12Permission is hereby granted, free of charge, to any person
13obtaining a copy of this software and associated documentation
14files (the "Software"), to deal in the Software without
15restriction, including without limitation the rights to use,
16copy, modify, merge, publish, distribute, sublicense, and/or sell
17copies of the Software, and to permit persons to whom the
18Software is furnished to do so, subject to the following
19conditions:
20
21The above copyright notice and this permission notice shall be
22included in all copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
26OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
28HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
29WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
30FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
31OTHER DEALINGS IN THE SOFTWARE.
diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.php b/inc/3rdparty/libraries/PHPePub/lib.uuid.php
new file mode 100644
index 00000000..c6a8de52
--- /dev/null
+++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.php
@@ -0,0 +1,314 @@
1<?php
2/*
3 DrUUID RFC4122 library for PHP5
4by J. King (http://jkingweb.ca/)
5Licensed under MIT license
6
7See http://jkingweb.ca/code/php/lib.uuid/
8for documentation
9
10Last revised 2010-02-15
11*/
12
13/*
14 Copyright (c) 2009 J. King
15
16Permission is hereby granted, free of charge, to any person
17obtaining a copy of this software and associated documentation
18files (the "Software"), to deal in the Software without
19restriction, including without limitation the rights to use,
20copy, modify, merge, publish, distribute, sublicense, and/or sell
21copies of the Software, and to permit persons to whom the
22Software is furnished to do so, subject to the following
23conditions:
24
25The above copyright notice and this permission notice shall be
26included in all copies or substantial portions of the Software.
27
28THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
29EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
30OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
31NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
32HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
33WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
35OTHER DEALINGS IN THE SOFTWARE.
36*/
37
38
/**
 * RFC 4122 UUID value object with factories for versions 1, 3, 4 and 5.
 *
 * Instances are created through UUID::mint() or UUID::import(); the
 * constructor is protected. Read-only virtual properties (bytes, hex, string,
 * urn, version, variant, node, time) are served by __get().
 */
class UUID {
    const MD5  = 3;
    const SHA1 = 5;
    const clearVer = 15;  // 00001111  Clears all bits of version byte with AND
    const clearVar = 63;  // 00111111  Clears all relevant bits of variant byte with AND
    const varRes   = 224; // 11100000  Variant reserved for future use
    const varMS    = 192; // 11000000  Microsoft GUID variant
    const varRFC   = 128; // 10000000  The RFC 4122 variant (this variant)
    const varNCS   = 0;   // 00000000  The NCS compatibility variant
    const version1 = 16;  // 00010000
    const version3 = 48;  // 00110000
    const version4 = 64;  // 01000000
    const version5 = 80;  // 01010000
    const interval = 0x01b21dd213814000; // Time (in 100ns steps) between the start of the UTC and Unix epochs
    const nsDNS  = '6ba7b810-9dad-11d1-80b4-00c04fd430c8';
    const nsURL  = '6ba7b811-9dad-11d1-80b4-00c04fd430c8';
    const nsOID  = '6ba7b812-9dad-11d1-80b4-00c04fd430c8';
    const nsX500 = '6ba7b814-9dad-11d1-80b4-00c04fd430c8';
    // Name of the static method used by randomBytes(). The default uses
    // mt_rand(), which is NOT cryptographically secure; call initRandom()
    // to switch to a system source when one is available.
    protected static $randomFunc = 'randomTwister';
    protected static $randomSource = NULL;
    //instance properties
    protected $bytes;
    protected $hex;
    protected $string;
    protected $urn;
    protected $version;
    protected $variant;
    protected $node;
    protected $time;

    public static function mint($ver = 1, $node = NULL, $ns = NULL) {
        /* Create a new UUID based on provided data.
           $node is the MAC address for version 1, or the name for versions 3 and 5;
           $ns is the namespace UUID for versions 3 and 5.
           Throws UUIDException for unsupported versions. */
        switch((int) $ver) {
            case 1:
                return new self(self::mintTime($node));
            case 2:
                // Version 2 is not supported
                throw new UUIDException("Version 2 is unsupported.");
            case 3:
                return new self(self::mintName(self::MD5, $node, $ns));
            case 4:
                return new self(self::mintRand());
            case 5:
                return new self(self::mintName(self::SHA1, $node, $ns));
            default:
                throw new UUIDException("Selected version is invalid or unsupported.");
        }
    }

    public static function import($uuid) {
        /* Import an existing UUID (binary, hex, dashed string or URN form).
           Throws UUIDException if the input cannot be read as 16 bytes. */
        return new self(self::makeBin($uuid, 16));
    }

    public static function compare($a, $b) {
        /* Compares the binary representations of two UUIDs.
           The comparison will return true if they are bit-exact,
           or if neither is valid.
           Identity comparison is required: with == PHP compares two
           numeric-looking strings as numbers, so distinct 16-character
           inputs such as "0000000000000001" and "1.00000000000000"
           would be reported as equal. */
        return self::makeBin($a, 16) === self::makeBin($b, 16);
    }

    public function __toString() {
        return $this->string;
    }

    public function __get($var) {
        /* Serve the read-only virtual properties; unknown names yield NULL. */
        switch($var) {
            case "bytes":
                return $this->bytes;
            case "hex":
                return bin2hex($this->bytes);
            case "string":
                return $this->__toString();
            case "urn":
                return "urn:uuid:".$this->__toString();
            case "version":
                return ord($this->bytes[6]) >> 4;
            case "variant":
                $byte = ord($this->bytes[8]);
                if ($byte >= self::varRes) {
                    return 3;
                }
                if ($byte >= self::varMS) {
                    return 2;
                }
                if ($byte >= self::varRFC) {
                    return 1;
                }
                return 0;
            case "node":
                // Only version 1 UUIDs carry a node
                if (ord($this->bytes[6])>>4==1) {
                    return bin2hex(substr($this->bytes,10));
                } else {
                    return NULL;
                }
            case "time":
                // Only version 1 UUIDs carry a timestamp
                if (ord($this->bytes[6])>>4==1) {
                    // Restore contiguous big-endian byte order
                    $time = bin2hex($this->bytes[6].$this->bytes[7].$this->bytes[4].$this->bytes[5].$this->bytes[0].$this->bytes[1].$this->bytes[2].$this->bytes[3]);
                    // Clear version flag
                    $time[0] = "0";
                    // Do some reverse arithmetic to get a Unix timestamp
                    $time = (hexdec($time) - self::interval) / 10000000;
                    return $time;
                } else {
                    return NULL;
                }
            default:
                return NULL;
        }
    }

    protected function __construct($uuid) {
        /* $uuid must be the 16-byte binary form. A FALSE from makeBin()
           has length 0 and is rejected here as well. */
        if (strlen($uuid) != 16) {
            throw new UUIDException("Input must be a 128-bit integer.");
        }
        $this->bytes = $uuid;
        // Optimize the most common use
        $this->string =
            bin2hex(substr($uuid,0,4))."-".
            bin2hex(substr($uuid,4,2))."-".
            bin2hex(substr($uuid,6,2))."-".
            bin2hex(substr($uuid,8,2))."-".
            bin2hex(substr($uuid,10,6));
    }

    protected static function mintTime($node = NULL) {
        /* Generates a Version 1 UUID.
           These are derived from the time at which they were generated. */
        // Get time since Gregorian calendar reform in 100ns intervals
        // This is exceedingly difficult because of PHP's (and pack()'s)
        //  integer size limits.
        // Note that this will never be more accurate than to the microsecond.
        $time = microtime(1) * 10000000 + self::interval;
        // Convert to a string representation
        $time = sprintf("%F", $time);
        preg_match("/^\d+/", $time, $time); //strip decimal point
        // And now to a 64-bit binary representation
        $time = base_convert($time[0], 10, 16);
        $time = pack("H*", str_pad($time, 16, "0", STR_PAD_LEFT));
        // Reorder bytes to their proper locations in the UUID
        $uuid = $time[4].$time[5].$time[6].$time[7].$time[2].$time[3].$time[0].$time[1];
        // Generate a random clock sequence
        $uuid .= self::randomBytes(2);
        // set variant
        $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC);
        // set version
        $uuid[6] = chr(ord($uuid[6]) & self::clearVer | self::version1);
        // Set the final 'node' parameter, a MAC address
        if ($node) {
            $node = self::makeBin($node, 6);
        }
        if (!$node) {
            // If no node was provided or if the node was invalid,
            //  generate a random MAC address and set the multicast bit
            $node = self::randomBytes(6);
            $node[0] = pack("C", ord($node[0]) | 1);
        }
        $uuid .= $node;
        return $uuid;
    }

    protected static function mintRand() {
        /* Generate a Version 4 UUID.
           These are derived solely from random numbers. */
        // generate random fields
        $uuid = self::randomBytes(16);
        // set variant
        $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC);
        // set version
        $uuid[6] = chr(ord($uuid[6]) & self::clearVer | self::version4);
        return $uuid;
    }

    protected static function mintName($ver, $node, $ns) {
        /* Generates a Version 3 or Version 5 UUID.
           These are derived from a hash of a name and its namespace, in binary form.
           $ver is self::MD5 or self::SHA1; anything else raises UUIDException. */
        if (!$node) {
            throw new UUIDException("A name-string is required for Version 3 or 5 UUIDs.");
        }
        // if the namespace UUID isn't binary, make it so
        $ns = self::makeBin($ns, 16);
        if (!$ns) {
            throw new UUIDException("A binary namespace is required for Version 3 or 5 UUIDs.");
        }
        switch($ver) {
            case self::MD5:
                $version = self::version3;
                $uuid = md5($ns.$node,1);
                break;
            case self::SHA1:
                $version = self::version5;
                $uuid = substr(sha1($ns.$node,1),0, 16);
                break;
            default:
                // Without a hash there are no bytes to stamp the variant and version onto.
                throw new UUIDException("Selected version is invalid or unsupported.");
        }
        // set variant
        $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC);
        // set version
        $uuid[6] = chr(ord($uuid[6]) & self::clearVer | $version);
        return ($uuid);
    }

    protected static function makeBin($str, $len) {
        /* Ensure that an input string is either binary or hexadecimal.
           Returns binary representation, or false on failure.
           A string of exactly $len bytes is taken to be binary already. */
        if ($str instanceof self) {
            return $str->bytes;
        }
        if ($str === NULL) {
            // Nothing to convert; avoids handing NULL to the string functions below.
            return FALSE;
        }
        if (strlen($str)==$len) {
            return $str;
        } else {
            $str = preg_replace("/^urn:uuid:/is", "", $str); // strip URN scheme and namespace
        }
        $str = preg_replace("/[^a-f0-9]/is", "", $str);  // strip non-hex characters
        if (strlen($str) != ($len * 2)) {
            return FALSE;
        } else {
            return pack("H*", $str);
        }
    }

    public static function initRandom() {
        /* Look for a system-provided source of randomness, which is usually cryptographically secure.
           /dev/urandom is tried first simply out of bias for Linux systems.
           Returns the name of the random function now in use. */
        if (is_readable('/dev/urandom')) {
            $handle = fopen('/dev/urandom', 'rb');
            // Only switch over when the handle was really opened; otherwise
            // keep the current random function so randomBytes() still works.
            if ($handle !== FALSE) {
                self::$randomSource = $handle;
                self::$randomFunc = 'randomFRead';
            }
        }
        else if (class_exists('COM', 0)) {
            try {
                self::$randomSource = new COM('CAPICOM.Utilities.1');  // See http://msdn.microsoft.com/en-us/library/aa388182(VS.85).aspx
                self::$randomFunc = 'randomCOM';
            }
            catch(Exception $e) {
                // COM source unavailable: keep the current random function.
            }
        }
        return self::$randomFunc;
    }

    public static function randomBytes($bytes) {
        /* Return $bytes random bytes from the currently selected source.
           The class is named via __CLASS__ because the "self" callable string
           is deprecated as of PHP 8.2. */
        return call_user_func(array(__CLASS__, self::$randomFunc), $bytes);
    }

    protected static function randomTwister($bytes) {
        /* Get the specified number of random bytes, using mt_rand().
           Randomness is returned as a string of bytes. */
        $rand = "";
        for ($a = 0; $a < $bytes; $a++) {
            $rand .= chr(mt_rand(0, 255));
        }
        return $rand;
    }

    protected static function randomFRead($bytes) {
        /* Get the specified number of random bytes using a file handle
           previously opened with UUID::initRandom().
           Randomness is returned as a string of bytes. */
        return fread(self::$randomSource, $bytes);
    }

    protected static function randomCOM($bytes) {
        /* Get the specified number of random bytes using Windows'
           randomness source via a COM object previously created by UUID::initRandom().
           Randomness is returned as a string of bytes. */
        return base64_decode(self::$randomSource->GetRandom($bytes,0)); // straight binary mysteriously doesn't work, hence the base64
    }
}
312
/**
 * Exception type thrown by the UUID class.
 */
class UUIDException extends Exception
{
}
diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
index ddd33bb5..21e693e7 100644
--- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
+++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
@@ -1,728 +1,727 @@
1<?php 1<?php
2/** 2/**
3 * Content Extractor 3 * Content Extractor
4 * 4 *
5 * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 5 * Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
6 * to extract content from HTML files. 6 * to extract content from HTML files.
7 * 7 *
8 * @version 1.0 8 * @version 1.0
9 * @date 2013-02-05 9 * @date 2013-02-05
10 * @author Keyvan Minoukadeh 10 * @author Keyvan Minoukadeh
11 * @copyright 2013 Keyvan Minoukadeh 11 * @copyright 2013 Keyvan Minoukadeh
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
13 */ 13 */
14 14
15class ContentExtractor 15class ContentExtractor
16{ 16{
17 protected static $tidy_config = array( 17 protected static $tidy_config = array(
18 'clean' => true, 18 'clean' => true,
19 'output-xhtml' => true, 19 'output-xhtml' => true,
20 'logical-emphasis' => true, 20 'logical-emphasis' => true,
21 'show-body-only' => false, 21 'show-body-only' => false,
22 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 22 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
23 'new-inline-tags' => 'mark, time, meter, progress, data', 23 'new-inline-tags' => 'mark, time, meter, progress, data',
24 'wrap' => 0, 24 'wrap' => 0,
25 'drop-empty-paras' => true, 25 'drop-empty-paras' => true,
26 'drop-proprietary-attributes' => false, 26 'drop-proprietary-attributes' => false,
27 'enclose-text' => true, 27 'enclose-text' => true,
28 'enclose-block-text' => true, 28 'enclose-block-text' => true,
29 'merge-divs' => true, 29 'merge-divs' => true,
30 'merge-spans' => true, 30 'merge-spans' => true,
31 'char-encoding' => 'utf8', 31 'char-encoding' => 'utf8',
32 'hide-comments' => true 32 'hide-comments' => true
33 ); 33 );
34 protected $html; 34 protected $html;
35 protected $config; 35 protected $config;
36 protected $title; 36 protected $title;
37 protected $author = array(); 37 protected $author = array();
38 protected $language; 38 protected $language;
39 protected $date; 39 protected $date;
40 protected $body; 40 protected $body;
41 protected $success = false; 41 protected $success = false;
42 protected $nextPageUrl; 42 protected $nextPageUrl;
43 public $allowedParsers = array('libxml', 'html5lib'); 43 public $allowedParsers = array('libxml', 'html5lib');
44 public $fingerprints = array(); 44 public $fingerprints = array();
45 public $readability; 45 public $readability;
46 public $debug = false; 46 public $debug = false;
47 public $debugVerbose = false; 47 public $debugVerbose = false;
48 48
function __construct($path, $fallback=null) {
    // Hand the site-config path and the optional fallback path to SiteConfig.
    SiteConfig::set_config_path(
        $path,
        $fallback
    );
}
52 52
/**
 * Print a debug line ("* message") when $this->debug is enabled, optionally
 * followed by current and peak memory usage (in KiB) when $this->debugVerbose
 * is set, then push the output towards the client.
 *
 * @param string $msg Message to print
 */
protected function debug($msg) {
    if (!$this->debug) {
        return;
    }

    echo '* ',$msg;
    if ($this->debugVerbose) {
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo ' - mem used: ',$mem," (peak: $memPeak)";
    }
    echo "\n";

    // ob_flush() raises a notice when no output buffer is active, so only
    // call it when there is one to flush.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}
64 64
public function reset() {
    // Source document and the objects derived from it.
    $this->html = null;
    $this->config = null;
    $this->readability = null;

    // Extracted fields.
    $this->title = null;
    $this->author = array();
    $this->language = null;
    $this->date = null;
    $this->body = null;
    $this->nextPageUrl = null;

    // Outcome flag.
    $this->success = false;
}
77 77
public function findHostUsingFingerprints($html) {
    $this->debug('Checking fingerprints...');
    // Some fingerprints are only looked for in the first 8000 bytes of the document.
    $head = substr($html, 0, 8000);

    foreach ($this->fingerprints as $needle => $target) {
        // By default the whole document is searched.
        $haystack = $html;
        if (is_array($target)) {
            // Array form: a truthy 'head' entry restricts the search to $head;
            // 'hostname' holds the host to return.
            if (isset($target['head']) && $target['head']) {
                $haystack = $head;
            }
            $target = $target['hostname'];
        }

        if (strpos($haystack, $needle) !== false) {
            $this->debug("Found match: $target");
            return $target;
        }
    }

    $this->debug('No fingerprint matches');
    return false;
}
97 97
98 // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) 98 // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
public function buildSiteConfig($url, $html='', $add_to_cache=true) {
    // extract host name, lower-cased and without a leading "www."
    $host = strtolower(@parse_url($url, PHP_URL_HOST));
    if (substr($host, 0, 4) == 'www.') {
        $host = substr($host, 4);
    }
    $mergedKey = "$host.merged";

    // is merged version already cached?
    if (SiteConfig::is_cached($mergedKey)) {
        $this->debug("Returning cached and merged site config for $host");
        return SiteConfig::build($mergedKey);
    }

    // let's build from site_config/custom/ and standard/
    $config = SiteConfig::build($host);
    if (!$config) {
        // if no match, use defaults
        $config = new SiteConfig();
    } elseif ($add_to_cache && !SiteConfig::is_cached("$host")) {
        SiteConfig::add_to_cache($host, $config);
    }

    // load fingerprint config?
    if ($config->autodetect_on_failure() && !empty($this->fingerprints)) {
        // check HTML for fingerprints
        $fingerprintHost = $this->findHostUsingFingerprints($html);
        if ($fingerprintHost) {
            $fingerprintConfig = SiteConfig::build($fingerprintHost);
            if ($fingerprintConfig) {
                $this->debug("Appending site config settings from $fingerprintHost (fingerprint match)");
                $config->append($fingerprintConfig);
                if ($add_to_cache && !SiteConfig::is_cached($fingerprintHost)) {
                    //$fingerprintConfig->cache_in_apc = true;
                    SiteConfig::add_to_cache($fingerprintHost, $fingerprintConfig);
                }
            }
        }
    }

    // load global config? (asked again, after any fingerprint config has been appended)
    if ($config->autodetect_on_failure()) {
        $globalConfig = SiteConfig::build('global', true);
        if ($globalConfig) {
            $this->debug('Appending site config settings from global.txt');
            $config->append($globalConfig);
            if ($add_to_cache && !SiteConfig::is_cached('global')) {
                //$globalConfig->cache_in_apc = true;
                SiteConfig::add_to_cache('global', $globalConfig);
            }
        }
    }

    // store copy of merged config
    if ($add_to_cache) {
        // do not store in APC if wildcard match
        $use_apc = ($host == $config->cache_key);
        $config->cache_key = null;
        SiteConfig::add_to_cache($mergedKey, $config, $use_apc);
    }

    return $config;
}
150 150
151 // returns true on success, false on failure 151 // returns true on success, false on failure
152 // $smart_tidy indicates that if tidy is used and no results are produced, we will 152 // $smart_tidy indicates that if tidy is used and no results are produced, we will
153 // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time 153 // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
154 // but it has problems of its own which we try to avoid with this option. 154 // but it has problems of its own which we try to avoid with this option.
155 public function process($html, $url, $smart_tidy=true) { 155 public function process($html, $url, $smart_tidy=true) {
156 $this->reset(); 156 $this->reset();
157 $this->config = $this->buildSiteConfig($url, $html); 157 $this->config = $this->buildSiteConfig($url, $html);
158 158
159 // do string replacements 159 // do string replacements
160 if (!empty($this->config->find_string)) { 160 if (!empty($this->config->find_string)) {
161 if (count($this->config->find_string) == count($this->config->replace_string)) { 161 if (count($this->config->find_string) == count($this->config->replace_string)) {
162 $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); 162 $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
163 $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); 163 $this->debug("Strings replaced: $_count (find_string and/or replace_string)");
164 } else { 164 } else {
165 $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); 165 $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
166 } 166 }
167 unset($_count); 167 unset($_count);
168 } 168 }
169 169
170 // use tidy (if it exists)? 170 // use tidy (if it exists)?
171 // This fixes problems with some sites which would otherwise 171 // This fixes problems with some sites which would otherwise
172 // trouble DOMDocument's HTML parsing. (Although sometimes it 172 // trouble DOMDocument's HTML parsing. (Although sometimes it
173 // makes matters worse, which is why you can override it in site config files.) 173 // makes matters worse, which is why you can override it in site config files.)
174 $tidied = false; 174 $tidied = false;
175 if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { 175 if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
176 $this->debug('Using Tidy'); 176 $this->debug('Using Tidy');
177 $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); 177 $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
178 if (tidy_clean_repair($tidy)) { 178 if (tidy_clean_repair($tidy)) {
179 $original_html = $html; 179 $original_html = $html;
180 $tidied = true; 180 $tidied = true;
181 $html = $tidy->value; 181 $html = $tidy->value;
182 } 182 }
183 unset($tidy); 183 unset($tidy);
184 } 184 }
185 185
186 // load and parse html 186 // load and parse html
187 $_parser = $this->config->parser(); 187 $_parser = $this->config->parser();
188 if (!in_array($_parser, $this->allowedParsers)) { 188 if (!in_array($_parser, $this->allowedParsers)) {
189 $this->debug("HTML parser $_parser not listed, using libxml instead"); 189 $this->debug("HTML parser $_parser not listed, using libxml instead");
190 $_parser = 'libxml'; 190 $_parser = 'libxml';
191 } 191 }
192 $this->debug("Attempting to parse HTML with $_parser"); 192 $this->debug("Attempting to parse HTML with $_parser");
193 $this->readability = new Readability($html, $url, $_parser); 193 $this->readability = new Readability($html, $url, $_parser);
194 194
195 // we use xpath to find elements in the given HTML document 195 // we use xpath to find elements in the given HTML document
196 // see http://en.wikipedia.org/wiki/XPath_1.0 196 // see http://en.wikipedia.org/wiki/XPath_1.0
197 $xpath = new DOMXPath($this->readability->dom); 197 $xpath = new DOMXPath($this->readability->dom);
198 198
199 // try to get next page link 199 // try to get next page link
200 foreach ($this->config->next_page_link as $pattern) { 200 foreach ($this->config->next_page_link as $pattern) {
201 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 201 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
202 if (is_string($elems)) { 202 if (is_string($elems)) {
203 $this->nextPageUrl = trim($elems); 203 $this->nextPageUrl = trim($elems);
204 break; 204 break;
205 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 205 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
206 foreach ($elems as $item) { 206 foreach ($elems as $item) {
207 if ($item instanceof DOMElement && $item->hasAttribute('href')) { 207 if ($item instanceof DOMElement && $item->hasAttribute('href')) {
208 $this->nextPageUrl = $item->getAttribute('href'); 208 $this->nextPageUrl = $item->getAttribute('href');
209 break 2; 209 break 2;
210 } elseif ($item instanceof DOMAttr && $item->value) { 210 } elseif ($item instanceof DOMAttr && $item->value) {
211 $this->nextPageUrl = $item->value; 211 $this->nextPageUrl = $item->value;
212 break 2; 212 break 2;
213 } 213 }
214 } 214 }
215 } 215 }
216 } 216 }
217 217
218 // try to get title 218 // try to get title
219 foreach ($this->config->title as $pattern) { 219 foreach ($this->config->title as $pattern) {
220 // $this->debug("Trying $pattern"); 220 // $this->debug("Trying $pattern");
221 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 221 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
222 if (is_string($elems)) { 222 if (is_string($elems)) {
223 $this->title = trim($elems); 223 $this->title = trim($elems);
224 $this->debug('Title expression evaluated as string: '.$this->title); 224 $this->debug('Title expression evaluated as string: '.$this->title);
225 $this->debug("...XPath match: $pattern"); 225 $this->debug("...XPath match: $pattern");
226 break; 226 break;
227 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 227 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
228 $this->title = $elems->item(0)->textContent; 228 $this->title = $elems->item(0)->textContent;
229 $this->debug('Title matched: '.$this->title); 229 $this->debug('Title matched: '.$this->title);
230 $this->debug("...XPath match: $pattern"); 230 $this->debug("...XPath match: $pattern");
231 // remove title from document 231 // remove title from document
232 try { 232 try {
233 $elems->item(0)->parentNode->removeChild($elems->item(0)); 233 @$elems->item(0)->parentNode->removeChild($elems->item(0));
234 } catch (DOMException $e) { 234 } catch (DOMException $e) {
235 // do nothing 235 // do nothing
236 } 236 }
237 break; 237 break;
238 } 238 }
239 } 239 }
240 240
241 // try to get author (if it hasn't already been set) 241 // try to get author (if it hasn't already been set)
242 if (empty($this->author)) { 242 if (empty($this->author)) {
243 foreach ($this->config->author as $pattern) { 243 foreach ($this->config->author as $pattern) {
244 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 244 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
245 if (is_string($elems)) { 245 if (is_string($elems)) {
246 if (trim($elems) != '') { 246 if (trim($elems) != '') {
247 $this->author[] = trim($elems); 247 $this->author[] = trim($elems);
248 $this->debug('Author expression evaluated as string: '.trim($elems)); 248 $this->debug('Author expression evaluated as string: '.trim($elems));
249 $this->debug("...XPath match: $pattern"); 249 $this->debug("...XPath match: $pattern");
250 break; 250 break;
251 } 251 }
252 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 252 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
253 foreach ($elems as $elem) { 253 foreach ($elems as $elem) {
254 if (!isset($elem->parentNode)) continue; 254 if (!isset($elem->parentNode)) continue;
255 $this->author[] = trim($elem->textContent); 255 $this->author[] = trim($elem->textContent);
256 $this->debug('Author matched: '.trim($elem->textContent)); 256 $this->debug('Author matched: '.trim($elem->textContent));
257 } 257 }
258 if (!empty($this->author)) { 258 if (!empty($this->author)) {
259 $this->debug("...XPath match: $pattern"); 259 $this->debug("...XPath match: $pattern");
260 break; 260 break;
261 } 261 }
262 } 262 }
263 } 263 }
264 } 264 }
265 265
266 // try to get language 266 // try to get language
267 $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); 267 $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
268 foreach ($_lang_xpath as $pattern) { 268 foreach ($_lang_xpath as $pattern) {
269 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 269 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
270 if (is_string($elems)) { 270 if (is_string($elems)) {
271 if (trim($elems) != '') { 271 if (trim($elems) != '') {
272 $this->language = trim($elems); 272 $this->language = trim($elems);
273 $this->debug('Language matched: '.$this->language); 273 $this->debug('Language matched: '.$this->language);
274 break; 274 break;
275 } 275 }
276 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 276 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
277 foreach ($elems as $elem) { 277 foreach ($elems as $elem) {
278 if (!isset($elem->parentNode)) continue; 278 if (!isset($elem->parentNode)) continue;
279 $this->language = trim($elem->textContent); 279 $this->language = trim($elem->textContent);
280 $this->debug('Language matched: '.$this->language); 280 $this->debug('Language matched: '.$this->language);
281 } 281 }
282 if ($this->language) break; 282 if ($this->language) break;
283 } 283 }
284 } 284 }
285 285
286 // try to get date 286 // try to get date
287 foreach ($this->config->date as $pattern) { 287 foreach ($this->config->date as $pattern) {
288 $elems = @$xpath->evaluate($pattern, $this->readability->dom); 288 $elems = @$xpath->evaluate($pattern, $this->readability->dom);
289 if (is_string($elems)) { 289 if (is_string($elems)) {
290 $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); 290 $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));
291 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { 291 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
292 $this->date = $elems->item(0)->textContent; 292 $this->date = $elems->item(0)->textContent;
293 $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); 293 $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));
294 // remove date from document 294 // remove date from document
295 // $elems->item(0)->parentNode->removeChild($elems->item(0)); 295 // $elems->item(0)->parentNode->removeChild($elems->item(0));
296 } 296 }
297 if (!$this->date) { 297 if (!$this->date) {
298 $this->date = null; 298 $this->date = null;
299 } else { 299 } else {
300 $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); 300 $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));
301 $this->debug("...XPath match: $pattern"); 301 $this->debug("...XPath match: $pattern");
302 break; 302 break;
303 } 303 }
304 } 304 }
305 305
306 // strip elements (using xpath expressions) 306 // strip elements (using xpath expressions)
307 foreach ($this->config->strip as $pattern) { 307 foreach ($this->config->strip as $pattern) {
308 $elems = @$xpath->query($pattern, $this->readability->dom); 308 $elems = @$xpath->query($pattern, $this->readability->dom);
309 // check for matches 309 // check for matches
310 if ($elems && $elems->length > 0) { 310 if ($elems && $elems->length > 0) {
311 $this->debug('Stripping '.$elems->length.' elements (strip)'); 311 $this->debug('Stripping '.$elems->length.' elements (strip)');
312 for ($i=$elems->length-1; $i >= 0; $i--) { 312 for ($i=$elems->length-1; $i >= 0; $i--) {
313 $elems->item($i)->parentNode->removeChild($elems->item($i)); 313 $elems->item($i)->parentNode->removeChild($elems->item($i));
314 } 314 }
315 } 315 }
316 } 316 }
317 317
318 // strip elements (using id and class attribute values) 318 // strip elements (using id and class attribute values)
319 foreach ($this->config->strip_id_or_class as $string) { 319 foreach ($this->config->strip_id_or_class as $string) {
320 $string = strtr($string, array("'"=>'', '"'=>'')); 320 $string = strtr($string, array("'"=>'', '"'=>''));
321 $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); 321 $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
322 // check for matches 322 // check for matches
323 if ($elems && $elems->length > 0) { 323 if ($elems && $elems->length > 0) {
324 $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); 324 $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
325 for ($i=$elems->length-1; $i >= 0; $i--) { 325 for ($i=$elems->length-1; $i >= 0; $i--) {
326 $elems->item($i)->parentNode->removeChild($elems->item($i)); 326 $elems->item($i)->parentNode->removeChild($elems->item($i));
327 } 327 }
328 } 328 }
329 } 329 }
330 330
331 // strip images (using src attribute values) 331 // strip images (using src attribute values)
332 foreach ($this->config->strip_image_src as $string) { 332 foreach ($this->config->strip_image_src as $string) {
333 $string = strtr($string, array("'"=>'', '"'=>'')); 333 $string = strtr($string, array("'"=>'', '"'=>''));
334 $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); 334 $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
335 // check for matches 335 // check for matches
336 if ($elems && $elems->length > 0) { 336 if ($elems && $elems->length > 0) {
337 $this->debug('Stripping '.$elems->length.' image elements'); 337 $this->debug('Stripping '.$elems->length.' image elements');
338 for ($i=$elems->length-1; $i >= 0; $i--) { 338 for ($i=$elems->length-1; $i >= 0; $i--) {
339 $elems->item($i)->parentNode->removeChild($elems->item($i)); 339 $elems->item($i)->parentNode->removeChild($elems->item($i));
340 } 340 }
341 } 341 }
342 } 342 }
343 // strip elements using Readability.com and Instapaper.com ignore class names 343 // strip elements using Readability.com and Instapaper.com ignore class names
344 // .entry-unrelated and .instapaper_ignore 344 // .entry-unrelated and .instapaper_ignore
345 // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines 345 // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
346 // and http://blog.instapaper.com/post/730281947 346 // and http://blog.instapaper.com/post/730281947
347 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); 347 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
348 // check for matches 348 // check for matches
349 if ($elems && $elems->length > 0) { 349 if ($elems && $elems->length > 0) {
350 $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); 350 $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');
351 for ($i=$elems->length-1; $i >= 0; $i--) { 351 for ($i=$elems->length-1; $i >= 0; $i--) {
352 $elems->item($i)->parentNode->removeChild($elems->item($i)); 352 $elems->item($i)->parentNode->removeChild($elems->item($i));
353 } 353 }
354 } 354 }
355 355
356 // strip elements that contain style="display: none;" 356 // strip elements that contain style="display: none;"
357 $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); 357 $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
358 // check for matches 358 // check for matches
359 if ($elems && $elems->length > 0) { 359 if ($elems && $elems->length > 0) {
360 $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); 360 $this->debug('Stripping '.$elems->length.' elements with inline display:none style');
361 for ($i=$elems->length-1; $i >= 0; $i--) { 361 for ($i=$elems->length-1; $i >= 0; $i--) {
362 $elems->item($i)->parentNode->removeChild($elems->item($i)); 362 $elems->item($i)->parentNode->removeChild($elems->item($i));
363 } 363 }
364 } 364 }
365 365
366 // try to get body 366 // try to get body
367 foreach ($this->config->body as $pattern) { 367 foreach ($this->config->body as $pattern) {
368 $elems = @$xpath->query($pattern, $this->readability->dom); 368 $elems = @$xpath->query($pattern, $this->readability->dom);
369 // check for matches 369 // check for matches
370 if ($elems && $elems->length > 0) { 370 if ($elems && $elems->length > 0) {
371 $this->debug('Body matched'); 371 $this->debug('Body matched');
372 $this->debug("...XPath match: $pattern"); 372 $this->debug("...XPath match: $pattern");
373 if ($elems->length == 1) { 373 if ($elems->length == 1) {
374 $this->body = $elems->item(0); 374 $this->body = $elems->item(0);
375 // prune (clean up elements that may not be content) 375 // prune (clean up elements that may not be content)
376 if ($this->config->prune()) { 376 if ($this->config->prune()) {
377 $this->debug('...pruning content'); 377 $this->debug('...pruning content');
378 $this->readability->prepArticle($this->body); 378 $this->readability->prepArticle($this->body);
379 } 379 }
380 break; 380 break;
381 } else { 381 } else {
382 $this->body = $this->readability->dom->createElement('div'); 382 $this->body = $this->readability->dom->createElement('div');
383 $this->debug($elems->length.' body elems found'); 383 $this->debug($elems->length.' body elems found');
384 foreach ($elems as $elem) { 384 foreach ($elems as $elem) {
385 if (!isset($elem->parentNode)) continue; 385 if (!isset($elem->parentNode)) continue;
386 $isDescendant = false; 386 $isDescendant = false;
387 foreach ($this->body->childNodes as $parent) { 387 foreach ($this->body->childNodes as $parent) {
388 if ($this->isDescendant($parent, $elem)) { 388 if ($this->isDescendant($parent, $elem)) {
389 $isDescendant = true; 389 $isDescendant = true;
390 break; 390 break;
391 } 391 }
392 } 392 }
393 if ($isDescendant) { 393 if ($isDescendant) {
394 $this->debug('...element is child of another body element, skipping.'); 394 $this->debug('...element is child of another body element, skipping.');
395 } else { 395 } else {
396 // prune (clean up elements that may not be content) 396 // prune (clean up elements that may not be content)
397 if ($this->config->prune()) { 397 if ($this->config->prune()) {
398 $this->debug('Pruning content'); 398 $this->debug('Pruning content');
399 $this->readability->prepArticle($elem); 399 $this->readability->prepArticle($elem);
400 } 400 }
401 $this->debug('...element added to body'); 401 $this->debug('...element added to body');
402 $this->body->appendChild($elem); 402 $this->body->appendChild($elem);
403 } 403 }
404 } 404 }
405 if ($this->body->hasChildNodes()) break; 405 if ($this->body->hasChildNodes()) break;
406 } 406 }
407 } 407 }
408 } 408 }
409 409
410 // auto detect? 410 // auto detect?
411 $detect_title = $detect_body = $detect_author = $detect_date = false; 411 $detect_title = $detect_body = $detect_author = $detect_date = false;
412 // detect title? 412 // detect title?
413 if (!isset($this->title)) { 413 if (!isset($this->title)) {
414 if (empty($this->config->title) || $this->config->autodetect_on_failure()) { 414 if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
415 $detect_title = true; 415 $detect_title = true;
416 } 416 }
417 } 417 }
418 // detect body? 418 // detect body?
419 if (!isset($this->body)) { 419 if (!isset($this->body)) {
420 if (empty($this->config->body) || $this->config->autodetect_on_failure()) { 420 if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
421 $detect_body = true; 421 $detect_body = true;
422 } 422 }
423 } 423 }
424 // detect author? 424 // detect author?
425 if (empty($this->author)) { 425 if (empty($this->author)) {
426 if (empty($this->config->author) || $this->config->autodetect_on_failure()) { 426 if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
427 $detect_author = true; 427 $detect_author = true;
428 } 428 }
429 } 429 }
430 // detect date? 430 // detect date?
431 if (!isset($this->date)) { 431 if (!isset($this->date)) {
432 if (empty($this->config->date) || $this->config->autodetect_on_failure()) { 432 if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
433 $detect_date = true; 433 $detect_date = true;
434 } 434 }
435 } 435 }
436 436
437 // check for hNews 437 // check for hNews
438 if ($detect_title || $detect_body) { 438 if ($detect_title || $detect_body) {
439 // check for hentry 439 // check for hentry
440 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); 440 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
441 if ($elems && $elems->length > 0) { 441 if ($elems && $elems->length > 0) {
442 $this->debug('hNews: found hentry'); 442 $this->debug('hNews: found hentry');
443 $hentry = $elems->item(0); 443 $hentry = $elems->item(0);
444 444
445 if ($detect_title) { 445 if ($detect_title) {
446 // check for entry-title 446 // check for entry-title
447 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); 447 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
448 if ($elems && $elems->length > 0) { 448 if ($elems && $elems->length > 0) {
449 $this->title = $elems->item(0)->textContent; 449 $this->title = $elems->item(0)->textContent;
450 $this->debug('hNews: found entry-title: '.$this->title); 450 $this->debug('hNews: found entry-title: '.$this->title);
451 // remove title from document 451 // remove title from document
452 $elems->item(0)->parentNode->removeChild($elems->item(0)); 452 $elems->item(0)->parentNode->removeChild($elems->item(0));
453 $detect_title = false; 453 $detect_title = false;
454 } 454 }
455 } 455 }
456 456
457 if ($detect_date) { 457 if ($detect_date) {
458 // check for time element with pubdate attribute 458 // check for time element with pubdate attribute
459 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); 459 $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
460 if ($elems && $elems->length > 0) { 460 if ($elems && $elems->length > 0) {
461 $this->date = strtotime(trim($elems->item(0)->textContent)); 461 $this->date = strtotime(trim($elems->item(0)->textContent));
462 // remove date from document 462 // remove date from document
463 //$elems->item(0)->parentNode->removeChild($elems->item(0)); 463 //$elems->item(0)->parentNode->removeChild($elems->item(0));
464 if ($this->date) { 464 if ($this->date) {
465 $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); 465 $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
466 $detect_date = false; 466 $detect_date = false;
467 } else { 467 } else {
468 $this->date = null; 468 $this->date = null;
469 } 469 }
470 } 470 }
471 } 471 }
472 472
473 if ($detect_author) { 473 if ($detect_author) {
474 // check for time element with pubdate attribute 474 // check for time element with pubdate attribute
475 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); 475 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
476 if ($elems && $elems->length > 0) { 476 if ($elems && $elems->length > 0) {
477 $author = $elems->item(0); 477 $author = $elems->item(0);
478 $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); 478 $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
479 if ($fn && $fn->length > 0) { 479 if ($fn && $fn->length > 0) {
480 foreach ($fn as $_fn) { 480 foreach ($fn as $_fn) {
481 if (trim($_fn->textContent) != '') { 481 if (trim($_fn->textContent) != '') {
482 $this->author[] = trim($_fn->textContent); 482 $this->author[] = trim($_fn->textContent);
483 $this->debug('hNews: found author: '.trim($_fn->textContent)); 483 $this->debug('hNews: found author: '.trim($_fn->textContent));
484 } 484 }
485 } 485 }
486 } else { 486 } else {
487 if (trim($author->textContent) != '') { 487 if (trim($author->textContent) != '') {
488 $this->author[] = trim($author->textContent); 488 $this->author[] = trim($author->textContent);
489 $this->debug('hNews: found author: '.trim($author->textContent)); 489 $this->debug('hNews: found author: '.trim($author->textContent));
490 } 490 }
491 } 491 }
492 $detect_author = empty($this->author); 492 $detect_author = empty($this->author);
493 } 493 }
494 } 494 }
495 495
496 // check for entry-content. 496 // check for entry-content.
497 // according to hAtom spec, if there are multiple elements marked entry-content, 497 // according to hAtom spec, if there are multiple elements marked entry-content,
498 // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content 498 // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
499 if ($detect_body) { 499 if ($detect_body) {
500 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); 500 $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
501 if ($elems && $elems->length > 0) { 501 if ($elems && $elems->length > 0) {
502 $this->debug('hNews: found entry-content'); 502 $this->debug('hNews: found entry-content');
503 if ($elems->length == 1) { 503 if ($elems->length == 1) {
504 // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) 504 // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
505 $e = $elems->item(0); 505 $e = $elems->item(0);
506 if (($e->tagName == 'img') || (trim($e->textContent) != '')) { 506 if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
507 $this->body = $elems->item(0); 507 $this->body = $elems->item(0);
508 // prune (clean up elements that may not be content) 508 // prune (clean up elements that may not be content)
509 if ($this->config->prune()) { 509 if ($this->config->prune()) {
510 $this->debug('Pruning content'); 510 $this->debug('Pruning content');
511 $this->readability->prepArticle($this->body); 511 $this->readability->prepArticle($this->body);
512 } 512 }
513 $detect_body = false; 513 $detect_body = false;
514 } else { 514 } else {
515 $this->debug('hNews: skipping entry-content - appears not to contain content'); 515 $this->debug('hNews: skipping entry-content - appears not to contain content');
516 } 516 }
517 unset($e); 517 unset($e);
518 } else { 518 } else {
519 $this->body = $this->readability->dom->createElement('div'); 519 $this->body = $this->readability->dom->createElement('div');
520 $this->debug($elems->length.' entry-content elems found'); 520 $this->debug($elems->length.' entry-content elems found');
521 foreach ($elems as $elem) { 521 foreach ($elems as $elem) {
522 if (!isset($elem->parentNode)) continue; 522 if (!isset($elem->parentNode)) continue;
523 $isDescendant = false; 523 $isDescendant = false;
524 foreach ($this->body->childNodes as $parent) { 524 foreach ($this->body->childNodes as $parent) {
525 if ($this->isDescendant($parent, $elem)) { 525 if ($this->isDescendant($parent, $elem)) {
526 $isDescendant = true; 526 $isDescendant = true;
527 break; 527 break;
528 } 528 }
529 } 529 }
530 if ($isDescendant) { 530 if ($isDescendant) {
531 $this->debug('Element is child of another body element, skipping.'); 531 $this->debug('Element is child of another body element, skipping.');
532 } else { 532 } else {
533 // prune (clean up elements that may not be content) 533 // prune (clean up elements that may not be content)
534 if ($this->config->prune()) { 534 if ($this->config->prune()) {
535 $this->debug('Pruning content'); 535 $this->debug('Pruning content');
536 $this->readability->prepArticle($elem); 536 $this->readability->prepArticle($elem);
537 } 537 }
538 $this->debug('Element added to body'); 538 $this->debug('Element added to body');
539 $this->body->appendChild($elem); 539 $this->body->appendChild($elem);
540 } 540 }
541 } 541 }
542 $detect_body = false; 542 $detect_body = false;
543 } 543 }
544 } 544 }
545 } 545 }
546 } 546 }
547 } 547 }
548 548
549 // check for elements marked with instapaper_title 549 // check for elements marked with instapaper_title
550 if ($detect_title) { 550 if ($detect_title) {
551 // check for instapaper_title 551 // check for instapaper_title
552 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); 552 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
553 if ($elems && $elems->length > 0) { 553 if ($elems && $elems->length > 0) {
554 $this->title = $elems->item(0)->textContent; 554 $this->title = $elems->item(0)->textContent;
555 $this->debug('Title found (.instapaper_title): '.$this->title); 555 $this->debug('Title found (.instapaper_title): '.$this->title);
556 // remove title from document 556 // remove title from document
557 $elems->item(0)->parentNode->removeChild($elems->item(0)); 557 $elems->item(0)->parentNode->removeChild($elems->item(0));
558 $detect_title = false; 558 $detect_title = false;
559 } 559 }
560 } 560 }
561 // check for elements marked with instapaper_body 561 // check for elements marked with instapaper_body
562 if ($detect_body) { 562 if ($detect_body) {
563 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); 563 $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
564 if ($elems && $elems->length > 0) { 564 if ($elems && $elems->length > 0) {
565 $this->debug('body found (.instapaper_body)'); 565 $this->debug('body found (.instapaper_body)');
566 $this->body = $elems->item(0); 566 $this->body = $elems->item(0);
567 // prune (clean up elements that may not be content) 567 // prune (clean up elements that may not be content)
568 if ($this->config->prune()) { 568 if ($this->config->prune()) {
569 $this->debug('Pruning content'); 569 $this->debug('Pruning content');
570 $this->readability->prepArticle($this->body); 570 $this->readability->prepArticle($this->body);
571 } 571 }
572 $detect_body = false; 572 $detect_body = false;
573 } 573 }
574 } 574 }
575 575
576 // Find author in rel="author" marked element 576 // Find author in rel="author" marked element
577 // We only use this if there's exactly one. 577 // We only use this if there's exactly one.
578 // If there's more than one, it could indicate more than 578 // If there's more than one, it could indicate more than
579 // one author, but it could also indicate that we're processing 579 // one author, but it could also indicate that we're processing
580 // a page listing different articles with different authors. 580 // a page listing different articles with different authors.
581 if ($detect_author) { 581 if ($detect_author) {
582 $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); 582 $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
583 if ($elems && $elems->length == 1) { 583 if ($elems && $elems->length == 1) {
584 $author = trim($elems->item(0)->textContent); 584 $author = trim($elems->item(0)->textContent);
585 if ($author != '') { 585 if ($author != '') {
586 $this->debug("Author found (rel=\"author\"): $author"); 586 $this->debug("Author found (rel=\"author\"): $author");
587 $this->author[] = $author; 587 $this->author[] = $author;
588 $detect_author = false; 588 $detect_author = false;
589 } 589 }
590 } 590 }
591 } 591 }
592 592
593 // Find date in pubdate marked time element 593 // Find date in pubdate marked time element
594 // For the same reason given above, we only use this 594 // For the same reason given above, we only use this
595 // if there's exactly one element. 595 // if there's exactly one element.
596 if ($detect_date) { 596 if ($detect_date) {
597 $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); 597 $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
598 if ($elems && $elems->length == 1) { 598 if ($elems && $elems->length == 1) {
599 $this->date = strtotime(trim($elems->item(0)->textContent)); 599 $this->date = strtotime(trim($elems->item(0)->textContent));
600 // remove date from document 600 // remove date from document
601 //$elems->item(0)->parentNode->removeChild($elems->item(0)); 601 //$elems->item(0)->parentNode->removeChild($elems->item(0));
602 if ($this->date) { 602 if ($this->date) {
603 $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); 603 $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
604 $detect_date = false; 604 $detect_date = false;
605 } else { 605 } else {
606 $this->date = null; 606 $this->date = null;
607 } 607 }
608 } 608 }
609 } 609 }
610 610
611 // still missing title or body, so we detect using Readability 611 // still missing title or body, so we detect using Readability
612 if ($detect_title || $detect_body) { 612 if ($detect_title || $detect_body) {
613 $this->debug('Using Readability'); 613 $this->debug('Using Readability');
614 // clone body if we're only using Readability for title (otherwise it may interfere with body element) 614 // clone body if we're only using Readability for title (otherwise it may interfere with body element)
615 if (isset($this->body)) $this->body = $this->body->cloneNode(true); 615 if (isset($this->body)) $this->body = $this->body->cloneNode(true);
616 $success = $this->readability->init(); 616 $success = $this->readability->init();
617 } 617 }
618 if ($detect_title) { 618 if ($detect_title) {
619 $this->debug('Detecting title'); 619 $this->debug('Detecting title');
620 $this->title = $this->readability->getTitle()->textContent; 620 $this->title = $this->readability->getTitle()->textContent;
621 } 621 }
622 if ($detect_body && $success) { 622 if ($detect_body && $success) {
623 $this->debug('Detecting body'); 623 $this->debug('Detecting body');
624 $this->body = $this->readability->getContent(); 624 $this->body = $this->readability->getContent();
625 if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { 625 if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
626 $this->body = $this->body->firstChild; 626 $this->body = $this->body->firstChild;
627 } 627 }
628 // prune (clean up elements that may not be content) 628 // prune (clean up elements that may not be content)
629 if ($this->config->prune()) { 629 if ($this->config->prune()) {
630 $this->debug('Pruning content'); 630 $this->debug('Pruning content');
631 $this->readability->prepArticle($this->body); 631 $this->readability->prepArticle($this->body);
632 } 632 }
633 } 633 }
634 if (isset($this->body)) { 634 if (isset($this->body)) {
635 // remove scripts 635 // remove scripts
636 $this->readability->removeScripts($this->body); 636 $this->readability->removeScripts($this->body);
637 // remove any h1-h6 elements that appear as first thing in the body 637 // remove any h1-h6 elements that appear as first thing in the body
638 // and which match our title 638 // and which match our title
639 if (isset($this->title) && ($this->title != '')) { 639 if (isset($this->title) && ($this->title != '')) {
640 $firstChild = $this->body->firstChild; 640 $firstChild = $this->body->firstChild;
641 while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { 641 while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
642 $firstChild = $firstChild->nextSibling; 642 $firstChild = $firstChild->nextSibling;
643 } 643 }
644 if (($firstChild->nodeType === XML_ELEMENT_NODE) 644 if (($firstChild->nodeType === XML_ELEMENT_NODE)
645 && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) 645 && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
646 && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { 646 && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {
647 $this->body->removeChild($firstChild); 647 $this->body->removeChild($firstChild);
648 } 648 }
649 } 649 }
650 // prevent self-closing iframes 650 // prevent self-closing iframes
651 $elems = $this->body->getElementsByTagName('iframe'); 651 $elems = $this->body->getElementsByTagName('iframe');
652 for ($i = $elems->length-1; $i >= 0; $i--) { 652 for ($i = $elems->length-1; $i >= 0; $i--) {
653 $e = $elems->item($i); 653 $e = $elems->item($i);
654 if (!$e->hasChildNodes()) { 654 if (!$e->hasChildNodes()) {
655 $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); 655 $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
656 } 656 }
657 } 657 }
658 // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ 658 // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
659 // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src 659 // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
660 // inside the data-lazy-src attribute. It also places the original image inside a noscript element 660 // inside the data-lazy-src attribute. It also places the original image inside a noscript element
661 // next to the amended one. 661 // next to the amended one.
662 $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); 662 $elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
663 for ($i = $elems->length-1; $i >= 0; $i--) { 663 for ($i = $elems->length-1; $i >= 0; $i--) {
664 $e = $elems->item($i); 664 $e = $elems->item($i);
665 // let's see if we can grab image from noscript 665 // let's see if we can grab image from noscript
666 if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { 666 if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
667 $_new_elem = $e->ownerDocument->createDocumentFragment(); 667 $_new_elem = $e->ownerDocument->createDocumentFragment();
668 @$_new_elem->appendXML($e->nextSibling->innerHTML); 668 @$_new_elem->appendXML($e->nextSibling->innerHTML);
669 $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); 669 $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
670 $e->parentNode->removeChild($e); 670 $e->parentNode->removeChild($e);
671 } else { 671 } else {
672 // Use data-lazy-src as src value 672 // Use data-lazy-src as src value
673 $e->setAttribute('src', $e->getAttribute('data-lazy-src')); 673 $e->setAttribute('src', $e->getAttribute('data-lazy-src'));
674 $e->removeAttribute('data-lazy-src'); 674 $e->removeAttribute('data-lazy-src');
675 } 675 }
676 } 676 }
677 677
678 $this->success = true; 678 $this->success = true;
679 } 679 }
680 680
681 // if we've had no success and we've used tidy, there's a chance 681 // if we've had no success and we've used tidy, there's a chance
682 // that tidy has messed up. So let's try again without tidy... 682 // that tidy has messed up. So let's try again without tidy...
683 if (!$this->success && $tidied && $smart_tidy) { 683 if (!$this->success && $tidied && $smart_tidy) {
684 $this->debug('Trying again without tidy'); 684 $this->debug('Trying again without tidy');
685 $this->process($original_html, $url, false); 685 $this->process($original_html, $url, false);
686 } 686 }
687 687
688 return $this->success; 688 return $this->success;
689 } 689 }
690 690
691 private function isDescendant(DOMElement $parent, DOMElement $child) { 691 private function isDescendant(DOMElement $parent, DOMElement $child) {
692 $node = $child->parentNode; 692 $node = $child->parentNode;
693 while ($node != null) { 693 while ($node != null) {
694 if ($node->isSameNode($parent)) return true; 694 if ($node->isSameNode($parent)) return true;
695 $node = $node->parentNode; 695 $node = $node->parentNode;
696 } 696 }
697 return false; 697 return false;
698 } 698 }
699 699
700 public function getContent() { 700 public function getContent() {
701 return $this->body; 701 return $this->body;
702 } 702 }
703 703
704 public function getTitle() { 704 public function getTitle() {
705 return $this->title; 705 return $this->title;
706 } 706 }
707 707
708 public function getAuthors() { 708 public function getAuthors() {
709 return $this->author; 709 return $this->author;
710 } 710 }
711 711
712 public function getLanguage() { 712 public function getLanguage() {
713 return $this->language; 713 return $this->language;
714 } 714 }
715 715
716 public function getDate() { 716 public function getDate() {
717 return $this->date; 717 return $this->date;
718 } 718 }
719 719
720 public function getSiteConfig() { 720 public function getSiteConfig() {
721 return $this->config; 721 return $this->config;
722 } 722 }
723 723
724 public function getNextPageUrl() { 724 public function getNextPageUrl() {
725 return $this->nextPageUrl; 725 return $this->nextPageUrl;
726 } 726 }
727} 727} \ No newline at end of file
728?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
index c5e300d7..1f6a7603 100644
--- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php
+++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
@@ -1,338 +1,343 @@
1<?php 1<?php
2/** 2/**
3 * Site Config 3 * Site Config
4 * 4 *
5 * Each instance of this class should hold extraction patterns and other directives 5 * Each instance of this class should hold extraction patterns and other directives
6 * for a website. See ContentExtractor class to see how it's used. 6 * for a website. See ContentExtractor class to see how it's used.
7 * 7 *
8 * @version 0.7 8 * @version 0.8
9 * @date 2012-08-27 9 * @date 2013-04-16
10 * @author Keyvan Minoukadeh 10 * @author Keyvan Minoukadeh
11 * @copyright 2012 Keyvan Minoukadeh 11 * @copyright 2013 Keyvan Minoukadeh
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
13 */ 13 */
14 14
15class SiteConfig 15class SiteConfig
16{ 16{
17 // Use first matching element as title (0 or more xpath expressions) 17 // Use first matching element as title (0 or more xpath expressions)
18 public $title = array(); 18 public $title = array();
19 19
20 // Use first matching element as body (0 or more xpath expressions) 20 // Use first matching element as body (0 or more xpath expressions)
21 public $body = array(); 21 public $body = array();
22 22
23 // Use first matching element as author (0 or more xpath expressions) 23 // Use first matching element as author (0 or more xpath expressions)
24 public $author = array(); 24 public $author = array();
25 25
26 // Use first matching element as date (0 or more xpath expressions) 26 // Use first matching element as date (0 or more xpath expressions)
27 public $date = array(); 27 public $date = array();
28 28
29 // Strip elements matching these xpath expressions (0 or more) 29 // Strip elements matching these xpath expressions (0 or more)
30 public $strip = array(); 30 public $strip = array();
31 31
32 // Strip elements which contain these strings (0 or more) in the id or class attribute 32 // Strip elements which contain these strings (0 or more) in the id or class attribute
33 public $strip_id_or_class = array(); 33 public $strip_id_or_class = array();
34 34
35 // Strip images which contain these strings (0 or more) in the src attribute 35 // Strip images which contain these strings (0 or more) in the src attribute
36 public $strip_image_src = array(); 36 public $strip_image_src = array();
37 37
38 // Additional HTTP headers to send 38 // Additional HTTP headers to send
39 // NOT YET USED 39 // NOT YET USED
40 public $http_header = array(); 40 public $http_header = array();
41 41
42 // Process HTML with tidy before creating DOM (bool or null if undeclared) 42 // Process HTML with tidy before creating DOM (bool or null if undeclared)
43 public $tidy = null; 43 public $tidy = null;
44 44
45 protected $default_tidy = true; // used if undeclared 45 protected $default_tidy = true; // used if undeclared
46 46
47 // Autodetect title/body if xpath expressions fail to produce results. 47 // Autodetect title/body if xpath expressions fail to produce results.
48 // Note that this applies to title and body separately, ie. 48 // Note that this applies to title and body separately, ie.
49 // * if we get a body match but no title match, this option will determine whether we autodetect title 49 // * if we get a body match but no title match, this option will determine whether we autodetect title
50 // * if neither match, this determines whether we autodetect title and body. 50 // * if neither match, this determines whether we autodetect title and body.
51 // Also note that this only applies when there is at least one xpath expression in title or body, ie. 51 // Also note that this only applies when there is at least one xpath expression in title or body, ie.
52 // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) 52 // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
53 // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. 53 // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
54 // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). 54 // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
55 // bool or null if undeclared 55 // bool or null if undeclared
56 public $autodetect_on_failure = null; 56 public $autodetect_on_failure = null;
57 protected $default_autodetect_on_failure = true; // used if undeclared 57 protected $default_autodetect_on_failure = true; // used if undeclared
58 58
59 // Clean up content block - attempt to remove elements that appear to be superfluous 59 // Clean up content block - attempt to remove elements that appear to be superfluous
60 // bool or null if undeclared 60 // bool or null if undeclared
61 public $prune = null; 61 public $prune = null;
62 protected $default_prune = true; // used if undeclared 62 protected $default_prune = true; // used if undeclared
63 63
64 // Test URL - if present, can be used to test the config above 64 // Test URL - if present, can be used to test the config above
65 public $test_url = array(); 65 public $test_url = array();
66 66
67 // Single-page link - should identify a link element or URL pointing to the page holding the entire article 67 // Single-page link - should identify a link element or URL pointing to the page holding the entire article
68 // This is useful for sites which split their articles across multiple pages. Links to such pages tend to 68 // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
69 // display the first page with links to the other pages at the bottom. Often there is also a link to a page 69 // display the first page with links to the other pages at the bottom. Often there is also a link to a page
70 // which displays the entire article on one page (e.g. 'print view'). 70 // which displays the entire article on one page (e.g. 'print view').
71 // This should be an XPath expression identifying the link to that page. If present and we find a match, 71 // This should be an XPath expression identifying the link to that page. If present and we find a match,
72 // we will retrieve that page and the rest of the options in this config will be applied to the new page. 72 // we will retrieve that page and the rest of the options in this config will be applied to the new page.
73 public $single_page_link = array(); 73 public $single_page_link = array();
74 74
75 public $next_page_link = array(); 75 public $next_page_link = array();
76 76
77 // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed 77 // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
78 public $single_page_link_in_feed = array(); 78 public $single_page_link_in_feed = array();
79 79
80 // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') 80 // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
81 // string or null if undeclared 81 // string or null if undeclared
82 public $parser = null; 82 public $parser = null;
83 protected $default_parser = 'libxml'; // used if undeclared 83 protected $default_parser = 'libxml'; // used if undeclared
84 84
85 // Strings to search for in HTML before processing begins (used with $replace_string) 85 // Strings to search for in HTML before processing begins (used with $replace_string)
86 public $find_string = array(); 86 public $find_string = array();
87 // Strings to replace those found in $find_string before HTML processing begins 87 // Strings to replace those found in $find_string before HTML processing begins
88 public $replace_string = array(); 88 public $replace_string = array();
89 89
90 // the options below cannot be set in the config files which this class represents 90 // the options below cannot be set in the config files which this class represents
91 91
92 //public $cache_in_apc = false; // used to decide if we should cache in apc or not 92 //public $cache_in_apc = false; // used to decide if we should cache in apc or not
93 public $cache_key = null; 93 public $cache_key = null;
94 public static $debug = false; 94 public static $debug = false;
95 protected static $apc = false; 95 protected static $apc = false;
96 protected static $config_path; 96 protected static $config_path;
97 protected static $config_path_fallback; 97 protected static $config_path_fallback;
98 protected static $config_cache = array(); 98 protected static $config_cache = array();
99 const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; 99 const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
100 100
101 protected static function debug($msg) { 101 protected static function debug($msg) {
102 if (self::$debug) { 102 if (self::$debug) {
103 //$mem = round(memory_get_usage()/1024, 2); 103 //$mem = round(memory_get_usage()/1024, 2);
104 //$memPeak = round(memory_get_peak_usage()/1024, 2); 104 //$memPeak = round(memory_get_peak_usage()/1024, 2);
105 echo '* ',$msg; 105 echo '* ',$msg;
106 //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; 106 //echo ' - mem used: ',$mem," (peak: $memPeak)\n";
107 echo "\n"; 107 echo "\n";
108 ob_flush(); 108 ob_flush();
109 flush(); 109 flush();
110 } 110 }
111 } 111 }
112 112
113 // enable APC caching of certain site config files? 113 // enable APC caching of certain site config files?
114 // If enabled the following site config files will be 114 // If enabled the following site config files will be
115 // cached in APC cache (when requested for first time): 115 // cached in APC cache (when requested for first time):
116 // * anything in site_config/custom/ and its corresponding file in site_config/standard/ 116 // * anything in site_config/custom/ and its corresponding file in site_config/standard/
117 // * the site config files associated with HTML fingerprints 117 // * the site config files associated with HTML fingerprints
118 // * the global site config file 118 // * the global site config file
119 // returns true if enabled, false otherwise 119 // returns true if enabled, false otherwise
120 public static function use_apc($apc=true) { 120 public static function use_apc($apc=true) {
121 if (!function_exists('apc_add')) { 121 if (!function_exists('apc_add')) {
122 if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); 122 if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
123 return false; 123 return false;
124 } 124 }
125 self::$apc = $apc; 125 self::$apc = $apc;
126 return $apc; 126 return $apc;
127 } 127 }
128 128
129 // return bool or null 129 // return bool or null
130 public function tidy($use_default=true) { 130 public function tidy($use_default=true) {
131 if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; 131 if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
132 return $this->tidy; 132 return $this->tidy;
133 } 133 }
134 134
135 // return bool or null 135 // return bool or null
136 public function prune($use_default=true) { 136 public function prune($use_default=true) {
137 if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; 137 if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;
138 return $this->prune; 138 return $this->prune;
139 } 139 }
140 140
141 // return string or null 141 // return string or null
142 public function parser($use_default=true) { 142 public function parser($use_default=true) {
143 if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; 143 if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;
144 return $this->parser; 144 return $this->parser;
145 } 145 }
146 146
147 // return bool or null 147 // return bool or null
148 public function autodetect_on_failure($use_default=true) { 148 public function autodetect_on_failure($use_default=true) {
149 if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; 149 if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;
150 return $this->autodetect_on_failure; 150 return $this->autodetect_on_failure;
151 } 151 }
152 152
153 public static function set_config_path($path, $fallback=null) { 153 public static function set_config_path($path, $fallback=null) {
154 self::$config_path = $path; 154 self::$config_path = $path;
155 self::$config_path_fallback = $fallback; 155 self::$config_path_fallback = $fallback;
156 } 156 }
157 157
158 public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { 158 public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
159 $key = strtolower($key); 159 $key = strtolower($key);
160 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); 160 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
161 if ($config->cache_key) $key = $config->cache_key; 161 if ($config->cache_key) $key = $config->cache_key;
162 self::$config_cache[$key] = $config; 162 self::$config_cache[$key] = $config;
163 if (self::$apc && $use_apc) { 163 if (self::$apc && $use_apc) {
164 self::debug("Adding site config to APC cache with key sc.$key"); 164 self::debug("Adding site config to APC cache with key sc.$key");
165 apc_add("sc.$key", $config); 165 apc_add("sc.$key", $config);
166 } 166 }
167 self::debug("Cached site config with key $key"); 167 self::debug("Cached site config with key $key");
168 } 168 }
169 169
170 public static function is_cached($key) { 170 public static function is_cached($key) {
171 $key = strtolower($key); 171 $key = strtolower($key);
172 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); 172 if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
173 if (array_key_exists($key, self::$config_cache)) { 173 if (array_key_exists($key, self::$config_cache)) {
174 return true; 174 return true;
175 } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { 175 } elseif (self::$apc && (bool)apc_fetch("sc.$key")) {
176 return true; 176 return true;
177 } 177 }
178 return false; 178 return false;
179 } 179 }
180 180
181 public function append(SiteConfig $newconfig) { 181 public function append(SiteConfig $newconfig) {
182 // check for commands where we accept multiple statements (no test_url) 182 // check for commands where we accept multiple statements (no test_url)
183 foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { 183 foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
184 // append array elements for this config variable from $newconfig to this config 184 // append array elements for this config variable from $newconfig to this config
185 //$this->$var = $this->$var + $newconfig->$var; 185 //$this->$var = $this->$var + $newconfig->$var;
186 $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); 186 $this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
187 } 187 }
188 // check for single statement commands 188 // check for single statement commands
189 // we do not overwrite existing non null values 189 // we do not overwrite existing non null values
190 foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { 190 foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
191 if ($this->$var === null) $this->$var = $newconfig->$var; 191 if ($this->$var === null) $this->$var = $newconfig->$var;
192 } 192 }
193 } 193 // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
194 194 foreach (array('find_string', 'replace_string') as $var) {
195 // returns SiteConfig instance if an appropriate one is found, false otherwise 195 // append array elements for this config variable from $newconfig to this config
196 // if $exact_host_match is true, we will not look for wildcard config matches 196 //$this->$var = $this->$var + $newconfig->$var;
197 // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists 197 $this->$var = array_merge($this->$var, $newconfig->$var);
198 public static function build($host, $exact_host_match=false) { 198 }
199 $host = strtolower($host); 199 }
200 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); 200
201 if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; 201 // returns SiteConfig instance if an appropriate one is found, false otherwise
202 // check for site configuration 202 // if $exact_host_match is true, we will not look for wildcard config matches
203 $try = array($host); 203 // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
204 // should we look for wildcard matches 204 public static function build($host, $exact_host_match=false) {
205 if (!$exact_host_match) { 205 $host = strtolower($host);
206 $split = explode('.', $host); 206 if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
207 if (count($split) > 1) { 207 if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
208 array_shift($split); 208 // check for site configuration
209 $try[] = '.'.implode('.', $split); 209 $try = array($host);
210 } 210 // should we look for wildcard matches
211 } 211 if (!$exact_host_match) {
212 212 $split = explode('.', $host);
213 // look for site config file in primary folder 213 if (count($split) > 1) {
214 self::debug(". looking for site config for $host in primary folder"); 214 array_shift($split);
215 foreach ($try as $h) { 215 $try[] = '.'.implode('.', $split);
216 if (array_key_exists($h, self::$config_cache)) { 216 }
217 self::debug("... site config for $h already loaded in this request"); 217 }
218 return self::$config_cache[$h]; 218
219 } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { 219 // look for site config file in primary folder
220 self::debug("... site config for $h in APC cache"); 220 self::debug(". looking for site config for $host in primary folder");
221 return $sconfig; 221 foreach ($try as $h) {
222 } elseif (file_exists(self::$config_path."/$h.txt")) { 222 if (array_key_exists($h, self::$config_cache)) {
223 self::debug("... found site config ($h.txt)"); 223 self::debug("... site config for $h already loaded in this request");
224 $file_primary = self::$config_path."/$h.txt"; 224 return self::$config_cache[$h];
225 $matched_name = $h; 225 } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
226 break; 226 self::debug("... site config for $h in APC cache");
227 } 227 return $sconfig;
228 } 228 } elseif (file_exists(self::$config_path."/$h.txt")) {
229 229 self::debug("... found site config ($h.txt)");
230 // if we found site config, process it 230 $file_primary = self::$config_path."/$h.txt";
231 if (isset($file_primary)) { 231 $matched_name = $h;
232 $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 232 break;
233 if (!$config_lines || !is_array($config_lines)) return false; 233 }
234 $config = self::build_from_array($config_lines); 234 }
235 // if APC caching is available and enabled, mark this for cache 235
236 //$config->cache_in_apc = true; 236 // if we found site config, process it
237 $config->cache_key = $matched_name; 237 if (isset($file_primary)) {
238 238 $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
239 // if autodetec on failure is off (on by default) we do not need to look 239 if (!$config_lines || !is_array($config_lines)) return false;
240 // in secondary folder 240 $config = self::build_from_array($config_lines);
241 if (!$config->autodetect_on_failure()) { 241 // if APC caching is available and enabled, mark this for cache
242 self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); 242 //$config->cache_in_apc = true;
243 return $config; 243 $config->cache_key = $matched_name;
244 } 244
245 } 245 // if autodetec on failure is off (on by default) we do not need to look
246 246 // in secondary folder
247 // look for site config file in secondary folder 247 if (!$config->autodetect_on_failure()) {
248 if (isset(self::$config_path_fallback)) { 248 self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
249 self::debug(". looking for site config for $host in secondary folder"); 249 return $config;
250 foreach ($try as $h) { 250 }
251 if (file_exists(self::$config_path_fallback."/$h.txt")) { 251 }
252 self::debug("... found site config in secondary folder ($h.txt)"); 252
253 $file_secondary = self::$config_path_fallback."/$h.txt"; 253 // look for site config file in secondary folder
254 $matched_name = $h; 254 if (isset(self::$config_path_fallback)) {
255 break; 255 self::debug(". looking for site config for $host in secondary folder");
256 } 256 foreach ($try as $h) {
257 } 257 if (file_exists(self::$config_path_fallback."/$h.txt")) {
258 if (!isset($file_secondary)) { 258 self::debug("... found site config in secondary folder ($h.txt)");
259 self::debug("... no site config match in secondary folder"); 259 $file_secondary = self::$config_path_fallback."/$h.txt";
260 } 260 $matched_name = $h;
261 } 261 break;
262 262 }
263 // return false if no config file found 263 }
264 if (!isset($file_primary) && !isset($file_secondary)) { 264 if (!isset($file_secondary)) {
265 self::debug("... no site config match for $host"); 265 self::debug("... no site config match in secondary folder");
266 return false; 266 }
267 } 267 }
268 268
269 // return primary config if secondary not found 269 // return false if no config file found
270 if (!isset($file_secondary) && isset($config)) { 270 if (!isset($file_primary) && !isset($file_secondary)) {
271 return $config; 271 self::debug("... no site config match for $host");
272 } 272 return false;
273 273 }
274 // process secondary config file 274
275 $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); 275 // return primary config if secondary not found
276 if (!$config_lines || !is_array($config_lines)) { 276 if (!isset($file_secondary) && isset($config)) {
277 // failed to process secondary 277 return $config;
278 if (isset($config)) { 278 }
279 // return primary config 279
280 return $config; 280 // process secondary config file
281 } else { 281 $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
282 return false; 282 if (!$config_lines || !is_array($config_lines)) {
283 } 283 // failed to process secondary
284 } 284 if (isset($config)) {
285 285 // return primary config
286 // merge with primary and return 286 return $config;
287 if (isset($config)) { 287 } else {
288 self::debug('. merging config files'); 288 return false;
289 $config->append(self::build_from_array($config_lines)); 289 }
290 return $config; 290 }
291 } else { 291
292 // return just secondary 292 // merge with primary and return
293 $config = self::build_from_array($config_lines); 293 if (isset($config)) {
294 // if APC caching is available and enabled, mark this for cache 294 self::debug('. merging config files');
295 //$config->cache_in_apc = true; 295 $config->append(self::build_from_array($config_lines));
296 $config->cache_key = $matched_name; 296 return $config;
297 return $config; 297 } else {
298 } 298 // return just secondary
299 } 299 $config = self::build_from_array($config_lines);
300 300 // if APC caching is available and enabled, mark this for cache
301 public static function build_from_array(array $lines) { 301 //$config->cache_in_apc = true;
302 $config = new SiteConfig(); 302 $config->cache_key = $matched_name;
303 foreach ($lines as $line) { 303 return $config;
304 $line = trim($line); 304 }
305 305 }
306 // skip comments, empty lines 306
307 if ($line == '' || $line[0] == '#') continue; 307 public static function build_from_array(array $lines) {
308 308 $config = new SiteConfig();
309 // get command 309 foreach ($lines as $line) {
310 $command = explode(':', $line, 2); 310 $line = trim($line);
311 // if there's no colon ':', skip this line 311
312 if (count($command) != 2) continue; 312 // skip comments, empty lines
313 $val = trim($command[1]); 313 if ($line == '' || $line[0] == '#') continue;
314 $command = trim($command[0]); 314
315 if ($command == '' || $val == '') continue; 315 // get command
316 316 $command = explode(':', $line, 2);
317 // check for commands where we accept multiple statements 317 // if there's no colon ':', skip this line
318 if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { 318 if (count($command) != 2) continue;
319 array_push($config->$command, $val); 319 $val = trim($command[1]);
320 // check for single statement commands that evaluate to true or false 320 $command = trim($command[0]);
321 } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { 321 if ($command == '' || $val == '') continue;
322 $config->$command = ($val == 'yes'); 322
323 // check for single statement commands stored as strings 323 // check for commands where we accept multiple statements
324 } elseif (in_array($command, array('parser'))) { 324 if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
325 $config->$command = $val; 325 array_push($config->$command, $val);
326 // check for replace_string(find): replace 326 // check for single statement commands that evaluate to true or false
327 } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { 327 } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
328 if (in_array($match[1], array('replace_string'))) { 328 $config->$command = ($val == 'yes');
329 $command = $match[1]; 329 // check for single statement commands stored as strings
330 array_push($config->find_string, $match[2]); 330 } elseif (in_array($command, array('parser'))) {
331 array_push($config->$command, $val); 331 $config->$command = $val;
332 } 332 // check for replace_string(find): replace
333 } 333 } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
334 } 334 if (in_array($match[1], array('replace_string'))) {
335 return $config; 335 $command = $match[1];
336 } 336 array_push($config->find_string, $match[2]);
337} 337 array_push($config->$command, $val);
338?> \ No newline at end of file 338 }
339 }
340 }
341 return $config;
342 }
343} \ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php
index 9373deeb..40786598 100644..100755
--- a/inc/3rdparty/libraries/feedwriter/FeedItem.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php
@@ -1,7 +1,7 @@
1<?php 1<?php
2 /** 2 /**
3 * Univarsel Feed Writer 3 * Univarsel Feed Writer
4 * 4 *
5 * FeedItem class - Used as feed element in FeedWriter class 5 * FeedItem class - Used as feed element in FeedWriter class
6 * 6 *
7 * @package UnivarselFeedWriter 7 * @package UnivarselFeedWriter
@@ -10,176 +10,195 @@
10 */ 10 */
11 class FeedItem 11 class FeedItem
12 { 12 {
13 private $elements = array(); //Collection of feed elements 13 private $elements = array(); //Collection of feed elements
14 private $version; 14 private $version;
15 15
16 /** 16 /**
17 * Constructor 17 * Constructor
18 * 18 *
19 * @param contant (RSS1/RSS2/ATOM) RSS2 is default. 19 * @param contant (RSS1/RSS2/ATOM) RSS2 is default.
20 */ 20 */
21 function __construct($version = RSS2) 21 function __construct($version = RSS2)
22 { 22 {
23 $this->version = $version; 23 $this->version = $version;
24 } 24 }
25 25
26 /** 26 /**
27 * Set element (overwrites existing elements with $elementName) 27 * Set element (overwrites existing elements with $elementName)
28 * 28 *
29 * @access public 29 * @access public
30 * @param srting The tag name of an element 30 * @param srting The tag name of an element
31 * @param srting The content of tag 31 * @param srting The content of tag
32 * @param array Attributes(if any) in 'attrName' => 'attrValue' format 32 * @param array Attributes(if any) in 'attrName' => 'attrValue' format
33 * @return void 33 * @return void
34 */ 34 */
35 public function setElement($elementName, $content, $attributes = null) 35 public function setElement($elementName, $content, $attributes = null)
36 { 36 {
37 if (isset($this->elements[$elementName])) { 37 if (isset($this->elements[$elementName])) {
38 unset($this->elements[$elementName]); 38 unset($this->elements[$elementName]);
39 } 39 }
40 $this->addElement($elementName, $content, $attributes); 40 $this->addElement($elementName, $content, $attributes);
41 } 41 }
42 42
43 /** 43 /**
44 * Add an element to elements array 44 * Add an element to elements array
45 * 45 *
46 * @access public 46 * @access public
47 * @param srting The tag name of an element 47 * @param srting The tag name of an element
48 * @param srting The content of tag 48 * @param srting The content of tag
49 * @param array Attributes(if any) in 'attrName' => 'attrValue' format 49 * @param array Attributes(if any) in 'attrName' => 'attrValue' format
50 * @return void 50 * @return void
51 */ 51 */
52 public function addElement($elementName, $content, $attributes = null) 52 public function addElement($elementName, $content, $attributes = null)
53 { 53 {
54 $i = 0; 54 $i = 0;
55 if (isset($this->elements[$elementName])) { 55 if (isset($this->elements[$elementName])) {
56 $i = count($this->elements[$elementName]); 56 $i = count($this->elements[$elementName]);
57 } else { 57 } else {
58 $this->elements[$elementName] = array(); 58 $this->elements[$elementName] = array();
59 } 59 }
60 $this->elements[$elementName][$i]['name'] = $elementName; 60 $this->elements[$elementName][$i]['name'] = $elementName;
61 $this->elements[$elementName][$i]['content'] = $content; 61 $this->elements[$elementName][$i]['content'] = $content;
62 $this->elements[$elementName][$i]['attributes'] = $attributes; 62 $this->elements[$elementName][$i]['attributes'] = $attributes;
63 } 63 }
64 64
65 /** 65 /**
66 * Set multiple feed elements from an array. 66 * Set multiple feed elements from an array.
67 * Elements which have attributes cannot be added by this method 67 * Elements which have attributes cannot be added by this method
68 * 68 *
69 * @access public 69 * @access public
70 * @param array array of elements in 'tagName' => 'tagContent' format. 70 * @param array array of elements in 'tagName' => 'tagContent' format.
71 * @return void 71 * @return void
72 */ 72 */
73 public function addElementArray($elementArray) 73 public function addElementArray($elementArray)
74 { 74 {
75 if(! is_array($elementArray)) return; 75 if(! is_array($elementArray)) return;
76 foreach ($elementArray as $elementName => $content) 76 foreach ($elementArray as $elementName => $content)
77 { 77 {
78 $this->addElement($elementName, $content); 78 $this->addElement($elementName, $content);
79 } 79 }
80 } 80 }
81 81
82 /** 82 /**
83 * Return the collection of elements in this feed item 83 * Return the collection of elements in this feed item
84 * 84 *
85 * @access public 85 * @access public
86 * @return array 86 * @return array
87 */ 87 */
88 public function getElements() 88 public function getElements()
89 { 89 {
90 return $this->elements; 90 return $this->elements;
91 } 91 }
92 92
93 // Wrapper functions ------------------------------------------------------ 93 // Wrapper functions ------------------------------------------------------
94 94
95 /** 95 /**
96 * Set the 'dscription' element of feed item 96 * Set the 'dscription' element of feed item
97 * 97 *
98 * @access public 98 * @access public
99 * @param string The content of 'description' element 99 * @param string The content of 'description' element
100 * @return void 100 * @return void
101 */ 101 */
102 public function setDescription($description) 102 public function setDescription($description)
103 { 103 {
104 $tag = 'description'; 104 $tag = ($this->version == ATOM)? 'summary' : 'description';
105 $this->setElement($tag, $description); 105 $this->setElement($tag, $description);
106 } 106 }
107 107
108 /** 108 /**
109 * @desc Set the 'title' element of feed item 109 * @desc Set the 'title' element of feed item
110 * @access public 110 * @access public
111 * @param string The content of 'title' element 111 * @param string The content of 'title' element
112 * @return void 112 * @return void
113 */ 113 */
114 public function setTitle($title) 114 public function setTitle($title)
115 { 115 {
116 $this->setElement('title', $title); 116 $this->setElement('title', $title);
117 } 117 }
118 118
119 /** 119 /**
120 * Set the 'date' element of feed item 120 * Set the 'date' element of feed item
121 * 121 *
122 * @access public 122 * @access public
123 * @param string The content of 'date' element 123 * @param string The content of 'date' element
124 * @return void 124 * @return void
125 */ 125 */
126 public function setDate($date) 126 public function setDate($date)
127 { 127 {
128 if(! is_numeric($date)) 128 if(! is_numeric($date))
129 { 129 {
130 $date = strtotime($date); 130 $date = strtotime($date);
131 } 131 }
132 132
133 if($this->version == RSS2) 133 if($this->version == ATOM)
134 { 134 {
135 $tag = 'pubDate'; 135 $tag = 'updated';
136 $value = date(DATE_RSS, $date); 136 $value = date(DATE_ATOM, $date);
137 } 137 }
138 else 138 elseif($this->version == RSS2)
139 { 139 {
140 $tag = 'dc:date'; 140 $tag = 'pubDate';
141 $value = date("Y-m-d", $date); 141 $value = date(DATE_RSS, $date);
142 } 142 }
143 143 else
144 $this->setElement($tag, $value); 144 {
145 } 145 $tag = 'dc:date';
146 146 $value = date("Y-m-d", $date);
147 /** 147 }
148 * Set the 'link' element of feed item 148
149 * 149 $this->setElement($tag, $value);
150 * @access public 150 }
151 * @param string The content of 'link' element 151
152 * @return void 152 /**
153 */ 153 * Set the 'link' element of feed item
154 public function setLink($link) 154 *
155 { 155 * @access public
156 if($this->version == RSS2 || $this->version == RSS1) 156 * @param string The content of 'link' element
157 { 157 * @return void
158 $this->setElement('link', $link); 158 */
159 } 159 public function setLink($link)
160 else 160 {
161 { 161 if($this->version == RSS2 || $this->version == RSS1)
162 $this->setElement('link','',array('href'=>$link)); 162 {
163 $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); 163 $this->setElement('link', $link);
164 } 164 $this->setElement('guid', $link);
165 165 }
166 } 166 else
167 167 {
168 /** 168 $this->setElement('link','',array('href'=>$link));
169 * Set the 'encloser' element of feed item 169 $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:'));
170 * For RSS 2.0 only 170 }
171 * 171
172 * @access public 172 }
173 * @param string The url attribute of encloser tag 173
174 * @param string The length attribute of encloser tag 174 /**
175 * @param string The type attribute of encloser tag 175 * Set the 'source' element of feed item
176 * @return void 176 *
177 */ 177 * @access public
178 public function setEncloser($url, $length, $type) 178 * @param string The content of 'source' element
179 { 179 * @return void
180 $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); 180 */
181 $this->setElement('enclosure','',$attributes); 181 public function setSource($link)
182 } 182 {
183 183 $attributes = array('url'=>$link);
184 $this->setElement('source', "wallabag",$attributes);
185 }
186
187 /**
188 * Set the 'encloser' element of feed item
189 * For RSS 2.0 only
190 *
191 * @access public
192 * @param string The url attribute of encloser tag
193 * @param string The length attribute of encloser tag
194 * @param string The type attribute of encloser tag
195 * @return void
196 */
197 public function setEncloser($url, $length, $type)
198 {
199 $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type);
200 $this->setElement('enclosure','',$attributes);
201 }
202
184 } // end of class FeedItem 203 } // end of class FeedItem
185?> 204?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
index adb2526c..9446cddf 100644..100755
--- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
@@ -2,6 +2,7 @@
2define('RSS2', 1, true); 2define('RSS2', 1, true);
3define('JSON', 2, true); 3define('JSON', 2, true);
4define('JSONP', 3, true); 4define('JSONP', 3, true);
5define('ATOM', 4, true);
5 6
6 /** 7 /**
7 * Univarsel Feed Writer class 8 * Univarsel Feed Writer class
@@ -9,433 +10,444 @@ define('JSONP', 3, true);
9 * Genarate RSS2 or JSON (original: RSS 1.0, RSS2.0 and ATOM Feed) 10 * Genarate RSS2 or JSON (original: RSS 1.0, RSS2.0 and ATOM Feed)
10 * 11 *
11 * Modified for FiveFilters.org's Full-Text RSS project 12 * Modified for FiveFilters.org's Full-Text RSS project
12 * to allow for inclusion of hubs, JSON output. 13 * to allow for inclusion of hubs, JSON output.
13 * Stripped RSS1 and ATOM support. 14 * Stripped RSS1 and ATOM support.
14 * 15 *
15 * @package UnivarselFeedWriter 16 * @package UnivarselFeedWriter
16 * @author Anis uddin Ahmad <anisniit@gmail.com> 17 * @author Anis uddin Ahmad <anisniit@gmail.com>
17 * @link http://www.ajaxray.com/projects/rss 18 * @link http://www.ajaxray.com/projects/rss
18 */ 19 */
19 class FeedWriter 20 class FeedWriter
20 { 21 {
21 private $self = null; // self URL - http://feed2.w3.org/docs/warning/MissingAtomSelfLink.html 22 private $self = null; // self URL - http://feed2.w3.org/docs/warning/MissingAtomSelfLink.html
22 private $hubs = array(); // PubSubHubbub hubs 23 private $hubs = array(); // PubSubHubbub hubs
23 private $channels = array(); // Collection of channel elements 24 private $channels = array(); // Collection of channel elements
24 private $items = array(); // Collection of items as object of FeedItem class. 25 private $items = array(); // Collection of items as object of FeedItem class.
25 private $data = array(); // Store some other version wise data 26 private $data = array(); // Store some other version wise data
26 private $CDATAEncoding = array(); // The tag names which have to encoded as CDATA 27 private $CDATAEncoding = array(); // The tag names which have to encoded as CDATA
27 private $xsl = null; // stylesheet to render RSS (used by Chrome) 28 private $xsl = null; // stylesheet to render RSS (used by Chrome)
28 private $json = null; // JSON object 29 private $json = null; // JSON object
29 30
30 private $version = null; 31 private $version = null;
31 32
32 /** 33 /**
33 * Constructor 34 * Constructor
34 * 35 *
35 * @param constant the version constant (RSS2 or JSON). 36 * @param constant the version constant (RSS2 or JSON).
36 */ 37 */
37 function __construct($version = RSS2) 38 function __construct($version = RSS2)
38 { 39 {
39 $this->version = $version; 40 $this->version = $version;
40 41
41 // Setting default value for assential channel elements 42 // Setting default value for assential channel elements
42 $this->channels['title'] = $version . ' Feed'; 43 $this->channels['title'] = $version . ' Feed';
43 $this->channels['link'] = 'http://www.ajaxray.com/blog'; 44 $this->channels['link'] = 'http://www.ajaxray.com/blog';
44 45
45 //Tag names to encode in CDATA 46 //Tag names to encode in CDATA
46 $this->CDATAEncoding = array('description', 'content:encoded', 'content', 'subtitle', 'summary'); 47 $this->CDATAEncoding = array('description', 'content:encoded', 'content', 'subtitle', 'summary');
47 } 48 }
48 49
49 public function setFormat($format) { 50 public function setFormat($format) {
50 $this->version = $format; 51 $this->version = $format;
51 } 52 }
52 53
53 // Start # public functions --------------------------------------------- 54 // Start # public functions ---------------------------------------------
54 55
55 /** 56 /**
56 * Set a channel element 57 * Set a channel element
57 * @access public 58 * @access public
58 * @param srting name of the channel tag 59 * @param srting name of the channel tag
59 * @param string content of the channel tag 60 * @param string content of the channel tag
60 * @return void 61 * @return void
61 */ 62 */
62 public function setChannelElement($elementName, $content) 63 public function setChannelElement($elementName, $content)
63 { 64 {
64 $this->channels[$elementName] = $content ; 65 $this->channels[$elementName] = $content ;
65 } 66 }
66 67
67 /** 68 /**
68 * Set multiple channel elements from an array. Array elements 69 * Set multiple channel elements from an array. Array elements
69 * should be 'channelName' => 'channelContent' format. 70 * should be 'channelName' => 'channelContent' format.
70 * 71 *
71 * @access public 72 * @access public
72 * @param array array of channels 73 * @param array array of channels
73 * @return void 74 * @return void
74 */ 75 */
75 public function setChannelElementsFromArray($elementArray) 76 public function setChannelElementsFromArray($elementArray)
76 { 77 {
77 if(! is_array($elementArray)) return; 78 if(! is_array($elementArray)) return;
78 foreach ($elementArray as $elementName => $content) 79 foreach ($elementArray as $elementName => $content)
79 { 80 {
80 $this->setChannelElement($elementName, $content); 81 $this->setChannelElement($elementName, $content);
81 } 82 }
82 } 83 }
83 84
84 /** 85 /**
85 * Genarate the actual RSS/JSON file 86 * Genarate the actual RSS/JSON file
86 * 87 *
87 * @access public 88 * @access public
88 * @return void 89 * @return void
89 */ 90 */
90 public function genarateFeed() 91 public function genarateFeed($withHeaders = true)
91 { 92 {
92 if ($this->version == RSS2) { 93 if ($withHeaders) {
93 header('Content-type: text/xml; charset=UTF-8'); 94 if ($this->version == RSS2) {
94 // this line prevents Chrome 20 from prompting download 95 header('Content-type: text/xml; charset=UTF-8');
95 // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss 96 // this line prevents Chrome 20 from prompting download
96 header('X-content-type-options: nosniff'); 97 // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss
97 } elseif ($this->version == JSON) { 98 header('X-content-type-options: nosniff');
98 header('Content-type: application/json; charset=UTF-8'); 99 } elseif ($this->version == JSON) {
99 $this->json = new stdClass(); 100 header('Content-type: application/json; charset=UTF-8');
100 } elseif ($this->version == JSONP) { 101 } elseif ($this->version == JSONP) {
101 header('Content-type: application/javascript; charset=UTF-8'); 102 header('Content-type: application/javascript; charset=UTF-8');
102 $this->json = new stdClass(); 103 }
103 } 104 }
104 $this->printHead(); 105
105 $this->printChannels(); 106 if ($this->version == JSON || $this->version == JSONP) {
106 $this->printItems(); 107 $this->json = new stdClass();
107 $this->printTale(); 108 }
108 if ($this->version == JSON || $this->version == JSONP) { 109
109 echo json_encode($this->json); 110
110 } 111 $this->printHead();
111 } 112 $this->printChannels();
112 113 $this->printItems();
113 /** 114 $this->printTale();
114 * Create a new FeedItem. 115 if ($this->version == JSON || $this->version == JSONP) {
115 * 116 echo json_encode($this->json);
116 * @access public 117 }
117 * @return object instance of FeedItem class 118 }
118 */ 119
119 public function createNewItem() 120 public function &getItems()
120 { 121 {
121 $Item = new FeedItem($this->version); 122 return $this->items;
122 return $Item; 123 }
123 } 124
124 125 /**
125 /** 126 * Create a new FeedItem.
126 * Add a FeedItem to the main class 127 *
127 * 128 * @access public
128 * @access public 129 * @return object instance of FeedItem class
129 * @param object instance of FeedItem class 130 */
130 * @return void 131 public function createNewItem()
131 */ 132 {
132 public function addItem($feedItem) 133 $Item = new FeedItem($this->version);
133 { 134 return $Item;
134 $this->items[] = $feedItem; 135 }
135 } 136
136 137 /**
137 // Wrapper functions ------------------------------------------------------------------- 138 * Add a FeedItem to the main class
138 139 *
139 /** 140 * @access public
140 * Set the 'title' channel element 141 * @param object instance of FeedItem class
141 * 142 * @return void
142 * @access public 143 */
143 * @param srting value of 'title' channel tag 144 public function addItem($feedItem)
144 * @return void 145 {
145 */ 146 $this->items[] = $feedItem;
146 public function setTitle($title) 147 }
147 { 148
148 $this->setChannelElement('title', $title); 149 // Wrapper functions -------------------------------------------------------------------
149 } 150
150 151 /**
151 /** 152 * Set the 'title' channel element
152 * Add a hub to the channel element 153 *
153 * 154 * @access public
154 * @access public 155 * @param srting value of 'title' channel tag
155 * @param string URL 156 * @return void
156 * @return void 157 */
157 */ 158 public function setTitle($title)
158 public function addHub($hub) 159 {
159 { 160 $this->setChannelElement('title', $title);
160 $this->hubs[] = $hub; 161 }
161 } 162
162 163 /**
163 /** 164 * Add a hub to the channel element
164 * Set XSL URL 165 *
165 * 166 * @access public
166 * @access public 167 * @param string URL
167 * @param string URL 168 * @return void
168 * @return void 169 */
169 */ 170 public function addHub($hub)
170 public function setXsl($xsl) 171 {
171 { 172 $this->hubs[] = $hub;
172 $this->xsl = $xsl; 173 }
173 } 174
174 175 /**
175 /** 176 * Set XSL URL
176 * Set self URL 177 *
177 * 178 * @access public
178 * @access public 179 * @param string URL
179 * @param string URL 180 * @return void
180 * @return void 181 */
181 */ 182 public function setXsl($xsl)
182 public function setSelf($self) 183 {
183 { 184 $this->xsl = $xsl;
184 $this->self = $self; 185 }
185 } 186
186 187 /**
187 /** 188 * Set self URL
188 * Set the 'description' channel element 189 *
189 * 190 * @access public
190 * @access public 191 * @param string URL
191 * @param srting value of 'description' channel tag 192 * @return void
192 * @return void 193 */
193 */ 194 public function setSelf($self)
194 public function setDescription($desciption) 195 {
195 { 196 $this->self = $self;
196 $tag = ($this->version == ATOM)? 'subtitle' : 'description'; 197 }
197 $this->setChannelElement($tag, $desciption); 198
198 } 199 /**
199 200 * Set the 'description' channel element
200 /** 201 *
201 * Set the 'link' channel element 202 * @access public
202 * 203 * @param srting value of 'description' channel tag
203 * @access public 204 * @return void
204 * @param srting value of 'link' channel tag 205 */
205 * @return void 206 public function setDescription($description)
206 */ 207 {
207 public function setLink($link) 208 $tag = ($this->version == ATOM)? 'subtitle' : 'description';
208 { 209 $this->setChannelElement($tag, $description);
209 $this->setChannelElement('link', $link); 210 }
210 } 211
211 212 /**
212 /** 213 * Set the 'link' channel element
213 * Set the 'image' channel element 214 *
214 * 215 * @access public
215 * @access public 216 * @param srting value of 'link' channel tag
216 * @param srting title of image 217 * @return void
217 * @param srting link url of the imahe 218 */
218 * @param srting path url of the image 219 public function setLink($link)
219 * @return void 220 {
220 */ 221 $this->setChannelElement('link', $link);
221 public function setImage($title, $link, $url) 222 }
222 { 223
223 $this->setChannelElement('image', array('title'=>$title, 'link'=>$link, 'url'=>$url)); 224 /**
224 } 225 * Set the 'image' channel element
225 226 *
226 // End # public functions ---------------------------------------------- 227 * @access public
227 228 * @param srting title of image
228 // Start # private functions ---------------------------------------------- 229 * @param srting link url of the imahe
229 230 * @param srting path url of the image
230 /** 231 * @return void
231 * Prints the xml and rss namespace 232 */
232 * 233 public function setImage($title, $link, $url)
233 * @access private 234 {
234 * @return void 235 $this->setChannelElement('image', array('title'=>$title, 'link'=>$link, 'url'=>$url));
235 */ 236 }
236 private function printHead() 237
237 { 238 // End # public functions ----------------------------------------------
238 if ($this->version == RSS2) 239
239 { 240 // Start # private functions ----------------------------------------------
240 $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; 241
241 if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; 242 /**
242 $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; 243 * Prints the xml and rss namespace
243 echo $out; 244 *
244 } 245 * @access private
245 elseif ($this->version == JSON || $this->version == JSONP) 246 * @return void
246 { 247 */
247 $this->json->rss = array('@attributes' => array('version' => '2.0')); 248 private function printHead()
248 } 249 {
249 } 250 if ($this->version == RSS2)
250 251 {
251 /** 252 $out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
252 * Closes the open tags at the end of file 253 if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
253 * 254 $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
254 * @access private 255 echo $out;
255 * @return void 256 }
256 */ 257 elseif ($this->version == JSON || $this->version == JSONP)
257 private function printTale() 258 {
258 { 259 $this->json->rss = array('@attributes' => array('version' => '2.0'));
259 if ($this->version == RSS2) 260 }
260 { 261 }
261 echo '</channel>',PHP_EOL,'</rss>'; 262
262 } 263 /**
263 // do nothing for JSON 264 * Closes the open tags at the end of file
264 } 265 *
265 266 * @access private
266 /** 267 * @return void
267 * Creates a single node as xml format 268 */
268 * 269 private function printTale()
269 * @access private 270 {
270 * @param string name of the tag 271 if ($this->version == RSS2)
271 * @param mixed tag value as string or array of nested tags in 'tagName' => 'tagValue' format 272 {
272 * @param array Attributes(if any) in 'attrName' => 'attrValue' format 273 echo '</channel>',PHP_EOL,'</rss>';
273 * @return string formatted xml tag 274 }
274 */ 275 // do nothing for JSON
275 private function makeNode($tagName, $tagContent, $attributes = null) 276 }
276 { 277
277 if ($this->version == RSS2) 278 /**
278 { 279 * Creates a single node as xml format
279 $nodeText = ''; 280 *
280 $attrText = ''; 281 * @access private
281 if (is_array($attributes)) 282 * @param string name of the tag
282 { 283 * @param mixed tag value as string or array of nested tags in 'tagName' => 'tagValue' format
283 foreach ($attributes as $key => $value) 284 * @param array Attributes(if any) in 'attrName' => 'attrValue' format
284 { 285 * @return string formatted xml tag
285 $attrText .= " $key=\"$value\" "; 286 */
286 } 287 private function makeNode($tagName, $tagContent, $attributes = null)
287 } 288 {
288 $nodeText .= "<{$tagName}{$attrText}>"; 289 if ($this->version == RSS2)
289 if (is_array($tagContent)) 290 {
290 { 291 $nodeText = '';
291 foreach ($tagContent as $key => $value) 292 $attrText = '';
292 { 293 if (is_array($attributes))
293 $nodeText .= $this->makeNode($key, $value); 294 {
294 } 295 foreach ($attributes as $key => $value)
295 } 296 {
296 else 297 $attrText .= " $key=\"$value\" ";
297 { 298 }
298 //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent); 299 }
299 $nodeText .= htmlspecialchars($tagContent); 300 $nodeText .= "<{$tagName}{$attrText}>";
300 } 301 if (is_array($tagContent))
301 //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>"; 302 {
302 $nodeText .= "</$tagName>"; 303 foreach ($tagContent as $key => $value)
303 return $nodeText . PHP_EOL; 304 {
304 } 305 $nodeText .= $this->makeNode($key, $value);
305 elseif ($this->version == JSON || $this->version == JSONP) 306 }
306 { 307 }
307 $tagName = (string)$tagName; 308 else
308 $tagName = strtr($tagName, ':', '_'); 309 {
309 $node = null; 310 //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent);
310 if (!$tagContent && is_array($attributes) && count($attributes)) 311 $nodeText .= htmlspecialchars($tagContent);
311 { 312 }
312 $node = array('@attributes' => $this->json_keys($attributes)); 313 //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>";
313 } else { 314 $nodeText .= "</$tagName>";
314 if (is_array($tagContent)) { 315 return $nodeText . PHP_EOL;
315 $node = $this->json_keys($tagContent); 316 }
316 } else { 317 elseif ($this->version == JSON || $this->version == JSONP)
317 $node = $tagContent; 318 {
318 } 319 $tagName = (string)$tagName;
319 } 320 $tagName = strtr($tagName, ':', '_');
320 return $node; 321 $node = null;
321 } 322 if (!$tagContent && is_array($attributes) && count($attributes))
322 return ''; // should not get here 323 {
323 } 324 $node = array('@attributes' => $this->json_keys($attributes));
324 325 } else {
325 private function json_keys(array $array) { 326 if (is_array($tagContent)) {
326 $new = array(); 327 $node = $this->json_keys($tagContent);
327 foreach ($array as $key => $val) { 328 } else {
328 if (is_string($key)) $key = strtr($key, ':', '_'); 329 $node = $tagContent;
329 if (is_array($val)) { 330 }
330 $new[$key] = $this->json_keys($val); 331 }
331 } else { 332 return $node;
332 $new[$key] = $val; 333 }
333 } 334 return ''; // should not get here
334 } 335 }
335 return $new; 336
336 } 337 private function json_keys(array $array) {
337 338 $new = array();
338 /** 339 foreach ($array as $key => $val) {
339 * @desc Print channels 340 if (is_string($key)) $key = strtr($key, ':', '_');
340 * @access private 341 if (is_array($val)) {
341 * @return void 342 $new[$key] = $this->json_keys($val);
342 */ 343 } else {
343 private function printChannels() 344 $new[$key] = $val;
344 { 345 }
345 //Start channel tag 346 }
346 if ($this->version == RSS2) { 347 return $new;
347 echo '<channel>' . PHP_EOL; 348 }
348 // add hubs 349
349 foreach ($this->hubs as $hub) { 350 /**
350 //echo $this->makeNode('link', '', array('rel'=>'hub', 'href'=>$hub, 'xmlns'=>'http://www.w3.org/2005/Atom')); 351 * @desc Print channels
351 echo '<link rel="hub" href="'.htmlspecialchars($hub).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL; 352 * @access private
352 } 353 * @return void
353 // add self 354 */
354 if (isset($this->self)) { 355 private function printChannels()
355 //echo $this->makeNode('link', '', array('rel'=>'self', 'href'=>$this->self, 'xmlns'=>'http://www.w3.org/2005/Atom')); 356 {
356 echo '<link rel="self" href="'.htmlspecialchars($this->self).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL; 357 //Start channel tag
357 } 358 if ($this->version == RSS2) {
358 //Print Items of channel 359 echo '<channel>' . PHP_EOL;
359 foreach ($this->channels as $key => $value) 360 // add hubs
360 { 361 foreach ($this->hubs as $hub) {
361 echo $this->makeNode($key, $value); 362 //echo $this->makeNode('link', '', array('rel'=>'hub', 'href'=>$hub, 'xmlns'=>'http://www.w3.org/2005/Atom'));
362 } 363 echo '<link rel="hub" href="'.htmlspecialchars($hub).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL;
363 } elseif ($this->version == JSON || $this->version == JSONP) { 364 }
364 $this->json->rss['channel'] = (object)$this->json_keys($this->channels); 365 // add self
365 } 366 if (isset($this->self)) {
366 } 367 //echo $this->makeNode('link', '', array('rel'=>'self', 'href'=>$this->self, 'xmlns'=>'http://www.w3.org/2005/Atom'));
367 368 echo '<link rel="self" href="'.htmlspecialchars($this->self).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL;
368 /** 369 }
369 * Prints formatted feed items 370 //Print Items of channel
370 * 371 foreach ($this->channels as $key => $value)
371 * @access private 372 {
372 * @return void 373 echo $this->makeNode($key, $value);
373 */ 374 }
374 private function printItems() 375 } elseif ($this->version == JSON || $this->version == JSONP) {
375 { 376 $this->json->rss['channel'] = (object)$this->json_keys($this->channels);
376 foreach ($this->items as $item) { 377 }
377 $itemElements = $item->getElements(); 378 }
378 379
379 echo $this->startItem(); 380 /**
380 381 * Prints formatted feed items
381 if ($this->version == JSON || $this->version == JSONP) { 382 *
382 $json_item = array(); 383 * @access private
383 } 384 * @return void
384 385 */
385 foreach ($itemElements as $thisElement) { 386 private function printItems()
386 foreach ($thisElement as $instance) { 387 {
387 if ($this->version == RSS2) { 388 foreach ($this->items as $item) {
388 echo $this->makeNode($instance['name'], $instance['content'], $instance['attributes']); 389 $itemElements = $item->getElements();
389 } elseif ($this->version == JSON || $this->version == JSONP) { 390
390 $_json_node = $this->makeNode($instance['name'], $instance['content'], $instance['attributes']); 391 echo $this->startItem();
391 if (count($thisElement) > 1) { 392
392 $json_item[strtr($instance['name'], ':', '_')][] = $_json_node; 393 if ($this->version == JSON || $this->version == JSONP) {
393 } else { 394 $json_item = array();
394 $json_item[strtr($instance['name'], ':', '_')] = $_json_node; 395 }
395 } 396
396 } 397 foreach ($itemElements as $thisElement) {
397 } 398 foreach ($thisElement as $instance) {
398 } 399 if ($this->version == RSS2) {
399 echo $this->endItem(); 400 echo $this->makeNode($instance['name'], $instance['content'], $instance['attributes']);
400 if ($this->version == JSON || $this->version == JSONP) { 401 } elseif ($this->version == JSON || $this->version == JSONP) {
401 if (count($this->items) > 1) { 402 $_json_node = $this->makeNode($instance['name'], $instance['content'], $instance['attributes']);
402 $this->json->rss['channel']->item[] = $json_item; 403 if (count($thisElement) > 1) {
403 } else { 404 $json_item[strtr($instance['name'], ':', '_')][] = $_json_node;
404 $this->json->rss['channel']->item = $json_item; 405 } else {
405 } 406 $json_item[strtr($instance['name'], ':', '_')] = $_json_node;
406 } 407 }
407 } 408 }
408 } 409 }
409 410 }
410 /** 411 echo $this->endItem();
411 * Make the starting tag of channels 412 if ($this->version == JSON || $this->version == JSONP) {
412 * 413 if (count($this->items) > 1) {
413 * @access private 414 $this->json->rss['channel']->item[] = $json_item;
414 * @return void 415 } else {
415 */ 416 $this->json->rss['channel']->item = $json_item;
416 private function startItem() 417 }
417 { 418 }
418 if ($this->version == RSS2) 419 }
419 { 420 }
420 echo '<item>' . PHP_EOL; 421
421 } 422 /**
422 // nothing for JSON 423 * Make the starting tag of channels
423 } 424 *
424 425 * @access private
425 /** 426 * @return void
426 * Closes feed item tag 427 */
427 * 428 private function startItem()
428 * @access private 429 {
429 * @return void 430 if ($this->version == RSS2)
430 */ 431 {
431 private function endItem() 432 echo '<item>' . PHP_EOL;
432 { 433 }
433 if ($this->version == RSS2) 434 // nothing for JSON
434 { 435 }
435 echo '</item>' . PHP_EOL; 436
436 } 437 /**
437 // nothing for JSON 438 * Closes feed item tag
438 } 439 *
439 440 * @access private
440 // End # private functions ---------------------------------------------- 441 * @return void
442 */
443 private function endItem()
444 {
445 if ($this->version == RSS2)
446 {
447 echo '</item>' . PHP_EOL;
448 }
449 // nothing for JSON
450 }
451
452 // End # private functions ----------------------------------------------
441 } \ No newline at end of file 453 } \ No newline at end of file
diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php
index 2f5244f9..c4a48b21 100644
--- a/inc/3rdparty/libraries/html5/TreeBuilder.php
+++ b/inc/3rdparty/libraries/html5/TreeBuilder.php
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
134 134
135 // Namespaces for foreign content 135 // Namespaces for foreign content
136 const NS_HTML = null; // to prevent DOM from requiring NS on everything 136 const NS_HTML = null; // to prevent DOM from requiring NS on everything
137 const NS_XHTML = 'http://www.w3.org/1999/xhtml';
137 const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; 138 const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
138 const NS_SVG = 'http://www.w3.org/2000/svg'; 139 const NS_SVG = 'http://www.w3.org/2000/svg';
139 const NS_XLINK = 'http://www.w3.org/1999/xlink'; 140 const NS_XLINK = 'http://www.w3.org/1999/xlink';
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
3157 } 3158 }
3158 3159
3159 private function insertElement($token, $append = true) { 3160 private function insertElement($token, $append = true) {
3160 $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); 3161 //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
3162 $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
3163 $el = $this->dom->createElementNS($namespaceURI, $token['name']);
3161 3164
3162 if (!empty($token['attr'])) { 3165 if (!empty($token['attr'])) {
3163 foreach($token['attr'] as $attr) { 3166 foreach($token['attr'] as $attr) {
3164 if(!$el->hasAttribute($attr['name'])) { 3167
3168 // mike@macgirvin.com 2011-11-17, check attribute name for
3169 // validity (ignoring extenders and combiners) as illegal chars in names
3170 // causes everything to abort
3171
3172 $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']);
3173 if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
3165 $el->setAttribute($attr['name'], $attr['value']); 3174 $el->setAttribute($attr['name'], $attr['value']);
3166 } 3175 }
3167 } 3176 }
diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
index 83e94f14..e4d5f495 100644
--- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
+++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
@@ -1,404 +1,403 @@
1<?php 1<?php
2/** 2/**
3 * Cookie Jar 3 * Cookie Jar
4 * 4 *
5 * PHP class for handling cookies, as defined by the Netscape spec: 5 * PHP class for handling cookies, as defined by the Netscape spec:
6 * <http://curl.haxx.se/rfc/cookie_spec.html> 6 * <http://curl.haxx.se/rfc/cookie_spec.html>
7 * 7 *
8 * This class should be used to handle cookies (storing cookies from HTTP response messages, and 8 * This class should be used to handle cookies (storing cookies from HTTP response messages, and
9 * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org 9 * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org
10 * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ 10 * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
11 * 11 *
12 * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ 12 * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
13 * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. 13 * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
14 * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. 14 * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
15 * 15 *
16 * @version 0.5 16 * @version 0.5
17 * @date 2011-03-15 17 * @date 2011-03-15
18 * @see http://php.net/HttpRequestPool 18 * @see http://php.net/HttpRequestPool
19 * @author Keyvan Minoukadeh 19 * @author Keyvan Minoukadeh
20 * @copyright 2011 Keyvan Minoukadeh 20 * @copyright 2011 Keyvan Minoukadeh
21 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 21 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
22 */ 22 */
23 23
class CookieJar
{
    /**
     * Cookies - array containing all cookies.
     *
     * <pre>
     * Cookies are stored like this:
     *   [domain][path][name] = array
     * where array is:
     *   0 => value, 1 => secure, 2 => expires
     * </pre>
     * 'expires' is a Unix timestamp, or null for a session cookie.
     *
     * @var array
     */
    public $cookies = array();

    /**
     * When true, debug() echoes progress messages.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * Constructor
     */
    public function __construct() {
    }

    /**
     * Echo a debug message together with current/peak memory usage.
     * Does nothing unless $this->debug is true.
     *
     * @param string $msg
     * @param string $file optional file name (shown only together with $line)
     * @param int $line optional line number (shown only together with $file)
     * @return void
     */
    protected function debug($msg, $file=null, $line=null) {
        if (!$this->debug) return;
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo '* ',$msg;
        if (isset($file, $line)) echo " ($file line $line)";
        echo ' - mem used: ',$mem," (peak: $memPeak)\n";
        // ob_flush() raises a notice when no output buffer is active
        if (ob_get_level() > 0) ob_flush();
        flush();
    }

    /**
     * Get matching cookies
     *
     * Builds the value of a "Cookie" request header for the given URL from the
     * cookies held in this jar.
     *
     * @param string $url absolute URL of the request; must contain a scheme and a host.
     *                    A missing or empty path is treated as '/'.
     * @return string|false matching cookies as "name=value; name2=value2" (empty string
     *                      if nothing matches), or false if the URL cannot be parsed
     */
    public function getMatchingCookies($url)
    {
        $parts = @parse_url($url);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            return false;
        }
        // RFC 2965 notes:
        //  If multiple cookies satisfy the criteria above, they are ordered in
        //  the Cookie header such that those with more specific Path attributes
        //  precede those with less specific. Ordering with respect to other
        //  attributes (e.g., Domain) is unspecified.
        $domain = $parts['host'];
        if (strpos($domain, '.') === false) $domain .= '.local';
        // parse_url() omits 'path' for URLs such as http://example.com
        $request_path = isset($parts['path']) ? $parts['path'] : '';
        if ($request_path == '') $request_path = '/';
        $request_secure = (strtolower($parts['scheme']) == 'https');
        unset($parts);
        $now = time();
        $matched_cookies = array();
        // domain - find matching domains
        $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
        while (strpos($domain, '.') !== false) {
            if (!isset($this->cookies[$domain])) {
                $domain = $this->_reduce_domain($domain);
                continue;
            }
            $this->debug(' domain match found: '.$domain);
            $domain_cookies = $this->cookies[$domain];
            // paths - find matching paths starting from most specific
            $this->debug(' - Finding matching paths for '.$request_path);
            $paths = array_keys($domain_cookies);
            usort($paths, array($this, '_cmp_length'));
            foreach ($paths as $path) {
                // continue to next cookie if request path does not path-match cookie path
                if (!$this->_path_match($request_path, $path)) continue;
                // loop through cookie names
                $this->debug(' path match found: '.$path);
                foreach ($domain_cookies[$path] as $name => $values) {
                    // if this cookie is secure but request isn't, continue to next cookie
                    if ($values[1] && !$request_secure) continue;
                    // if cookie is not a session cookie and has expired, continue to next cookie
                    if (is_int($values[2]) && ($values[2] < $now)) continue;
                    // cookie matches request
                    $this->debug(' cookie match: '.$name.'='.$values[0]);
                    $matched_cookies[] = $name.'='.$values[0];
                }
            }
            $domain = $this->_reduce_domain($domain);
        }
        // return cookies
        return implode('; ', $matched_cookies);
    }

    /**
     * Parse Set-Cookie values and store the accepted cookies in the jar.
     *
     * A cookie is rejected if its Domain attribute does not domain-match the
     * request host, or if its Path attribute is not a prefix of the request path.
     *
     * @param string $url absolute URL of the request which produced the response
     * @param array $set_cookies array holding 1 or more "Set-Cookie" header values
     *                           (see extractCookies())
     * @return void
     */
    public function storeCookies($url, $set_cookies)
    {
        if (empty($set_cookies)) return;
        // tolerate a single header value passed as a string
        $set_cookies = (array) $set_cookies;
        $url_parts = @parse_url($url);
        if (!is_array($url_parts) || !isset($url_parts['host'])) return;
        $request_host = $url_parts['host'];
        if (strpos($request_host, '.') === false) $request_host .= '.local';
        $request_path = isset($url_parts['path']) ? $url_parts['path'] : '';
        if ($request_path == '') $request_path = '/';
        //
        // loop through set-cookie headers
        //
        foreach ($set_cookies as $set_cookie) {
            $this->debug('Parsing: '.$set_cookie);
            // temporary cookie store (before adding to jar)
            $tmp_cookie = array();
            $params = explode(';', $set_cookie);
            $param_count = count($params);
            // loop through params
            for ($x = 0; $x < $param_count; $x++) {
                $key_val = explode('=', $params[$x], 2);
                if (count($key_val) != 2) {
                    // if the first param isn't a name=value pair, continue to the next set-cookie
                    // header
                    if ($x == 0) continue 2;
                    // check for secure flag
                    if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
                    // continue to next param
                    continue;
                }
                list($key, $val) = array_map('trim', $key_val);
                // first name=value pair is the cookie name and value
                // the name and value are stored under 'name' and 'value' to avoid conflicts
                // with later parameters.
                if ($x == 0) {
                    $tmp_cookie = array('name'=>$key, 'value'=>$val);
                    continue;
                }
                $key = strtolower($key);
                if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
                    $tmp_cookie[$key] = $val;
                }
            }
            //
            // set cookie
            //
            // check domain
            if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
                    ($tmp_cookie['domain'] != ".$request_host")) {
                $domain = $tmp_cookie['domain'];
                if ((strpos($domain, '.') === false) && ($domain != 'local')) {
                    $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
                    continue;
                }
                if (preg_match('/\.[0-9]+$/', $domain)) {
                    $this->debug(' - domain "'.$domain.'" appears to be an ip address');
                    continue;
                }
                if (substr($domain, 0, 1) != '.') $domain = ".$domain";
                if (!$this->_domain_match($request_host, $domain)) {
                    $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
                    continue;
                }
            } else {
                // if domain is not specified in the set-cookie header, domain will default to
                // the request host
                $domain = $request_host;
            }
            // check path
            if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
                $path = urldecode($tmp_cookie['path']);
                if (!$this->_path_match($request_path, $path)) {
                    $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
                    continue;
                }
            } else {
                // default path: the request path up to (not including) its last slash
                $path = substr($request_path, 0, strrpos($request_path, '/'));
                if ($path == '') $path = '/';
            }
            // check if secure
            $secure = isset($tmp_cookie['secure']);
            // check expiry
            $expires = null;
            if (isset($tmp_cookie['expires'])) {
                $expires = strtotime($tmp_cookie['expires']);
                // strtotime() returns false for a date it cannot parse. Without this check
                // false would reach set_cookie(), compare as <= 0 and discard the cookie;
                // an unparseable date is treated as "no expiry" (session cookie) instead.
                if ($expires === false || $expires < 0) {
                    $expires = null;
                }
            }
            // set cookie
            $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
        }
    }

    /**
     * Return array of set-cookie values extracted from HTTP response headers.
     *
     * Reading stops at the first empty line after the first line (end of headers).
     * Folded continuation lines (starting with whitespace) that follow a Set-Cookie
     * header are appended to that header's value, separated by a single space.
     *
     * @param string $h raw HTTP response headers
     * @return array list of Set-Cookie header values, in order of appearance
     */
    public function extractCookies($h) {
        $headers = array();
        $count = 0;
        $last_match = false;
        $first_line = true;
        foreach (explode("\n", $h) as $line) {
            $line = rtrim($line);
            if (!$first_line) {
                // an empty line marks the end of the headers. Some servers do not use
                // CRLF (\r\n), which is why the trimmed line is checked rather than
                // looking for \r\n\r\n.
                if (trim($line) == '') {
                    break;
                }
                // check for continuation line...
                // RFC 2616 Section 2.2 "Basic Rules":
                //  HTTP/1.1 header field values can be folded onto multiple lines if the
                //  continuation line begins with a space or horizontal tab. All linear
                //  white space, including folding, has the same semantics as SP. A
                //  recipient MAY replace any linear white space with a single SP before
                //  interpreting the field value or forwarding the message downstream.
                if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) {
                    // append to previous header value
                    $headers[$count-1] .= ' '.rtrim($match[1]);
                    continue;
                }
            }
            $first_line = false;

            // split header name and value
            if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {
                $headers[$count++] = rtrim($match[1]);
                $last_match = true;
            } else {
                $last_match = false;
            }
        }
        return $headers;
    }

    /**
     * Set Cookie
     *
     * Nothing is stored if $domain, $path, $name or $value is empty.
     *
     * @param string $domain
     * @param string $path
     * @param string $name cookie name
     * @param string $value cookie value
     * @param bool $secure
     * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
     * @return void
     */
    public function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
    {
        if ($domain == '') return;
        if ($path == '') return;
        if ($name == '') return;
        // check if cookie needs to go
        if (isset($expires) && ($expires <= 0)) {
            unset($this->cookies[$domain][$path][$name]);
            return;
        }
        if ($value == '') return;
        $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
    }

    /**
     * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.
     * @param string $domain
     * @param string $path
     * @param string $name
     * @return void
     */
    public function clear($domain=null, $path=null, $name=null)
    {
        if (!isset($domain)) {
            $this->cookies = array();
        } elseif (!isset($path)) {
            unset($this->cookies[$domain]);
        } elseif (!isset($name)) {
            unset($this->cookies[$domain][$path]);
        } else {
            unset($this->cookies[$domain][$path][$name]);
        }
    }

    /**
     * Compare string length - used for sorting (longest string first)
     * @access private
     * @return int
     */
    public function _cmp_length($a, $b)
    {
        $la = strlen($a);
        $lb = strlen($b);
        if ($la == $lb) return 0;
        return ($la > $lb) ? -1 : 1;
    }

    /**
     * Reduce domain
     *
     * Removes a leading dot if there is one, otherwise removes the first label and
     * keeps the dot that followed it: "www.example.com" becomes ".example.com",
     * which becomes "example.com". A domain without any dot is returned unchanged.
     *
     * @param string $domain
     * @return string
     * @access private
     */
    public function _reduce_domain($domain)
    {
        if ($domain == '') return '';
        if (substr($domain, 0, 1) == '.') return substr($domain, 1);
        $dot = strpos($domain, '.');
        if ($dot === false) return $domain;
        return substr($domain, $dot);
    }

    /**
     * Path match - check if path1 path-matches path2
     *
     * From RFC 2965:
     * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
     * if P2 is a prefix of P1 (including the case where P1 and P2 string-
     * compare equal). Thus, the string /tec/waldo path-matches /tec.</i>
     *
     * This is a plain string-prefix test, so /tecwaldo also path-matches /tec.
     *
     * @param string $path1
     * @param string $path2
     * @return bool
     * @access private
     */
    public function _path_match($path1, $path2)
    {
        return (substr($path1, 0, strlen($path2)) == $path2);
    }

    /**
     * Domain match - check if domain1 domain-matches domain2
     *
     * A few extracts from RFC 2965:
     *  - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
     *    would be rejected, because H is y.x and contains a dot.
     *
     *  - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
     *    would be accepted.
     *
     *  - A Set-Cookie2 with Domain=.com or Domain=.com., will always be
     *    rejected, because there is no embedded dot.
     *
     *  - A Set-Cookie2 from request-host example for Domain=.local will
     *    be accepted, because the effective host name for the request-
     *    host is example.local, and example.local domain-matches .local.
     *
     * I'm ignoring the first point for now (must check to see how other browsers handle
     * this rule for Set-Cookie headers)
     *
     * @param string $domain1
     * @param string $domain2
     * @return bool
     * @access private
     */
    public function _domain_match($domain1, $domain2)
    {
        $domain1 = strtolower($domain1);
        $domain2 = strtolower($domain2);
        while (strpos($domain1, '.') !== false) {
            if ($domain1 == $domain2) return true;
            $domain1 = $this->_reduce_domain($domain1);
        }
        return false;
    }
}
404?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
index e4f1b3b3..963f0c05 100644
--- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -1,779 +1,810 @@
1<?php 1<?php
2/** 2/**
3 * Humble HTTP Agent 3 * Humble HTTP Agent
4 * 4 *
5 * This class is designed to take advantage of parallel HTTP requests 5 * This class is designed to take advantage of parallel HTTP requests
6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions. 6 * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
7 * For environments which do not have these options, it reverts to standard sequential 7 * For environments which do not have these options, it reverts to standard sequential
8 * requests (using file_get_contents()) 8 * requests (using file_get_contents())
9 * 9 *
10 * @version 1.1 10 * @version 1.4
11 * @date 2012-08-20 11 * @date 2013-05-10
12 * @see http://php.net/HttpRequestPool 12 * @see http://php.net/HttpRequestPool
13 * @author Keyvan Minoukadeh 13 * @author Keyvan Minoukadeh
14 * @copyright 2011-2012 Keyvan Minoukadeh 14 * @copyright 2011-2013 Keyvan Minoukadeh
15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 15 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
16 */ 16 */
17 17
18class HumbleHttpAgent 18class HumbleHttpAgent
19{ 19{
20 const METHOD_REQUEST_POOL = 1; 20 const METHOD_REQUEST_POOL = 1;
21 const METHOD_CURL_MULTI = 2; 21 const METHOD_CURL_MULTI = 2;
22 const METHOD_FILE_GET_CONTENTS = 4; 22 const METHOD_FILE_GET_CONTENTS = 4;
23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; 23 //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; 24 const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
25 const UA_PHP = 'PHP/5.2'; 25 const UA_PHP = 'PHP/5.4';
26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; 26 const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
27 27
28 protected $requests = array(); 28 protected $requests = array();
29 protected $redirectQueue = array(); 29 protected $redirectQueue = array();
30 protected $requestOptions; 30 protected $requestOptions;
31 protected $maxParallelRequests = 5; 31 protected $maxParallelRequests = 5;
32 protected $cache = null; //TODO 32 protected $cache = null; //TODO
33 protected $httpContext; 33 protected $httpContext;
34 protected $minimiseMemoryUse = false; //TODO 34 protected $minimiseMemoryUse = false; //TODO
35 protected $method; 35 protected $method;
36 protected $cookieJar; 36 protected $cookieJar;
37 public $debug = false; 37 public $debug = false;
38 public $debugVerbose = false; 38 public $debugVerbose = false;
39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html 39 public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
40 public $maxRedirects = 5; 40 public $maxRedirects = 5;
41 public $userAgentMap = array(); 41 public $userAgentMap = array();
42 public $rewriteUrls = array(); 42 public $rewriteUrls = array();
43 public $userAgentDefault; 43 public $userAgentDefault;
44 public $referer; 44 public $referer;
45 //public $userAgent = 'Mozilla/5.0'; 45 //public $userAgent = 'Mozilla/5.0';
46 46
47 // Prevent certain file/mime types 47 // Prevent certain file/mime types
48 // HTTP responses which match these content types will 48 // HTTP responses which match these content types will
49 // be returned without body. 49 // be returned without body.
50 public $headerOnlyTypes = array(); 50 public $headerOnlyTypes = array();
51 // URLs ending with one of these extensions will 51 // URLs ending with one of these extensions will
52 // prompt Humble HTTP Agent to send a HEAD request first 52 // prompt Humble HTTP Agent to send a HEAD request first
53 // to see if returned content type matches $headerOnlyTypes. 53 // to see if returned content type matches $headerOnlyTypes.
54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); 54 public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
55 // AJAX triggers to search for. 55 // AJAX triggers to search for.
56 // for AJAX sites, e.g. Blogger with its dynamic views templates. 56 // for AJAX sites, e.g. Blogger with its dynamic views templates.
57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); 57 public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
58 58
59 //TODO: set max file size 59 //TODO: set max file size
60 //TODO: normalise headers 60 //TODO: normalise headers
61 61
62 function __construct($requestOptions=null, $method=null) { 62 function __construct($requestOptions=null, $method=null) {
63 $this->userAgentDefault = self::UA_BROWSER; 63 $this->userAgentDefault = self::UA_BROWSER;
64 $this->referer = self::REF_GOOGLE; 64 $this->referer = self::REF_GOOGLE;
65 // set the request method 65 // set the request method
66 if (in_array($method, array(1,2,4))) { 66 if (in_array($method, array(1,2,4))) {
67 $this->method = $method; 67 $this->method = $method;
68 } else { 68 } else {
69 if (class_exists('HttpRequestPool')) { 69 if (class_exists('HttpRequestPool')) {
70 $this->method = self::METHOD_REQUEST_POOL; 70 $this->method = self::METHOD_REQUEST_POOL;
71 } elseif (function_exists('curl_multi_init')) { 71 } elseif (function_exists('curl_multi_init')) {
72 $this->method = self::METHOD_CURL_MULTI; 72 $this->method = self::METHOD_CURL_MULTI;
73 } else { 73 } else {
74 $this->method = self::METHOD_FILE_GET_CONTENTS; 74 $this->method = self::METHOD_FILE_GET_CONTENTS;
75 } 75 }
76 } 76 }
77 if ($this->method == self::METHOD_CURL_MULTI) { 77 if ($this->method == self::METHOD_CURL_MULTI) {
78 require_once(dirname(__FILE__).'/RollingCurl.php'); 78 require_once(dirname(__FILE__).'/RollingCurl.php');
79 } 79 }
80 // create cookie jar 80 // create cookie jar
81 $this->cookieJar = new CookieJar(); 81 $this->cookieJar = new CookieJar();
82 // set request options (redirect must be 0) 82 // set request options (redirect must be 0)
83 $this->requestOptions = array( 83 $this->requestOptions = array(
84 'timeout' => 15, 84 'timeout' => 15,
85 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web 85 'connecttimeout' => 15,
86 // TODO: test onprogress? 86 'dns_cache_timeout' => 300,
87 ); 87 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
88 if (is_array($requestOptions)) { 88 // TODO: test onprogress?
89 $this->requestOptions = array_merge($this->requestOptions, $requestOptions); 89 );
90 } 90 if (is_array($requestOptions)) {
91 $this->httpContext = array( 91 $this->requestOptions = array_merge($this->requestOptions, $requestOptions);
92 'http' => array( 92 }
93 'ignore_errors' => true, 93 $this->httpContext = array(
94 'timeout' => $this->requestOptions['timeout'], 94 'http' => array(
95 'max_redirects' => $this->requestOptions['redirect'], 95 'ignore_errors' => true,
96 'header' => "Accept: */*\r\n" 96 'timeout' => $this->requestOptions['timeout'],
97 ) 97 'max_redirects' => $this->requestOptions['redirect'],
98 ); 98 'header' => "Accept: */*\r\n"
99 } 99 )
100 100 );
101 protected function debug($msg) { 101 }
102 if ($this->debug) { 102
103 $mem = round(memory_get_usage()/1024, 2); 103 protected function debug($msg) {
104 $memPeak = round(memory_get_peak_usage()/1024, 2); 104 if ($this->debug) {
105 echo '* ',$msg; 105 $mem = round(memory_get_usage()/1024, 2);
106 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; 106 $memPeak = round(memory_get_peak_usage()/1024, 2);
107 echo "\n"; 107 echo '* ',$msg;
108 ob_flush(); 108 if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
109 flush(); 109 echo "\n";
110 } 110 ob_flush();
111 } 111 flush();
112 112 }
113 protected function getUserAgent($url, $asArray=false) { 113 }
114 $host = @parse_url($url, PHP_URL_HOST); 114
115 if (strtolower(substr($host, 0, 4)) == 'www.') { 115 protected function getUserAgent($url, $asArray=false) {
116 $host = substr($host, 4); 116 $host = @parse_url($url, PHP_URL_HOST);
117 } 117 if (strtolower(substr($host, 0, 4)) == 'www.') {
118 if ($host) { 118 $host = substr($host, 4);
119 $try = array($host); 119 }
120 $split = explode('.', $host); 120 if ($host) {
121 if (count($split) > 1) { 121 $try = array($host);
122 array_shift($split); 122 $split = explode('.', $host);
123 $try[] = '.'.implode('.', $split); 123 if (count($split) > 1) {
124 } 124 array_shift($split);
125 foreach ($try as $h) { 125 $try[] = '.'.implode('.', $split);
126 if (isset($this->userAgentMap[$h])) { 126 }
127 $ua = $this->userAgentMap[$h]; 127 foreach ($try as $h) {
128 break; 128 if (isset($this->userAgentMap[$h])) {
129 } 129 $ua = $this->userAgentMap[$h];
130 } 130 break;
131 } 131 }
132 if (!isset($ua)) $ua = $this->userAgentDefault; 132 }
133 if ($asArray) { 133 }
134 return array('User-Agent' => $ua); 134 if (!isset($ua)) $ua = $this->userAgentDefault;
135 } else { 135 if ($asArray) {
136 return 'User-Agent: '.$ua; 136 return array('User-Agent' => $ua);
137 } 137 } else {
138 } 138 return 'User-Agent: '.$ua;
139 139 }
140 public function rewriteHashbangFragment($url) { 140 }
141 // return $url if there's no '#!' 141
142 if (strpos($url, '#!') === false) return $url; 142 public function rewriteHashbangFragment($url) {
143 // split $url and rewrite 143 // return $url if there's no '#!'
144 // TODO: is SimplePie_IRI included? 144 if (strpos($url, '#!') === false) return $url;
145 $iri = new SimplePie_IRI($url); 145 // split $url and rewrite
146 $fragment = substr($iri->fragment, 1); // strip '!' 146 // TODO: is SimplePie_IRI included?
147 $iri->fragment = null; 147 $iri = new SimplePie_IRI($url);
148 if (isset($iri->query)) { 148 $fragment = substr($iri->fragment, 1); // strip '!'
149 parse_str($iri->query, $query); 149 $iri->fragment = null;
150 } else { 150 if (isset($iri->query)) {
151 $query = array(); 151 parse_str($iri->query, $query);
152 } 152 } else {
153 $query['_escaped_fragment_'] = (string)$fragment; 153 $query = array();
154 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites 154 }
155 return $iri->get_iri(); 155 $query['_escaped_fragment_'] = (string)$fragment;
156 } 156 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
157 157 return $iri->get_iri();
158 public function getUglyURL($url, $html) { 158 }
159 if ($html == '') return false; 159
160 $found = false; 160 public function getRedirectURLfromHTML($url, $html) {
161 foreach ($this->ajaxTriggers as $string) { 161 $redirect_url = $this->getMetaRefreshURL($url, $html);
162 if (stripos($html, $string)) { 162 if (!$redirect_url) {
163 $found = true; 163 $redirect_url = $this->getUglyURL($url, $html);
164 break; 164 }
165 } 165 return $redirect_url;
166 } 166 }
167 if (!$found) return false; 167
168 $iri = new SimplePie_IRI($url); 168 public function getMetaRefreshURL($url, $html) {
169 if (isset($iri->query)) { 169 if ($html == '') return false;
170 parse_str($iri->query, $query); 170 // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
171 } else { 171 if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) {
172 $query = array(); 172 return false;
173 } 173 }
174 $query['_escaped_fragment_'] = ''; 174 $redirect_url = $match[1];
175 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites 175 if (preg_match('!^https?://!i', $redirect_url)) {
176 return $iri->get_iri(); 176 // already absolute
177 } 177 $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
178 178 return $redirect_url;
179 public function removeFragment($url) { 179 }
180 $pos = strpos($url, '#'); 180 // absolutize redirect URL
181 if ($pos === false) { 181 $base = new SimplePie_IRI($url);
182 return $url; 182 // remove '//' in URL path (causes URLs not to resolve properly)
183 } else { 183 if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
184 return substr($url, 0, $pos); 184 if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
185 } 185 $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
186 } 186 return $absolute;
187 187 }
188 public function rewriteUrls($url) { 188 return false;
189 foreach ($this->rewriteUrls as $find => $action) { 189 }
190 if (strpos($url, $find) !== false) { 190
191 if (is_array($action)) { 191 public function getUglyURL($url, $html) {
192 return strtr($url, $action); 192 if ($html == '') return false;
193 } 193 $found = false;
194 } 194 foreach ($this->ajaxTriggers as $string) {
195 } 195 if (stripos($html, $string)) {
196 return $url; 196 $found = true;
197 } 197 break;
198 198 }
199 public function enableDebug($bool=true) { 199 }
200 $this->debug = (bool)$bool; 200 if (!$found) return false;
201 } 201 $iri = new SimplePie_IRI($url);
202 202 if (isset($iri->query)) {
203 public function minimiseMemoryUse($bool = true) { 203 parse_str($iri->query, $query);
204 $this->minimiseMemoryUse = $bool; 204 } else {
205 } 205 $query = array();
206 206 }
207 public function setMaxParallelRequests($max) { 207 $query['_escaped_fragment_'] = '';
208 $this->maxParallelRequests = $max; 208 $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
209 } 209 $ugly_url = $iri->get_iri();
210 210 $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
211 public function validateUrl($url) { 211 return $ugly_url;
212 $url = filter_var($url, FILTER_SANITIZE_URL); 212 }
213 $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); 213
214 // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) 214 public function removeFragment($url) {
215 if ($test === false) { 215 $pos = strpos($url, '#');
216 $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); 216 if ($pos === false) {
217 } 217 return $url;
218 if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { 218 } else {
219 return $url; 219 return substr($url, 0, $pos);
220 } else { 220 }
221 return false; 221 }
222 } 222
223 } 223 public function rewriteUrls($url) {
224 224 foreach ($this->rewriteUrls as $find => $action) {
225 public function fetchAll(array $urls) { 225 if (strpos($url, $find) !== false) {
226 $this->fetchAllOnce($urls, $isRedirect=false); 226 if (is_array($action)) {
227 $redirects = 0; 227 return strtr($url, $action);
228 while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { 228 }
229 $this->debug("Following redirects #$redirects..."); 229 }
230 $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); 230 }
231 } 231 return $url;
232 } 232 }
233 233
234 // fetch all URLs without following redirects 234 public function enableDebug($bool=true) {
235 public function fetchAllOnce(array $urls, $isRedirect=false) { 235 $this->debug = (bool)$bool;
236 if (!$isRedirect) $urls = array_unique($urls); 236 }
237 if (empty($urls)) return; 237
238 238 public function minimiseMemoryUse($bool = true) {
239 ////////////////////////////////////////////////////// 239 $this->minimiseMemoryUse = $bool;
240 // parallel (HttpRequestPool) 240 }
241 if ($this->method == self::METHOD_REQUEST_POOL) { 241
242 $this->debug('Starting parallel fetch (HttpRequestPool)'); 242 public function setMaxParallelRequests($max) {
243 try { 243 $this->maxParallelRequests = $max;
244 while (count($urls) > 0) { 244 }
245 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); 245
246 $subset = array_splice($urls, 0, $this->maxParallelRequests); 246 public function validateUrl($url) {
247 $pool = new HttpRequestPool(); 247 $url = filter_var($url, FILTER_SANITIZE_URL);
248 foreach ($subset as $orig => $url) { 248 $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
249 if (!$isRedirect) $orig = $url; 249 // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
250 unset($this->redirectQueue[$orig]); 250 if ($test === false) {
251 $this->debug("...$url"); 251 $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
252 if (!$isRedirect && isset($this->requests[$url])) { 252 }
253 $this->debug("......in memory"); 253 if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
254 /* 254 return $url;
255 } elseif ($this->isCached($url)) { 255 } else {
256 $this->debug("......is cached"); 256 return false;
257 if (!$this->minimiseMemoryUse) { 257 }
258 $this->requests[$url] = $this->getCached($url); 258 }
259 } 259
260 */ 260 public function fetchAll(array $urls) {
261 } else { 261 $this->fetchAllOnce($urls, $isRedirect=false);
262 $this->debug("......adding to pool"); 262 $redirects = 0;
263 $req_url = $this->rewriteUrls($url); 263 while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
264 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 264 $this->debug("Following redirects #$redirects...");
265 $req_url = $this->removeFragment($req_url); 265 $this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
266 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { 266 }
267 $_meth = HttpRequest::METH_HEAD; 267 }
268 } else { 268
269 $_meth = HttpRequest::METH_GET; 269 // fetch all URLs without following redirects
270 unset($this->requests[$orig]['wrongGuess']); 270 public function fetchAllOnce(array $urls, $isRedirect=false) {
271 } 271 if (!$isRedirect) $urls = array_unique($urls);
272 $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); 272 if (empty($urls)) return;
273 // send cookies, if we have any 273
274 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 274 //////////////////////////////////////////////////////
275 $this->debug("......sending cookies: $cookies"); 275 // parallel (HttpRequestPool)
276 $httpRequest->addHeaders(array('Cookie' => $cookies)); 276 if ($this->method == self::METHOD_REQUEST_POOL) {
277 } 277 $this->debug('Starting parallel fetch (HttpRequestPool)');
278 //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); 278 try {
279 $httpRequest->addHeaders($this->getUserAgent($req_url, true)); 279 while (count($urls) > 0) {
280 // add referer for picky sites 280 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
281 $httpRequest->addheaders(array('Referer' => $this->referer)); 281 $subset = array_splice($urls, 0, $this->maxParallelRequests);
282 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); 282 $pool = new HttpRequestPool();
283 $this->requests[$orig]['original_url'] = $orig; 283 foreach ($subset as $orig => $url) {
284 $pool->attach($httpRequest); 284 if (!$isRedirect) $orig = $url;
285 } 285 unset($this->redirectQueue[$orig]);
286 } 286 $this->debug("...$url");
287 // did we get anything into the pool? 287 if (!$isRedirect && isset($this->requests[$url])) {
288 if (count($pool) > 0) { 288 $this->debug("......in memory");
289 $this->debug('Sending request...'); 289 /*
290 try { 290 } elseif ($this->isCached($url)) {
291 $pool->send(); 291 $this->debug("......is cached");
292 } catch (HttpRequestPoolException $e) { 292 if (!$this->minimiseMemoryUse) {
293 // do nothing 293 $this->requests[$url] = $this->getCached($url);
294 } 294 }
295 $this->debug('Received responses'); 295 */
296 foreach($subset as $orig => $url) { 296 } else {
297 if (!$isRedirect) $orig = $url; 297 $this->debug("......adding to pool");
298 $request = $this->requests[$orig]['httpRequest']; 298 $req_url = $this->rewriteUrls($url);
299 //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); 299 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
300 // getResponseHeader() doesn't return status line, so, for consistency... 300 $req_url = $this->removeFragment($req_url);
301 $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); 301 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
302 // check content type 302 $_meth = HttpRequest::METH_HEAD;
303 // TODO: use getResponseHeader('content-type') or getResponseInfo() 303 } else {
304 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 304 $_meth = HttpRequest::METH_GET;
305 $this->requests[$orig]['body'] = ''; 305 unset($this->requests[$orig]['wrongGuess']);
306 $_header_only_type = true; 306 }
307 $this->debug('Header only type returned'); 307 $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
308 } else { 308 // send cookies, if we have any
309 $this->requests[$orig]['body'] = $request->getResponseBody(); 309 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
310 $_header_only_type = false; 310 $this->debug("......sending cookies: $cookies");
311 } 311 $httpRequest->addHeaders(array('Cookie' => $cookies));
312 $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); 312 }
313 $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); 313 //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
314 // is redirect? 314 $httpRequest->addHeaders($this->getUserAgent($req_url, true));
315 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { 315 // add referer for picky sites
316 $redirectURL = $request->getResponseHeader('location'); 316 $httpRequest->addheaders(array('Referer' => $this->referer));
317 if (!preg_match('!^https?://!i', $redirectURL)) { 317 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
318 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 318 $this->requests[$orig]['original_url'] = $orig;
319 } 319 $pool->attach($httpRequest);
320 if ($this->validateURL($redirectURL)) { 320 }
321 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 321 }
322 // store any cookies 322 // did we get anything into the pool?
323 $cookies = $request->getResponseHeader('set-cookie'); 323 if (count($pool) > 0) {
324 if ($cookies && !is_array($cookies)) $cookies = array($cookies); 324 $this->debug('Sending request...');
325 if ($cookies) $this->cookieJar->storeCookies($url, $cookies); 325 try {
326 $this->redirectQueue[$orig] = $redirectURL; 326 $pool->send();
327 } else { 327 } catch (HttpRequestPoolException $e) {
328 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 328 // do nothing
329 } 329 }
330 } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { 330 $this->debug('Received responses');
331 // the response content-type did not match our 'header only' types, 331 foreach($subset as $orig => $url) {
332 // but we'd issues a HEAD request because we assumed it would. So 332 if (!$isRedirect) $orig = $url;
333 // let's queue a proper GET request for this item... 333 $request = $this->requests[$orig]['httpRequest'];
334 $this->debug('Wrong guess at content-type, queing GET request'); 334 //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
335 $this->requests[$orig]['wrongGuess'] = true; 335 // getResponseHeader() doesn't return status line, so, for consistency...
336 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; 336 $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
337 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 337 // check content type
338 // check for <meta name='fragment' content='!'/> 338 // TODO: use getResponseHeader('content-type') or getResponseInfo()
339 // for AJAX sites, e.g. Blogger with its dynamic views templates. 339 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
340 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 340 $this->requests[$orig]['body'] = '';
341 if (isset($this->requests[$orig]['body'])) { 341 $_header_only_type = true;
342 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 342 $this->debug('Header only type returned');
343 if ($redirectURL) { 343 } else {
344 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 344 $this->requests[$orig]['body'] = $request->getResponseBody();
345 $this->redirectQueue[$orig] = $redirectURL; 345 $_header_only_type = false;
346 } 346 }
347 } 347 $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
348 } 348 $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
349 //die($url.' -multi- '.$request->getResponseInfo('effective_url')); 349 // is redirect?
350 $pool->detach($request); 350 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
351 unset($this->requests[$orig]['httpRequest'], $request); 351 $redirectURL = $request->getResponseHeader('location');
352 /* 352 if (!preg_match('!^https?://!i', $redirectURL)) {
353 if ($this->minimiseMemoryUse) { 353 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
354 if ($this->cache($url)) { 354 }
355 unset($this->requests[$url]); 355 if ($this->validateURL($redirectURL)) {
356 } 356 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
357 } 357 // store any cookies
358 */ 358 $cookies = $request->getResponseHeader('set-cookie');
359 } 359 if ($cookies && !is_array($cookies)) $cookies = array($cookies);
360 } 360 if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
361 } 361 $this->redirectQueue[$orig] = $redirectURL;
362 } catch (HttpException $e) { 362 } else {
363 $this->debug($e); 363 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
364 return false; 364 }
365 } 365 } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
366 } 366 // the response content-type did not match our 'header only' types,
367 367 // but we'd issues a HEAD request because we assumed it would. So
368 ////////////////////////////////////////////////////////// 368 // let's queue a proper GET request for this item...
369 // parallel (curl_multi_*) 369 $this->debug('Wrong guess at content-type, queing GET request');
370 elseif ($this->method == self::METHOD_CURL_MULTI) { 370 $this->requests[$orig]['wrongGuess'] = true;
371 $this->debug('Starting parallel fetch (curl_multi_*)'); 371 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
372 while (count($urls) > 0) { 372 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
373 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); 373 // check for <meta name='fragment' content='!'/>
374 $subset = array_splice($urls, 0, $this->maxParallelRequests); 374 // for AJAX sites, e.g. Blogger with its dynamic views templates.
375 $pool = new RollingCurl(array($this, 'handleCurlResponse')); 375 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
376 $pool->window_size = count($subset); 376 if (isset($this->requests[$orig]['body'])) {
377 377 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
378 foreach ($subset as $orig => $url) { 378 if ($redirectURL) {
379 if (!$isRedirect) $orig = $url; 379 $this->redirectQueue[$orig] = $redirectURL;
380 unset($this->redirectQueue[$orig]); 380 }
381 $this->debug("...$url"); 381 }
382 if (!$isRedirect && isset($this->requests[$url])) { 382 }
383 $this->debug("......in memory"); 383 //die($url.' -multi- '.$request->getResponseInfo('effective_url'));
384 /* 384 $pool->detach($request);
385 } elseif ($this->isCached($url)) { 385 unset($this->requests[$orig]['httpRequest'], $request);
386 $this->debug("......is cached"); 386 /*
387 if (!$this->minimiseMemoryUse) { 387 if ($this->minimiseMemoryUse) {
388 $this->requests[$url] = $this->getCached($url); 388 if ($this->cache($url)) {
389 } 389 unset($this->requests[$url]);
390 */ 390 }
391 } else { 391 }
392 $this->debug("......adding to pool"); 392 */
393 $req_url = $this->rewriteUrls($url); 393 }
394 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 394 }
395 $req_url = $this->removeFragment($req_url); 395 }
396 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { 396 } catch (HttpException $e) {
397 $_meth = 'HEAD'; 397 $this->debug($e);
398 } else { 398 return false;
399 $_meth = 'GET'; 399 }
400 unset($this->requests[$orig]['wrongGuess']); 400 }
401 } 401
402 $headers = array(); 402 //////////////////////////////////////////////////////////
403 //$headers[] = 'User-Agent: '.$this->userAgent; 403 // parallel (curl_multi_*)
404 $headers[] = $this->getUserAgent($req_url); 404 elseif ($this->method == self::METHOD_CURL_MULTI) {
405 // add referer for picky sites 405 $this->debug('Starting parallel fetch (curl_multi_*)');
406 $headers[] = 'Referer: '.$this->referer; 406 while (count($urls) > 0) {
407 // send cookies, if we have any 407 $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
408 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 408 $subset = array_splice($urls, 0, $this->maxParallelRequests);
409 $this->debug("......sending cookies: $cookies"); 409 $pool = new RollingCurl(array($this, 'handleCurlResponse'));
410 $headers[] = 'Cookie: '.$cookies; 410 $pool->window_size = count($subset);
411 } 411
412 $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( 412 foreach ($subset as $orig => $url) {
413 CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], 413 if (!$isRedirect) $orig = $url;
414 CURLOPT_TIMEOUT => $this->requestOptions['timeout'] 414 unset($this->redirectQueue[$orig]);
415 )); 415 $this->debug("...$url");
416 $httpRequest->set_original_url($orig); 416 if (!$isRedirect && isset($this->requests[$url])) {
417 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); 417 $this->debug("......in memory");
418 $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? 418 /*
419 $pool->add($httpRequest); 419 } elseif ($this->isCached($url)) {
420 } 420 $this->debug("......is cached");
421 } 421 if (!$this->minimiseMemoryUse) {
422 // did we get anything into the pool? 422 $this->requests[$url] = $this->getCached($url);
423 if (count($pool) > 0) { 423 }
424 $this->debug('Sending request...'); 424 */
425 $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] 425 } else {
426 $this->debug('Received responses'); 426 $this->debug("......adding to pool");
427 foreach($subset as $orig => $url) { 427 $req_url = $this->rewriteUrls($url);
428 if (!$isRedirect) $orig = $url; 428 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
429 // $this->requests[$orig]['headers'] 429 $req_url = $this->removeFragment($req_url);
430 // $this->requests[$orig]['body'] 430 if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
431 // $this->requests[$orig]['effective_url'] 431 $_meth = 'HEAD';
432 // check content type 432 } else {
433 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 433 $_meth = 'GET';
434 $this->requests[$orig]['body'] = ''; 434 unset($this->requests[$orig]['wrongGuess']);
435 $_header_only_type = true; 435 }
436 $this->debug('Header only type returned'); 436 $headers = array();
437 } else { 437 //$headers[] = 'User-Agent: '.$this->userAgent;
438 $_header_only_type = false; 438 $headers[] = $this->getUserAgent($req_url);
439 } 439 // add referer for picky sites
440 $status_code = $this->requests[$orig]['status_code']; 440 $headers[] = 'Referer: '.$this->referer;
441 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { 441 // send cookies, if we have any
442 $redirectURL = $this->requests[$orig]['location']; 442 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
443 if (!preg_match('!^https?://!i', $redirectURL)) { 443 $this->debug("......sending cookies: $cookies");
444 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 444 $headers[] = 'Cookie: '.$cookies;
445 } 445 }
446 if ($this->validateURL($redirectURL)) { 446 $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
447 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 447 CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
448 // store any cookies 448 CURLOPT_TIMEOUT => $this->requestOptions['timeout']
449 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); 449 ));
450 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); 450 $httpRequest->set_original_url($orig);
451 $this->redirectQueue[$orig] = $redirectURL; 451 $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
452 } else { 452 $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
453 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 453 $pool->add($httpRequest);
454 } 454 }
455 } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { 455 }
456 // the response content-type did not match our 'header only' types, 456 // did we get anything into the pool?
457 // but we'd issues a HEAD request because we assumed it would. So 457 if (count($pool) > 0) {
458 // let's queue a proper GET request for this item... 458 $this->debug('Sending request...');
459 $this->debug('Wrong guess at content-type, queing GET request'); 459 $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
460 $this->requests[$orig]['wrongGuess'] = true; 460 $this->debug('Received responses');
461 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; 461 foreach($subset as $orig => $url) {
462 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 462 if (!$isRedirect) $orig = $url;
463 // check for <meta name='fragment' content='!'/> 463 // $this->requests[$orig]['headers']
464 // for AJAX sites, e.g. Blogger with its dynamic views templates. 464 // $this->requests[$orig]['body']
465 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 465 // $this->requests[$orig]['effective_url']
466 if (isset($this->requests[$orig]['body'])) { 466 // check content type
467 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 467 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
468 if ($redirectURL) { 468 $this->requests[$orig]['body'] = '';
469 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 469 $_header_only_type = true;
470 $this->redirectQueue[$orig] = $redirectURL; 470 $this->debug('Header only type returned');
471 } 471 } else {
472 } 472 $_header_only_type = false;
473 } 473 }
474 // die($url.' -multi- '.$request->getResponseInfo('effective_url')); 474 $status_code = $this->requests[$orig]['status_code'];
475 unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); 475 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
476 } 476 $redirectURL = $this->requests[$orig]['location'];
477 } 477 if (!preg_match('!^https?://!i', $redirectURL)) {
478 } 478 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
479 } 479 }
480 480 if ($this->validateURL($redirectURL)) {
481 ////////////////////////////////////////////////////// 481 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
482 // sequential (file_get_contents) 482 // store any cookies
483 else { 483 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
484 $this->debug('Starting sequential fetch (file_get_contents)'); 484 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
485 $this->debug('Processing set of '.count($urls)); 485 $this->redirectQueue[$orig] = $redirectURL;
486 foreach ($urls as $orig => $url) { 486 } else {
487 if (!$isRedirect) $orig = $url; 487 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
488 unset($this->redirectQueue[$orig]); 488 }
489 $this->debug("...$url"); 489 } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
490 if (!$isRedirect && isset($this->requests[$url])) { 490 // the response content-type did not match our 'header only' types,
491 $this->debug("......in memory"); 491 // but we'd issues a HEAD request because we assumed it would. So
492 /* 492 // let's queue a proper GET request for this item...
493 } elseif ($this->isCached($url)) { 493 $this->debug('Wrong guess at content-type, queing GET request');
494 $this->debug("......is cached"); 494 $this->requests[$orig]['wrongGuess'] = true;
495 if (!$this->minimiseMemoryUse) { 495 $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
496 $this->requests[$url] = $this->getCached($url); 496 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
497 } 497 // check for <meta name='fragment' content='!'/>
498 */ 498 // for AJAX sites, e.g. Blogger with its dynamic views templates.
499 } else { 499 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
500 $this->debug("Sending request for $url"); 500 if (isset($this->requests[$orig]['body'])) {
501 $this->requests[$orig]['original_url'] = $orig; 501 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
502 $req_url = $this->rewriteUrls($url); 502 if ($redirectURL) {
503 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; 503 $this->redirectQueue[$orig] = $redirectURL;
504 $req_url = $this->removeFragment($req_url); 504 }
505 // send cookies, if we have any 505 }
506 $httpContext = $this->httpContext; 506 }
507 $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; 507 // die($url.' -multi- '.$request->getResponseInfo('effective_url'));
508 // add referer for picky sites 508 unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
509 $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; 509 }
510 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { 510 }
511 $this->debug("......sending cookies: $cookies"); 511 }
512 $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; 512 }
513 } 513
514 if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { 514 //////////////////////////////////////////////////////
515 $this->debug('Received response'); 515 // sequential (file_get_contents)
516 // get status code 516 else {
517 if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { 517 $this->debug('Starting sequential fetch (file_get_contents)');
518 $this->debug('Error: no status code found'); 518 $this->debug('Processing set of '.count($urls));
519 // TODO: handle error - no status code 519 foreach ($urls as $orig => $url) {
520 } else { 520 if (!$isRedirect) $orig = $url;
521 $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); 521 unset($this->redirectQueue[$orig]);
522 // check content type 522 $this->debug("...$url");
523 if ($this->headerOnlyType($this->requests[$orig]['headers'])) { 523 if (!$isRedirect && isset($this->requests[$url])) {
524 $this->requests[$orig]['body'] = ''; 524 $this->debug("......in memory");
525 } else { 525 /*
526 $this->requests[$orig]['body'] = $html; 526 } elseif ($this->isCached($url)) {
527 } 527 $this->debug("......is cached");
528 $this->requests[$orig]['effective_url'] = $req_url; 528 if (!$this->minimiseMemoryUse) {
529 $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; 529 $this->requests[$url] = $this->getCached($url);
530 unset($match); 530 }
531 // handle redirect 531 */
532 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { 532 } else {
533 $this->requests[$orig]['location'] = trim($match[1]); 533 $this->debug("Sending request for $url");
534 } 534 $this->requests[$orig]['original_url'] = $orig;
535 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { 535 $req_url = $this->rewriteUrls($url);
536 $redirectURL = $this->requests[$orig]['location']; 536 $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
537 if (!preg_match('!^https?://!i', $redirectURL)) { 537 $req_url = $this->removeFragment($req_url);
538 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); 538 // send cookies, if we have any
539 } 539 $httpContext = $this->httpContext;
540 if ($this->validateURL($redirectURL)) { 540 $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
541 $this->debug('Redirect detected. Valid URL: '.$redirectURL); 541 // add referer for picky sites
542 // store any cookies 542 $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
543 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); 543 if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
544 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); 544 $this->debug("......sending cookies: $cookies");
545 $this->redirectQueue[$orig] = $redirectURL; 545 $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
546 } else { 546 }
547 $this->debug('Redirect detected. Invalid URL: '.$redirectURL); 547 if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
548 } 548 $this->debug('Received response');
549 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { 549 // get status code
550 // check for <meta name='fragment' content='!'/> 550 if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
551 // for AJAX sites, e.g. Blogger with its dynamic views templates. 551 $this->debug('Error: no status code found');
552 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification 552 // TODO: handle error - no status code
553 if (isset($this->requests[$orig]['body'])) { 553 } else {
554 $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); 554 $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
555 if ($redirectURL) { 555 // check content type
556 $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); 556 if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
557 $this->redirectQueue[$orig] = $redirectURL; 557 $this->requests[$orig]['body'] = '';
558 } 558 } else {
559 } 559 $this->requests[$orig]['body'] = $html;
560 } 560 }
561 } 561 $this->requests[$orig]['effective_url'] = $req_url;
562 } else { 562 $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
563 $this->debug('Error retrieving URL'); 563 unset($match);
564 //print_r($req_url); 564 // handle redirect
565 //print_r($http_response_header); 565 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
566 //print_r($html); 566 $this->requests[$orig]['location'] = trim($match[1]);
567 567 }
568 // TODO: handle error - failed to retrieve URL 568 if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
569 } 569 $redirectURL = $this->requests[$orig]['location'];
570 } 570 if (!preg_match('!^https?://!i', $redirectURL)) {
571 } 571 $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
572 } 572 }
573 } 573 if ($this->validateURL($redirectURL)) {
574 574 $this->debug('Redirect detected. Valid URL: '.$redirectURL);
575 public function handleCurlResponse($response, $info, $request) { 575 // store any cookies
576 $orig = $request->url_original; 576 $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
577 $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); 577 if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
578 $this->requests[$orig]['body'] = substr($response, $info['header_size']); 578 $this->redirectQueue[$orig] = $redirectURL;
579 $this->requests[$orig]['method'] = $request->method; 579 } else {
580 $this->requests[$orig]['effective_url'] = $info['url']; 580 $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
581 $this->requests[$orig]['status_code'] = (int)$info['http_code']; 581 }
582 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { 582 } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
583 $this->requests[$orig]['location'] = trim($match[1]); 583 // check for <meta name='fragment' content='!'/>
584 } 584 // for AJAX sites, e.g. Blogger with its dynamic views templates.
585 } 585 // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
586 586 if (isset($this->requests[$orig]['body'])) {
587 protected function headersToString(array $headers, $associative=true) { 587 $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
588 if (!$associative) { 588 if ($redirectURL) {
589 return implode("\n", $headers); 589 $this->redirectQueue[$orig] = $redirectURL;
590 } else { 590 }
591 $str = ''; 591 }
592 foreach ($headers as $key => $val) { 592 }
593 if (is_array($val)) { 593 }
594 foreach ($val as $v) $str .= "$key: $v\n"; 594 } else {
595 } else { 595 $this->debug('Error retrieving URL');
596 $str .= "$key: $val\n"; 596 //print_r($req_url);
597 } 597 //print_r($http_response_header);
598 } 598 //print_r($html);
599 return rtrim($str); 599
600 } 600 // TODO: handle error - failed to retrieve URL
601 } 601 }
602 602 }
603 public function get($url, $remove=false, $gzdecode=true) { 603 }
604 $url = "$url"; 604 }
605 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { 605 }
606 $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); 606
607 $response = $this->requests[$url]; 607 public function handleCurlResponse($response, $info, $request) {
608 /* 608 $orig = $request->url_original;
609 } elseif ($this->isCached($url)) { 609 $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
610 $this->debug("URL already fetched - in disk cache ($url)"); 610 $this->requests[$orig]['body'] = substr($response, $info['header_size']);
611 $response = $this->getCached($url); 611 $this->requests[$orig]['method'] = $request->method;
612 $this->requests[$url] = $response; 612 $this->requests[$orig]['effective_url'] = $info['url'];
613 */ 613 $this->requests[$orig]['status_code'] = (int)$info['http_code'];
614 } else { 614 if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
615 $this->debug("Fetching URL ($url)"); 615 $this->requests[$orig]['location'] = trim($match[1]);
616 $this->fetchAll(array($url)); 616 }
617 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { 617 }
618 $response = $this->requests[$url]; 618
619 } else { 619 protected function headersToString(array $headers, $associative=true) {
620 $this->debug("Request failed"); 620 if (!$associative) {
621 $response = false; 621 return implode("\n", $headers);
622 } 622 } else {
623 } 623 $str = '';
624 /* 624 foreach ($headers as $key => $val) {
625 if ($this->minimiseMemoryUse && $response) { 625 if (is_array($val)) {
626 $this->cache($url); 626 foreach ($val as $v) $str .= "$key: $v\n";
627 unset($this->requests[$url]); 627 } else {
628 } 628 $str .= "$key: $val\n";
629 */ 629 }
630 if ($remove && $response) unset($this->requests[$url]); 630 }
631 if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { 631 return rtrim($str);
632 if ($html = gzdecode($response['body'])) { 632 }
633 $response['body'] = $html; 633 }
634 } 634
635 } 635 public function get($url, $remove=false, $gzdecode=true) {
636 return $response; 636 $url = "$url";
637 } 637 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
638 638 $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
639 public function parallelSupport() { 639 $response = $this->requests[$url];
640 return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); 640 /*
641 } 641 } elseif ($this->isCached($url)) {
642 642 $this->debug("URL already fetched - in disk cache ($url)");
643 private function headerOnlyType($headers) { 643 $response = $this->getCached($url);
644 if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { 644 $this->requests[$url] = $response;
645 // look for full mime type (e.g. image/jpeg) or just type (e.g. image) 645 */
646 $match[1] = strtolower(trim($match[1])); 646 } else {
647 $match[2] = strtolower(trim($match[2])); 647 $this->debug("Fetching URL ($url)");
648 foreach (array($match[1], $match[2]) as $mime) { 648 $this->fetchAll(array($url));
649 if (in_array($mime, $this->headerOnlyTypes)) return true; 649 if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
650 } 650 $response = $this->requests[$url];
651 } 651 } else {
652 return false; 652 $this->debug("Request failed");
653 } 653 $response = false;
654 654 }
655 private function possibleUnsupportedType($url) { 655 }
656 $path = @parse_url($url, PHP_URL_PATH); 656 /*
657 if ($path && strpos($path, '.') !== false) { 657 if ($this->minimiseMemoryUse && $response) {
658 $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); 658 $this->cache($url);
659 return in_array($ext, $this->headerOnlyClues); 659 unset($this->requests[$url]);
660 } 660 }
661 return false; 661 */
662 } 662 if ($remove && $response) unset($this->requests[$url]);
663} 663 if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {
664 664 if ($html = gzdecode($response['body'])) {
665// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 665 $response['body'] = $html;
666if (!function_exists('gzdecode')) { 666 }
667 function gzdecode($data,&$filename='',&$error='',$maxlength=null) 667 }
668 { 668 return $response;
669 $len = strlen($data); 669 }
670 if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { 670
671 $error = "Not in GZIP format."; 671 public function parallelSupport() {
672 return null; // Not GZIP format (See RFC 1952) 672 return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
673 } 673 }
674 $method = ord(substr($data,2,1)); // Compression method 674
675 $flags = ord(substr($data,3,1)); // Flags 675 private function headerOnlyType($headers) {
676 if ($flags & 31 != $flags) { 676 if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {
677 $error = "Reserved bits not allowed."; 677 // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
678 return null; 678 $match[1] = strtolower(trim($match[1]));
679 } 679 $match[2] = strtolower(trim($match[2]));
680 // NOTE: $mtime may be negative (PHP integer limitations) 680 foreach (array($match[1], $match[2]) as $mime) {
681 $mtime = unpack("V", substr($data,4,4)); 681 if (in_array($mime, $this->headerOnlyTypes)) return true;
682 $mtime = $mtime[1]; 682 }
683 $xfl = substr($data,8,1); 683 }
684 $os = substr($data,8,1); 684 return false;
685 $headerlen = 10; 685 }
686 $extralen = 0; 686
687 $extra = ""; 687 private function possibleUnsupportedType($url) {
688 if ($flags & 4) { 688 $path = @parse_url($url, PHP_URL_PATH);
689 // 2-byte length prefixed EXTRA data in header 689 if ($path && strpos($path, '.') !== false) {
690 if ($len - $headerlen - 2 < 8) { 690 $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
691 return false; // invalid 691 return in_array($ext, $this->headerOnlyClues);
692 } 692 }
693 $extralen = unpack("v",substr($data,8,2)); 693 return false;
694 $extralen = $extralen[1]; 694 }
695 if ($len - $headerlen - 2 - $extralen < 8) { 695}
696 return false; // invalid 696
697 } 697// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
698 $extra = substr($data,10,$extralen); 698if (!function_exists('gzdecode')) {
699 $headerlen += 2 + $extralen; 699 function gzdecode($data,&$filename='',&$error='',$maxlength=null)
700 } 700 {
701 $filenamelen = 0; 701 $len = strlen($data);
702 $filename = ""; 702 if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
703 if ($flags & 8) { 703 $error = "Not in GZIP format.";
704 // C-style string 704 return null; // Not GZIP format (See RFC 1952)
705 if ($len - $headerlen - 1 < 8) { 705 }
706 return false; // invalid 706 $method = ord(substr($data,2,1)); // Compression method
707 } 707 $flags = ord(substr($data,3,1)); // Flags
708 $filenamelen = strpos(substr($data,$headerlen),chr(0)); 708 if ($flags & 31 != $flags) {
709 if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { 709 $error = "Reserved bits not allowed.";
710 return false; // invalid 710 return null;
711 } 711 }
712 $filename = substr($data,$headerlen,$filenamelen); 712 // NOTE: $mtime may be negative (PHP integer limitations)
713 $headerlen += $filenamelen + 1; 713 $mtime = unpack("V", substr($data,4,4));
714 } 714 $mtime = $mtime[1];
715 $commentlen = 0; 715 $xfl = substr($data,8,1);
716 $comment = ""; 716 $os = substr($data,8,1);
717 if ($flags & 16) { 717 $headerlen = 10;
718 // C-style string COMMENT data in header 718 $extralen = 0;
719 if ($len - $headerlen - 1 < 8) { 719 $extra = "";
720 return false; // invalid 720 if ($flags & 4) {
721 } 721 // 2-byte length prefixed EXTRA data in header
722 $commentlen = strpos(substr($data,$headerlen),chr(0)); 722 if ($len - $headerlen - 2 < 8) {
723 if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { 723 return false; // invalid
724 return false; // Invalid header format 724 }
725 } 725 $extralen = unpack("v",substr($data,8,2));
726 $comment = substr($data,$headerlen,$commentlen); 726 $extralen = $extralen[1];
727 $headerlen += $commentlen + 1; 727 if ($len - $headerlen - 2 - $extralen < 8) {
728 } 728 return false; // invalid
729 $headercrc = ""; 729 }
730 if ($flags & 2) { 730 $extra = substr($data,10,$extralen);
731 // 2-bytes (lowest order) of CRC32 on header present 731 $headerlen += 2 + $extralen;
732 if ($len - $headerlen - 2 < 8) { 732 }
733 return false; // invalid 733 $filenamelen = 0;
734 } 734 $filename = "";
735 $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; 735 if ($flags & 8) {
736 $headercrc = unpack("v", substr($data,$headerlen,2)); 736 // C-style string
737 $headercrc = $headercrc[1]; 737 if ($len - $headerlen - 1 < 8) {
738 if ($headercrc != $calccrc) { 738 return false; // invalid
739 $error = "Header checksum failed."; 739 }
740 return false; // Bad header CRC 740 $filenamelen = strpos(substr($data,$headerlen),chr(0));
741 } 741 if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
742 $headerlen += 2; 742 return false; // invalid
743 } 743 }
744 // GZIP FOOTER 744 $filename = substr($data,$headerlen,$filenamelen);
745 $datacrc = unpack("V",substr($data,-8,4)); 745 $headerlen += $filenamelen + 1;
746 $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); 746 }
747 $isize = unpack("V",substr($data,-4)); 747 $commentlen = 0;
748 $isize = $isize[1]; 748 $comment = "";
749 // decompression: 749 if ($flags & 16) {
750 $bodylen = $len-$headerlen-8; 750 // C-style string COMMENT data in header
751 if ($bodylen < 1) { 751 if ($len - $headerlen - 1 < 8) {
752 // IMPLEMENTATION BUG! 752 return false; // invalid
753 return null; 753 }
754 } 754 $commentlen = strpos(substr($data,$headerlen),chr(0));
755 $body = substr($data,$headerlen,$bodylen); 755 if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
756 $data = ""; 756 return false; // Invalid header format
757 if ($bodylen > 0) { 757 }
758 switch ($method) { 758 $comment = substr($data,$headerlen,$commentlen);
759 case 8: 759 $headerlen += $commentlen + 1;
760 // Currently the only supported compression method: 760 }
761 $data = gzinflate($body,$maxlength); 761 $headercrc = "";
762 break; 762 if ($flags & 2) {
763 default: 763 // 2-bytes (lowest order) of CRC32 on header present
764 $error = "Unknown compression method."; 764 if ($len - $headerlen - 2 < 8) {
765 return false; 765 return false; // invalid
766 } 766 }
767 } // zero-byte body content is allowed 767 $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
768 // Verifiy CRC32 768 $headercrc = unpack("v", substr($data,$headerlen,2));
769 $crc = sprintf("%u",crc32($data)); 769 $headercrc = $headercrc[1];
770 $crcOK = $crc == $datacrc; 770 if ($headercrc != $calccrc) {
771 $lenOK = $isize == strlen($data); 771 $error = "Header checksum failed.";
772 if (!$lenOK || !$crcOK) { 772 return false; // Bad header CRC
773 $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); 773 }
774 return false; 774 $headerlen += 2;
775 } 775 }
776 return $data; 776 // GZIP FOOTER
777 } 777 $datacrc = unpack("V",substr($data,-8,4));
778} 778 $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
779?> \ No newline at end of file 779 $isize = unpack("V",substr($data,-4));
780 $isize = $isize[1];
781 // decompression:
782 $bodylen = $len-$headerlen-8;
783 if ($bodylen < 1) {
784 // IMPLEMENTATION BUG!
785 return null;
786 }
787 $body = substr($data,$headerlen,$bodylen);
788 $data = "";
789 if ($bodylen > 0) {
790 switch ($method) {
791 case 8:
792 // Currently the only supported compression method:
793 $data = gzinflate($body,$maxlength);
794 break;
795 default:
796 $error = "Unknown compression method.";
797 return false;
798 }
799 } // zero-byte body content is allowed
800 // Verifiy CRC32
801 $crc = sprintf("%u",crc32($data));
802 $crcOK = $crc == $datacrc;
803 $lenOK = $isize == strlen($data);
804 if (!$lenOK || !$crcOK) {
805 $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
806 return false;
807 }
808 return $data;
809 }
810} \ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
index ecd46d5f..c524a1ee 100644
--- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
@@ -1,79 +1,78 @@
1<?php 1<?php
2/** 2/**
3 * Humble HTTP Agent extension for SimplePie_File 3 * Humble HTTP Agent extension for SimplePie_File
4 * 4 *
5 * This class is designed to extend and override SimplePie_File 5 * This class is designed to extend and override SimplePie_File
6 * in order to prevent duplicate HTTP requests being sent out. 6 * in order to prevent duplicate HTTP requests being sent out.
7 * The idea is to initialise an instance of Humble HTTP Agent 7 * The idea is to initialise an instance of Humble HTTP Agent
8 * and attach it, to a static class variable, of this class. 8 * and attach it, to a static class variable, of this class.
9 * SimplePie will then automatically initialise this class 9 * SimplePie will then automatically initialise this class
10 * 10 *
11 * @date 2011-02-28 11 * @date 2011-02-28
12 */ 12 */
13 13
14class SimplePie_HumbleHttpAgent extends SimplePie_File 14class SimplePie_HumbleHttpAgent extends SimplePie_File
15{ 15{
16 protected static $agent; 16 protected static $agent;
17 var $url; 17 var $url;
18 var $useragent; 18 var $useragent;
19 var $success = true; 19 var $success = true;
20 var $headers = array(); 20 var $headers = array();
21 var $body; 21 var $body;
22 var $status_code; 22 var $status_code;
23 var $redirects = 0; 23 var $redirects = 0;
24 var $error; 24 var $error;
25 var $method = SIMPLEPIE_FILE_SOURCE_NONE; 25 var $method = SIMPLEPIE_FILE_SOURCE_NONE;
26 26
27 public static function set_agent(HumbleHttpAgent $agent) { 27 public static function set_agent(HumbleHttpAgent $agent) {
28 self::$agent = $agent; 28 self::$agent = $agent;
29 } 29 }
30 30
31 public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { 31 public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
32 if (class_exists('idna_convert')) 32 if (class_exists('idna_convert'))
33 { 33 {
34 $idn = new idna_convert(); 34 $idn = new idna_convert();
35 $parsed = SimplePie_Misc::parse_url($url); 35 $parsed = SimplePie_Misc::parse_url($url);
36 $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); 36 $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
37 } 37 }
38 $this->url = $url; 38 $this->url = $url;
39 $this->useragent = $useragent; 39 $this->useragent = $useragent;
40 if (preg_match('/^http(s)?:\/\//i', $url)) 40 if (preg_match('/^http(s)?:\/\//i', $url))
41 { 41 {
42 if (!is_array($headers)) 42 if (!is_array($headers))
43 { 43 {
44 $headers = array(); 44 $headers = array();
45 } 45 }
46 $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; 46 $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
47 $headers2 = array(); 47 $headers2 = array();
48 foreach ($headers as $key => $value) { 48 foreach ($headers as $key => $value) {
49 $headers2[] = "$key: $value"; 49 $headers2[] = "$key: $value";
50 } 50 }
51 //TODO: allow for HTTP headers 51 //TODO: allow for HTTP headers
52 // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); 52 // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);
53 53
54 $response = self::$agent->get($url); 54 $response = self::$agent->get($url);
55 55
56 if ($response === false || !isset($response['status_code'])) { 56 if ($response === false || !isset($response['status_code'])) {
57 $this->error = 'failed to fetch URL'; 57 $this->error = 'failed to fetch URL';
58 $this->success = false; 58 $this->success = false;
59 } else { 59 } else {
60 // The extra lines at the end are there to satisfy SimplePie's HTTP parser. 60 // The extra lines at the end are there to satisfy SimplePie's HTTP parser.
61 // The class expects a full HTTP message, whereas we're giving it only 61 // The class expects a full HTTP message, whereas we're giving it only
62 // headers - the new lines indicate the start of the body. 62 // headers - the new lines indicate the start of the body.
63 $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); 63 $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
64 if ($parser->parse()) { 64 if ($parser->parse()) {
65 $this->headers = $parser->headers; 65 $this->headers = $parser->headers;
66 //$this->body = $parser->body; 66 //$this->body = $parser->body;
67 $this->body = $response['body']; 67 $this->body = $response['body'];
68 $this->status_code = $parser->status_code; 68 $this->status_code = $parser->status_code;
69 } 69 }
70 } 70 }
71 } 71 }
72 else 72 else
73 { 73 {
74 $this->error = 'invalid URL'; 74 $this->error = 'invalid URL';
75 $this->success = false; 75 $this->success = false;
76 } 76 }
77 } 77 }
78} 78} \ No newline at end of file
79?> \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
index 09b11546..382d869c 100644
--- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
@@ -6,23 +6,24 @@
6 * Attempts to detect the language of a sample of text by correlating ranked 6 * Attempts to detect the language of a sample of text by correlating ranked
7 * 3-gram frequencies to a table of 3-gram frequencies of known languages. 7 * 3-gram frequencies to a table of 3-gram frequencies of known languages.
8 * 8 *
9 * Implements a version of a technique originally proposed by Cavnar & Trenkle 9 * Implements a version of a technique originally proposed by Cavnar & Trenkle
10 * (1994): "N-Gram-Based Text Categorization" 10 * (1994): "N-Gram-Based Text Categorization"
11 * 11 *
12 * PHP versions 4 and 5 12 * PHP version 5
13 * 13 *
14 * @category Text 14 * @category Text
15 * @package Text_LanguageDetect 15 * @package Text_LanguageDetect
16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 16 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
17 * @copyright 2005-2006 Nicholas Pisarro 17 * @copyright 2005-2006 Nicholas Pisarro
18 * @license http://www.debian.org/misc/bsd.license BSD 18 * @license http://www.debian.org/misc/bsd.license BSD
19 * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ 19 * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
20 * @link http://pear.php.net/package/Text_LanguageDetect/ 20 * @link http://pear.php.net/package/Text_LanguageDetect/
21 * @link http://langdetect.blogspot.com/ 21 * @link http://langdetect.blogspot.com/
22 */ 22 */
23 23
24//require_once 'PEAR.php'; 24require_once 'LanguageDetect/Exception.php';
25require_once 'Parser.php'; 25require_once 'LanguageDetect/Parser.php';
26require_once 'LanguageDetect/ISO639.php';
26 27
27/** 28/**
28 * Language detection class 29 * Language detection class
@@ -41,9 +42,10 @@ require_once 'Parser.php';
41 * 42 *
42 * echo "Supported languages:\n"; 43 * echo "Supported languages:\n";
43 * 44 *
44 * $langs = $l->getLanguages(); 45 * try {
45 * if (PEAR::isError($langs)) { 46 * $langs = $l->getLanguages();
46 * die($langs->getMessage()); 47 * } catch (Text_LanguageDetect_Exception $e) {
48 * die($e->getMessage());
47 * } 49 * }
48 * 50 *
49 * sort($langs); 51 * sort($langs);
@@ -54,38 +56,38 @@ require_once 'Parser.php';
54 * } 56 * }
55 * </code> 57 * </code>
56 * 58 *
57 * @category Text 59 * @category Text
58 * @package Text_LanguageDetect 60 * @package Text_LanguageDetect
59 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> 61 * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
60 * @copyright 2005 Nicholas Pisarro 62 * @copyright 2005 Nicholas Pisarro
61 * @license http://www.debian.org/misc/bsd.license BSD 63 * @license http://www.debian.org/misc/bsd.license BSD
62 * @version Release: @package_version@ 64 * @version Release: @package_version@
63 * @todo allow users to generate their own language models 65 * @link http://pear.php.net/package/Text_LanguageDetect/
66 * @todo allow users to generate their own language models
64 */ 67 */
65
66class Text_LanguageDetect 68class Text_LanguageDetect
67{ 69{
68 /** 70 /**
69 * The filename that stores the trigram data for the detector 71 * The filename that stores the trigram data for the detector
70 * 72 *
71 * If this value starts with a slash (/) or a dot (.) the value of 73 * If this value starts with a slash (/) or a dot (.) the value of
72 * $this->_data_dir will be ignored 74 * $this->_data_dir will be ignored
73 * 75 *
74 * @var string 76 * @var string
75 * @access private 77 * @access private
76 */ 78 */
77 var $_db_filename = './lang.dat'; 79 var $_db_filename = 'lang.dat';
78 80
79 /** 81 /**
80 * The filename that stores the unicode block definitions 82 * The filename that stores the unicode block definitions
81 * 83 *
82 * If this value starts with a slash (/) or a dot (.) the value of 84 * If this value starts with a slash (/) or a dot (.) the value of
83 * $this->_data_dir will be ignored 85 * $this->_data_dir will be ignored
84 * 86 *
85 * @var string 87 * @var string
86 * @access private 88 * @access private
87 */ 89 */
88 var $_unicode_db_filename = './unicode_blocks.dat'; 90 var $_unicode_db_filename = 'unicode_blocks.dat';
89 91
90 /** 92 /**
91 * The data directory 93 * The data directory
@@ -99,11 +101,8 @@ class Text_LanguageDetect
99 101
100 /** 102 /**
101 * The trigram data for comparison 103 * The trigram data for comparison
102 *
103 * Will be loaded on start from $this->_db_filename
104 * 104 *
105 * May be set to a PEAR_Error object if there is an error during its 105 * Will be loaded on start from $this->_db_filename
106 * initialization
107 * 106 *
108 * @var array 107 * @var array
109 * @access private 108 * @access private
@@ -120,7 +119,7 @@ class Text_LanguageDetect
120 119
121 /** 120 /**
122 * The size of the trigram data arrays 121 * The size of the trigram data arrays
123 * 122 *
124 * @var int 123 * @var int
125 * @access private 124 * @access private
126 */ 125 */
@@ -140,7 +139,7 @@ class Text_LanguageDetect
140 139
141 /** 140 /**
142 * Whether or not to simulate perl's Language::Guess exactly 141 * Whether or not to simulate perl's Language::Guess exactly
143 * 142 *
144 * @access private 143 * @access private
145 * @var bool 144 * @var bool
146 * @see setPerlCompatible() 145 * @see setPerlCompatible()
@@ -165,18 +164,24 @@ class Text_LanguageDetect
165 var $_clusters; 164 var $_clusters;
166 165
167 /** 166 /**
167 * Which type of "language names" are accepted and returned:
168 *
169 * 0 - language name ("english")
170 * 2 - 2-letter ISO 639-1 code ("en")
171 * 3 - 3-letter ISO 639-2 code ("eng")
172 */
173 var $_name_mode = 0;
174
175 /**
168 * Constructor 176 * Constructor
169 * 177 *
170 * Will attempt to load the language database. If it fails, you will get 178 * Will attempt to load the language database. If it fails, you will get
171 * a PEAR_Error object returned when you try to use detect() 179 * an exception.
172 *
173 */ 180 */
174 function Text_LanguageDetect($db=null, $unicode_db=null) 181 function __construct()
175 { 182 {
176 if (isset($db)) $this->_db_filename = $db;
177 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
178
179 $data = $this->_readdb($this->_db_filename); 183 $data = $this->_readdb($this->_db_filename);
184 $this->_checkTrigram($data['trigram']);
180 $this->_lang_db = $data['trigram']; 185 $this->_lang_db = $data['trigram'];
181 186
182 if (isset($data['trigram-unicodemap'])) { 187 if (isset($data['trigram-unicodemap'])) {
@@ -186,29 +191,32 @@ class Text_LanguageDetect
186 // Not yet implemented: 191 // Not yet implemented:
187 if (isset($data['trigram-clusters'])) { 192 if (isset($data['trigram-clusters'])) {
188 $this->_clusters = $data['trigram-clusters']; 193 $this->_clusters = $data['trigram-clusters'];
189 } 194 }
190 } 195 }
191 196
192 /** 197 /**
193 * Returns the path to the location of the database 198 * Returns the path to the location of the database
194 * 199 *
195 * @access private 200 * @param string $fname File name to load
196 * @return string expected path to the language model database 201 *
202 * @return string expected path to the language model database
203 * @access private
197 */ 204 */
198 function _get_data_loc($fname) 205 function _get_data_loc($fname)
199 { 206 {
200 return $fname; 207 return dirname(__FILE__).'/'.$fname;
201 } 208 }
202 209
203 /** 210 /**
204 * Loads the language trigram database from filename 211 * Loads the language trigram database from filename
205 * 212 *
206 * Trigram database should be a serialize()'d array 213 * Trigram database should be a serialize()'d array
207 * 214 *
208 * @access private 215 * @param string $fname the filename where the data is stored
209 * @param string $fname the filename where the data is stored 216 *
210 * @return array the language model data 217 * @return array the language model data
211 * @throws PEAR_Error 218 * @throws Text_LanguageDetect_Exception
219 * @access private
212 */ 220 */
213 function _readdb($fname) 221 function _readdb($fname)
214 { 222 {
@@ -217,79 +225,74 @@ class Text_LanguageDetect
217 225
218 // input check 226 // input check
219 if (!file_exists($fname)) { 227 if (!file_exists($fname)) {
220 throw new Exception('Language database does not exist.'); 228 throw new Text_LanguageDetect_Exception(
229 'Language database does not exist: ' . $fname,
230 Text_LanguageDetect_Exception::DB_NOT_FOUND
231 );
221 } elseif (!is_readable($fname)) { 232 } elseif (!is_readable($fname)) {
222 throw new Exception('Language database is not readable.'); 233 throw new Text_LanguageDetect_Exception(
234 'Language database is not readable: ' . $fname,
235 Text_LanguageDetect_Exception::DB_NOT_READABLE
236 );
223 } 237 }
224 238
225 if (function_exists('file_get_contents')) { 239 return unserialize(file_get_contents($fname));
226 return unserialize(file_get_contents($fname));
227 } else {
228 // if you don't have file_get_contents(),
229 // then this is the next fastest way
230 ob_start();
231 readfile($fname);
232 $contents = ob_get_contents();
233 ob_end_clean();
234 return unserialize($contents);
235 }
236 } 240 }
237 241
238 242
239 /** 243 /**
240 * Checks if this object is ready to detect languages 244 * Checks if this object is ready to detect languages
241 * 245 *
242 * @access private 246 * @param array $trigram Trigram data from database
243 * @param mixed &$err error object to be returned by reference, if any 247 *
244 * @return bool true if no errors 248 * @return void
249 * @access private
245 */ 250 */
246 function _setup_ok(&$err) 251 function _checkTrigram($trigram)
247 { 252 {
248 if (!is_array($this->_lang_db)) { 253 if (!is_array($trigram)) {
249 if (ini_get('magic_quotes_runtime')) { 254 if (ini_get('magic_quotes_runtime')) {
250 throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); 255 throw new Text_LanguageDetect_Exception(
251 } else { 256 'Error loading database. Try turning magic_quotes_runtime off.',
252 throw new Exception('Language database is not an array.'); 257 Text_LanguageDetect_Exception::MAGIC_QUOTES
258 );
253 } 259 }
254 return false; 260 throw new Text_LanguageDetect_Exception(
255 261 'Language database is not an array.',
256 } elseif (empty($this->_lang_db)) { 262 Text_LanguageDetect_Exception::DB_NOT_ARRAY
257 throw new Exception('Language database has no elements.'); 263 );
258 return false; 264 } elseif (empty($trigram)) {
259 265 throw new Text_LanguageDetect_Exception(
260 } else { 266 'Language database has no elements.',
261 return true; 267 Text_LanguageDetect_Exception::DB_EMPTY
268 );
262 } 269 }
263 } 270 }
264 271
265 /** 272 /**
266 * Omits languages 273 * Omits languages
267 * 274 *
268 * Pass this function the name of or an array of names of 275 * Pass this function the name of or an array of names of
269 * languages that you don't want considered 276 * languages that you don't want considered
270 * 277 *
271 * If you're only expecting a limited set of languages, this can greatly 278 * If you're only expecting a limited set of languages, this can greatly
272 * speed up processing 279 * speed up processing
273 * 280 *
274 * @access public 281 * @param mixed $omit_list language name or array of names to omit
275 * @param mixed $omit_list language name or array of names to omit 282 * @param bool $include_only if true will include (rather than
276 * @param bool $include_only if true will include (rather than 283 * exclude) only those in the list
277 * exclude) only those in the list 284 *
278 * @return int number of languages successfully deleted 285 * @return int number of languages successfully deleted
279 * @throws PEAR_Error 286 * @throws Text_LanguageDetect_Exception
280 */ 287 */
281 function omitLanguages($omit_list, $include_only = false) 288 public function omitLanguages($omit_list, $include_only = false)
282 { 289 {
283
284 // setup check
285 if (!$this->_setup_ok($err)) {
286 return $err;
287 }
288
289 $deleted = 0; 290 $deleted = 0;
290 291
291 // deleting the given languages 292 $omit_list = $this->_convertFromNameMode($omit_list);
293
292 if (!$include_only) { 294 if (!$include_only) {
295 // deleting the given languages
293 if (!is_array($omit_list)) { 296 if (!is_array($omit_list)) {
294 $omit_list = strtolower($omit_list); // case desensitize 297 $omit_list = strtolower($omit_list); // case desensitize
295 if (isset($this->_lang_db[$omit_list])) { 298 if (isset($this->_lang_db[$omit_list])) {
@@ -301,12 +304,12 @@ class Text_LanguageDetect
301 if (isset($this->_lang_db[$omit_lang])) { 304 if (isset($this->_lang_db[$omit_lang])) {
302 unset($this->_lang_db[$omit_lang]); 305 unset($this->_lang_db[$omit_lang]);
303 $deleted++; 306 $deleted++;
304 } 307 }
305 } 308 }
306 } 309 }
307 310
308 // deleting all except the given languages
309 } else { 311 } else {
312 // deleting all except the given languages
310 if (!is_array($omit_list)) { 313 if (!is_array($omit_list)) {
311 $omit_list = array($omit_list); 314 $omit_list = array($omit_list);
312 } 315 }
@@ -327,7 +330,7 @@ class Text_LanguageDetect
327 // reset the cluster cache if the number of languages changes 330 // reset the cluster cache if the number of languages changes
328 // this will then have to be recalculated 331 // this will then have to be recalculated
329 if (isset($this->_clusters) && $deleted > 0) { 332 if (isset($this->_clusters) && $deleted > 0) {
330 unset($this->_clusters); 333 $this->_clusters = null;
331 } 334 }
332 335
333 return $deleted; 336 return $deleted;
@@ -339,49 +342,40 @@ class Text_LanguageDetect
339 * 342 *
340 * @access public 343 * @access public
341 * @return int the number of languages 344 * @return int the number of languages
342 * @throws PEAR_Error 345 * @throws Text_LanguageDetect_Exception
343 */ 346 */
344 function getLanguageCount() 347 function getLanguageCount()
345 { 348 {
346 if (!$this->_setup_ok($err)) { 349 return count($this->_lang_db);
347 return $err;
348 } else {
349 return count($this->_lang_db);
350 }
351 } 350 }
352 351
353 /** 352 /**
354 * Returns true if a given language exists 353 * Checks if the language with the given name exists in the database
355 * 354 *
356 * If passed an array of names, will return true only if all exist 355 * @param mixed $lang Language name or array of language names
357 * 356 *
358 * @access public 357 * @return bool true if language model exists
359 * @param mixed $lang language name or array of language names
360 * @return bool true if language model exists
361 * @throws PEAR_Error
362 */ 358 */
363 function languageExists($lang) 359 public function languageExists($lang)
364 { 360 {
365 if (!$this->_setup_ok($err)) { 361 $lang = $this->_convertFromNameMode($lang);
366 return $err;
367 } else {
368 // string
369 if (is_string($lang)) {
370 return isset($this->_lang_db[strtolower($lang)]);
371
372 // array
373 } elseif (is_array($lang)) {
374 foreach ($lang as $test_lang) {
375 if (!isset($this->_lang_db[strtolower($test_lang)])) {
376 return false;
377 }
378 }
379 return true;
380 362
381 // other (error) 363 if (is_string($lang)) {
382 } else { 364 return isset($this->_lang_db[strtolower($lang)]);
383 throw new Exception('Unknown type passed to languageExists()'); 365
366 } elseif (is_array($lang)) {
367 foreach ($lang as $test_lang) {
368 if (!isset($this->_lang_db[strtolower($test_lang)])) {
369 return false;
370 }
384 } 371 }
372 return true;
373
374 } else {
375 throw new Text_LanguageDetect_Exception(
376 'Unsupported parameter type passed to languageExists()',
377 Text_LanguageDetect_Exception::PARAM_TYPE
378 );
385 } 379 }
386 } 380 }
387 381
@@ -389,25 +383,24 @@ class Text_LanguageDetect
389 * Returns the list of detectable languages 383 * Returns the list of detectable languages
390 * 384 *
391 * @access public 385 * @access public
392 * @return array the names of the languages known to this object 386 * @return array the names of the languages known to this object
393 * @throws PEAR_Error 387 * @throws Text_LanguageDetect_Exception
394 */ 388 */
395 function getLanguages() 389 function getLanguages()
396 { 390 {
397 if (!$this->_setup_ok($err)) { 391 return $this->_convertToNameMode(
398 return $err; 392 array_keys($this->_lang_db)
399 } else { 393 );
400 return array_keys($this->_lang_db);
401 }
402 } 394 }
403 395
404 /** 396 /**
405 * Make this object behave like Language::Guess 397 * Make this object behave like Language::Guess
406 * 398 *
407 * @access public 399 * @param bool $setting false to turn off perl compatibility
408 * @param bool $setting false to turn off perl compatibility 400 *
401 * @return void
409 */ 402 */
410 function setPerlCompatible($setting = true) 403 public function setPerlCompatible($setting = true)
411 { 404 {
412 if (is_bool($setting)) { // input check 405 if (is_bool($setting)) { // input check
413 $this->_perl_compatible = $setting; 406 $this->_perl_compatible = $setting;
@@ -422,6 +415,21 @@ class Text_LanguageDetect
422 } 415 }
423 416
424 /** 417 /**
418 * Sets the way how language names are accepted and returned.
419 *
420 * @param integer $name_mode One of the following modes:
421 * 0 - language name ("english")
422 * 2 - 2-letter ISO 639-1 code ("en")
423 * 3 - 3-letter ISO 639-2 code ("eng")
424 *
425 * @return void
426 */
427 function setNameMode($name_mode)
428 {
429 $this->_name_mode = $name_mode;
430 }
431
432 /**
425 * Whether to use unicode block ranges in detection 433 * Whether to use unicode block ranges in detection
426 * 434 *
427 * Should speed up most detections if turned on (default is on). In some 435 * Should speed up most detections if turned on (default is on). In some
@@ -429,10 +437,11 @@ class Text_LanguageDetect
429 * in languages that use latin scripts. In other cases it should speed up 437 * in languages that use latin scripts. In other cases it should speed up
430 * detection noticeably. 438 * detection noticeably.
431 * 439 *
432 * @access public 440 * @param bool $setting false to turn off
433 * @param bool $setting false to turn off 441 *
442 * @return void
434 */ 443 */
435 function useUnicodeBlocks($setting = true) 444 public function useUnicodeBlocks($setting = true)
436 { 445 {
437 if (is_bool($setting)) { 446 if (is_bool($setting)) {
438 $this->_use_unicode_narrowing = $setting; 447 $this->_use_unicode_narrowing = $setting;
@@ -442,15 +451,15 @@ class Text_LanguageDetect
442 /** 451 /**
443 * Converts a piece of text into trigrams 452 * Converts a piece of text into trigrams
444 * 453 *
445 * Superseded by the Text_LanguageDetect_Parser class 454 * @param string $text text to convert
446 * 455 *
447 * @access private 456 * @return array array of trigram frequencies
448 * @param string $text text to convert 457 * @access private
449 * @return array array of trigram frequencies 458 * @deprecated Superseded by the Text_LanguageDetect_Parser class
450 */ 459 */
451 function _trigram($text) 460 function _trigram($text)
452 { 461 {
453 $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); 462 $s = new Text_LanguageDetect_Parser($text);
454 $s->prepareTrigram(); 463 $s->prepareTrigram();
455 $s->prepareUnicode(false); 464 $s->prepareUnicode(false);
456 $s->setPadStart(!$this->_perl_compatible); 465 $s->setPadStart(!$this->_perl_compatible);
@@ -463,11 +472,12 @@ class Text_LanguageDetect
463 * 472 *
464 * Thresholds (cuts off) the list at $this->_threshold 473 * Thresholds (cuts off) the list at $this->_threshold
465 * 474 *
466 * @access protected 475 * @param array $arr array of trigram
467 * @param array $arr array of trgram 476 *
468 * @return array ranks of trigrams 477 * @return array ranks of trigrams
478 * @access protected
469 */ 479 */
470 function _arr_rank(&$arr) 480 function _arr_rank($arr)
471 { 481 {
472 482
473 // sorts alphabetically first as a standard way of breaking rank ties 483 // sorts alphabetically first as a standard way of breaking rank ties
@@ -494,14 +504,17 @@ class Text_LanguageDetect
494 504
495 /** 505 /**
496 * Sorts an array by value breaking ties alphabetically 506 * Sorts an array by value breaking ties alphabetically
497 * 507 *
498 * @access private 508 * @param array &$arr the array to sort
499 * @param array &$arr the array to sort 509 *
510 * @return void
511 * @access private
500 */ 512 */
501 function _bub_sort(&$arr) 513 function _bub_sort(&$arr)
502 { 514 {
503 // should do the same as this perl statement: 515 // should do the same as this perl statement:
504 // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } 516 // sort { $trigrams{$b} == $trigrams{$a}
517 // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
505 518
506 // needs to sort by both key and value at once 519 // needs to sort by both key and value at once
507 // using the key to break ties for the value 520 // using the key to break ties for the value
@@ -528,13 +541,14 @@ class Text_LanguageDetect
528 /** 541 /**
529 * Sort function used by bubble sort 542 * Sort function used by bubble sort
530 * 543 *
531 * Callback function for usort(). 544 * Callback function for usort().
532 * 545 *
533 * @access private 546 * @param array $a first param passed by usort()
534 * @param array first param passed by usort() 547 * @param array $b second param passed by usort()
535 * @param array second param passed by usort() 548 *
536 * @return int 1 if $a is greater, -1 if not 549 * @return int 1 if $a is greater, -1 if not
537 * @see _bub_sort() 550 * @see _bub_sort()
551 * @access private
538 */ 552 */
539 function _sort_func($a, $b) 553 function _sort_func($a, $b)
540 { 554 {
@@ -542,12 +556,12 @@ class Text_LanguageDetect
542 list($a_key, $a_value) = $a; 556 list($a_key, $a_value) = $a;
543 list($b_key, $b_value) = $b; 557 list($b_key, $b_value) = $b;
544 558
545 // if the values are the same, break ties using the key
546 if ($a_value == $b_value) { 559 if ($a_value == $b_value) {
560 // if the values are the same, break ties using the key
547 return strcmp($a_key, $b_key); 561 return strcmp($a_key, $b_key);
548 562
549 // if not, just sort normally
550 } else { 563 } else {
564 // if not, just sort normally
551 if ($a_value > $b_value) { 565 if ($a_value > $b_value) {
552 return -1; 566 return -1;
553 } else { 567 } else {
@@ -559,23 +573,24 @@ class Text_LanguageDetect
559 } 573 }
560 574
561 /** 575 /**
562 * Calculates a linear rank-order distance statistic between two sets of 576 * Calculates a linear rank-order distance statistic between two sets of
563 * ranked trigrams 577 * ranked trigrams
564 * 578 *
565 * Sums the differences in rank for each trigram. If the trigram does not 579 * Sums the differences in rank for each trigram. If the trigram does not
566 * appear in both, consider it a difference of $this->_threshold. 580 * appear in both, consider it a difference of $this->_threshold.
567 * 581 *
568 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite 582 * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
569 * its simplicity it has been shown to be highly accurate for language 583 * its simplicity it has been shown to be highly accurate for language
570 * identification tasks. 584 * identification tasks.
571 * 585 *
572 * @access private 586 * @param array $arr1 the reference set of trigram ranks
573 * @param array $arr1 the reference set of trigram ranks 587 * @param array $arr2 the target set of trigram ranks
574 * @param array $arr2 the target set of trigram ranks 588 *
575 * @return int the sum of the differences between the ranks of 589 * @return int the sum of the differences between the ranks of
576 * the two trigram sets 590 * the two trigram sets
591 * @access private
577 */ 592 */
578 function _distance(&$arr1, &$arr2) 593 function _distance($arr1, $arr2)
579 { 594 {
580 $sumdist = 0; 595 $sumdist = 0;
581 596
@@ -598,14 +613,15 @@ class Text_LanguageDetect
598 613
599 /** 614 /**
600 * Normalizes the score returned by _distance() 615 * Normalizes the score returned by _distance()
601 * 616 *
602 * Different if perl compatible or not 617 * Different if perl compatible or not
603 * 618 *
604 * @access private 619 * @param int $score the score from _distance()
605 * @param int $score the score from _distance() 620 * @param int $base_count the number of trigrams being considered
606 * @param int $base_count the number of trigrams being considered 621 *
607 * @return float the normalized score 622 * @return float the normalized score
608 * @see _distance() 623 * @see _distance()
624 * @access private
609 */ 625 */
610 function _normalize_score($score, $base_count = null) 626 function _normalize_score($score, $base_count = null)
611 { 627 {
@@ -630,29 +646,24 @@ class Text_LanguageDetect
630 * 646 *
631 * If perl compatible, the score is 300-0, 0 being most similar. 647 * If perl compatible, the score is 300-0, 0 being most similar.
632 * Otherwise, it's 0-1 with 1 being most similar. 648 * Otherwise, it's 0-1 with 1 being most similar.
633 * 649 *
634 * The $sample text should be at least a few sentences in length; 650 * The $sample text should be at least a few sentences in length;
635 * should be ascii-7 or utf8 encoded, if another and the mbstring extension 651 * should be ascii-7 or utf8 encoded, if another and the mbstring extension
636 * is present it will try to detect and convert. However, experience has 652 * is present it will try to detect and convert. However, experience has
637 * shown that mb_detect_encoding() *does not work very well* with at least 653 * shown that mb_detect_encoding() *does not work very well* with at least
638 * some types of encoding. 654 * some types of encoding.
639 * 655 *
640 * @access public 656 * @param string $sample a sample of text to compare.
641 * @param string $sample a sample of text to compare. 657 * @param int $limit if specified, return an array of the most likely
642 * @param int $limit if specified, return an array of the most likely 658 * $limit languages and their scores.
643 * $limit languages and their scores. 659 *
644 * @return mixed sorted array of language scores, blank array if no 660 * @return mixed sorted array of language scores, blank array if no
645 * useable text was found, or PEAR_Error if error 661 * useable text was found
646 * with the object setup 662 * @see _distance()
647 * @see _distance() 663 * @throws Text_LanguageDetect_Exception
648 * @throws PEAR_Error
649 */ 664 */
650 function detect($sample, $limit = 0) 665 public function detect($sample, $limit = 0)
651 { 666 {
652 if (!$this->_setup_ok($err)) {
653 return $err;
654 }
655
656 // input check 667 // input check
657 if (!Text_LanguageDetect_Parser::validateString($sample)) { 668 if (!Text_LanguageDetect_Parser::validateString($sample)) {
658 return array(); 669 return array();
@@ -660,36 +671,27 @@ class Text_LanguageDetect
660 671
661 // check char encoding 672 // check char encoding
662 // (only if mbstring extension is compiled and PHP > 4.0.6) 673 // (only if mbstring extension is compiled and PHP > 4.0.6)
663 if (function_exists('mb_detect_encoding') 674 if (function_exists('mb_detect_encoding')
664 && function_exists('mb_convert_encoding')) { 675 && function_exists('mb_convert_encoding')
665 676 ) {
666 // mb_detect_encoding isn't very reliable, to say the least 677 // mb_detect_encoding isn't very reliable, to say the least
667 // detection should still work with a sufficient sample of ascii characters 678 // detection should still work with a sufficient sample
679 // of ascii characters
668 $encoding = mb_detect_encoding($sample); 680 $encoding = mb_detect_encoding($sample);
669 681
670 // mb_detect_encoding() will return FALSE if detection fails 682 // mb_detect_encoding() will return FALSE if detection fails
671 // don't attempt conversion if that's the case 683 // don't attempt conversion if that's the case
672 if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { 684 if ($encoding != 'ASCII' && $encoding != 'UTF-8'
673 685 && $encoding !== false
674 if (function_exists('mb_list_encodings')) { 686 ) {
675 687 // verify the encoding exists in mb_list_encodings
676 // verify the encoding exists in mb_list_encodings 688 if (in_array($encoding, mb_list_encodings())) {
677 if (in_array($encoding, mb_list_encodings())) { 689 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
678 $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
679 }
680
681 // if the previous condition failed:
682 // somehow we detected an encoding that also we don't support
683
684 } else {
685 // php 4 doesnt have mb_list_encodings()
686 // so attempt with error suppression
687 $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);
688 } 690 }
689 } 691 }
690 } 692 }
691 693
692 $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); 694 $sample_obj = new Text_LanguageDetect_Parser($sample);
693 $sample_obj->prepareTrigram(); 695 $sample_obj->prepareTrigram();
694 if ($this->_use_unicode_narrowing) { 696 if ($this->_use_unicode_narrowing) {
695 $sample_obj->prepareUnicode(); 697 $sample_obj->prepareUnicode();
@@ -713,7 +715,10 @@ class Text_LanguageDetect
713 if (is_array($blocks)) { 715 if (is_array($blocks)) {
714 $present_blocks = array_keys($blocks); 716 $present_blocks = array_keys($blocks);
715 } else { 717 } else {
716 throw new Exception('Error during block detection'); 718 throw new Text_LanguageDetect_Exception(
719 'Error during block detection',
720 Text_LanguageDetect_Exception::BLOCK_DETECTION
721 );
717 } 722 }
718 723
719 $possible_langs = array(); 724 $possible_langs = array();
@@ -731,30 +736,30 @@ class Text_LanguageDetect
731 } 736 }
732 737
733 // could also try an intersect operation rather than a union 738 // could also try an intersect operation rather than a union
734 // in other words, choose languages whose trigrams contain 739 // in other words, choose languages whose trigrams contain
735 // ALL of the unicode blocks found in this sample 740 // ALL of the unicode blocks found in this sample
736 // would improve speed but would be completely thrown off by an 741 // would improve speed but would be completely thrown off by an
737 // unexpected character, like an umlaut appearing in english text 742 // unexpected character, like an umlaut appearing in english text
738 743
739 $possible_langs = array_intersect( 744 $possible_langs = array_intersect(
740 array_keys($this->_lang_db), 745 array_keys($this->_lang_db),
741 array_unique($possible_langs) 746 array_unique($possible_langs)
742 ); 747 );
743 748
744 // needs to intersect it with the keys of _lang_db in case 749 // needs to intersect it with the keys of _lang_db in case
745 // languages have been omitted 750 // languages have been omitted
746 751
747 // or just try 'em all
748 } else { 752 } else {
753 // or just try 'em all
749 $possible_langs = array_keys($this->_lang_db); 754 $possible_langs = array_keys($this->_lang_db);
750 } 755 }
751 756
752 757
753 foreach ($possible_langs as $lang) { 758 foreach ($possible_langs as $lang) {
754 $scores[$lang] = 759 $scores[$lang] = $this->_normalize_score(
755 $this->_normalize_score( 760 $this->_distance($this->_lang_db[$lang], $trigram_freqs),
756 $this->_distance($this->_lang_db[$lang], $trigram_freqs), 761 $trigram_count
757 $trigram_count); 762 );
758 } 763 }
759 764
760 unset($sample_obj); 765 unset($sample_obj);
@@ -772,7 +777,6 @@ class Text_LanguageDetect
772 $limited_scores = array(); 777 $limited_scores = array();
773 778
774 $i = 0; 779 $i = 0;
775
776 foreach ($scores as $key => $value) { 780 foreach ($scores as $key => $value) {
777 if ($i++ >= $limit) { 781 if ($i++ >= $limit) {
778 break; 782 break;
@@ -781,9 +785,9 @@ class Text_LanguageDetect
781 $limited_scores[$key] = $value; 785 $limited_scores[$key] = $value;
782 } 786 }
783 787
784 return $limited_scores; 788 return $this->_convertToNameMode($limited_scores, true);
785 } else { 789 } else {
786 return $scores; 790 return $this->_convertToNameMode($scores, true);
787 } 791 }
788 } 792 }
789 793
@@ -791,35 +795,33 @@ class Text_LanguageDetect
791 * Returns only the most similar language to the text sample 795 * Returns only the most similar language to the text sample
792 * 796 *
793 * Calls $this->detect() and returns only the top result 797 * Calls $this->detect() and returns only the top result
794 * 798 *
795 * @access public 799 * @param string $sample text to detect the language of
796 * @param string $sample text to detect the language of 800 *
797 * @return string the name of the most likely language 801 * @return string the name of the most likely language
798 * or null if no language is similar 802 * or null if no language is similar
799 * @see detect() 803 * @see detect()
800 * @throws PEAR_Error 804 * @throws Text_LanguageDetect_Exception
801 */ 805 */
802 function detectSimple($sample) 806 public function detectSimple($sample)
803 { 807 {
804 $scores = $this->detect($sample, 1); 808 $scores = $this->detect($sample, 1);
805 809
806 // if top language has the maximum possible score, 810 // if top language has the maximum possible score,
807 // then the top score will have been picked at random 811 // then the top score will have been picked at random
808 if ( !is_array($scores) 812 if (!is_array($scores) || empty($scores)
809 || empty($scores) 813 || current($scores) == $this->_max_score
810 || current($scores) == $this->_max_score) { 814 ) {
811
812 return null; 815 return null;
813
814 } else { 816 } else {
815 return ucfirst(key($scores)); 817 return key($scores);
816 } 818 }
817 } 819 }
818 820
819 /** 821 /**
820 * Returns an array containing the most similar language and a confidence 822 * Returns an array containing the most similar language and a confidence
821 * rating 823 * rating
822 * 824 *
823 * Confidence is a simple measure calculated from the similarity score 825 * Confidence is a simple measure calculated from the similarity score
824 * minus the similarity score from the next most similar language 826 * minus the similarity score from the next most similar language
825 * divided by the highest possible score. Languages that have closely 827 * divided by the highest possible score. Languages that have closely
@@ -827,46 +829,43 @@ class Text_LanguageDetect
827 * confidence scores. 829 * confidence scores.
828 * 830 *
829 * The similarity score answers the question "How likely is the text the 831 * The similarity score answers the question "How likely is the text the
830 * returned language regardless of the other languages considered?" The 832 * returned language regardless of the other languages considered?" The
831 * confidence score is one way of answering the question "how likely is the 833 * confidence score is one way of answering the question "how likely is the
832 * text the detected language relative to the rest of the language model 834 * text the detected language relative to the rest of the language model
833 * set?" 835 * set?"
834 * 836 *
835 * To see how similar languages are a priori, see languageSimilarity() 837 * To see how similar languages are a priori, see languageSimilarity()
836 * 838 *
837 * @access public 839 * @param string $sample text for which language will be detected
838 * @param string $sample text for which language will be detected 840 *
839 * @return array most similar language, score and confidence rating 841 * @return array most similar language, score and confidence rating
840 * or null if no language is similar 842 * or null if no language is similar
841 * @see detect() 843 * @see detect()
842 * @throws PEAR_Error 844 * @throws Text_LanguageDetect_Exception
843 */ 845 */
844 function detectConfidence($sample) 846 public function detectConfidence($sample)
845 { 847 {
846 $scores = $this->detect($sample, 2); 848 $scores = $this->detect($sample, 2);
847 849
848 // if most similar language has the max score, it 850 // if most similar language has the max score, it
849 // will have been picked at random 851 // will have been picked at random
850 if ( !is_array($scores) 852 if (!is_array($scores) || empty($scores)
851 || empty($scores) 853 || current($scores) == $this->_max_score
852 || current($scores) == $this->_max_score) { 854 ) {
853
854 return null; 855 return null;
855 } 856 }
856 857
857 $arr['language'] = ucfirst(key($scores)); 858 $arr['language'] = key($scores);
858 $arr['similarity'] = current($scores); 859 $arr['similarity'] = current($scores);
859 if (next($scores) !== false) { // if false then no next element 860 if (next($scores) !== false) { // if false then no next element
860 // the goal is to return a higher value if the distance between 861 // the goal is to return a higher value if the distance between
861 // the similarity of the first score and the second score is high 862 // the similarity of the first score and the second score is high
862 863
863 if ($this->_perl_compatible) { 864 if ($this->_perl_compatible) {
864 865 $arr['confidence'] = (current($scores) - $arr['similarity'])
865 $arr['confidence'] = 866 / $this->_max_score;
866 (current($scores) - $arr['similarity']) / $this->_max_score;
867 867
868 } else { 868 } else {
869
870 $arr['confidence'] = $arr['similarity'] - current($scores); 869 $arr['confidence'] = $arr['similarity'] - current($scores);
871 870
872 } 871 }
@@ -882,32 +881,26 @@ class Text_LanguageDetect
882 * Returns the distribution of unicode blocks in a given utf8 string 881 * Returns the distribution of unicode blocks in a given utf8 string
883 * 882 *
884 * For the block name of a single char, use unicodeBlockName() 883 * For the block name of a single char, use unicodeBlockName()
885 * 884 *
886 * @access public 885 * @param string $str input string. Must be ascii or utf8
887 * @param string $str input string. Must be ascii or utf8 886 * @param bool $skip_symbols if true, skip ascii digits, symbols and
888 * @param bool $skip_symbols if true, skip ascii digits, symbols and 887 * non-printing characters. Includes spaces,
889 * non-printing characters. Includes spaces, 888 * newlines and common punctuation characters.
890 * newlines and common punctuation characters. 889 *
891 * @return array 890 * @return array
892 * @throws PEAR_Error 891 * @throws Text_LanguageDetect_Exception
893 */ 892 */
894 function detectUnicodeBlocks($str, $skip_symbols) 893 public function detectUnicodeBlocks($str, $skip_symbols)
895 { 894 {
896 // input check 895 $skip_symbols = (bool)$skip_symbols;
897 if (!is_bool($skip_symbols)) { 896 $str = (string)$str;
898 throw new Exception('Second parameter must be boolean');
899 }
900
901 if (!is_string($str)) {
902 throw new Exception('First parameter was not a string');
903 }
904 897
905 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 898 $sample_obj = new Text_LanguageDetect_Parser($str);
906 $sample_obj->prepareUnicode(); 899 $sample_obj->prepareUnicode();
907 $sample_obj->prepareTrigram(false); 900 $sample_obj->prepareTrigram(false);
908 $sample_obj->setUnicodeSkipSymbols($skip_symbols); 901 $sample_obj->setUnicodeSkipSymbols($skip_symbols);
909 $sample_obj->analyze(); 902 $sample_obj->analyze();
910 $blocks =& $sample_obj->getUnicodeBlocks(); 903 $blocks = $sample_obj->getUnicodeBlocks();
911 unset($sample_obj); 904 unset($sample_obj);
912 return $blocks; 905 return $blocks;
913 } 906 }
@@ -915,38 +908,37 @@ class Text_LanguageDetect
915 /** 908 /**
916 * Returns the block name for a given unicode value 909 * Returns the block name for a given unicode value
917 * 910 *
918 * If passed a string, will assume it is being passed a UTF8-formatted 911 * If passed a string, will assume it is being passed a UTF8-formatted
919 * character and will automatically convert. Otherwise it will assume it 912 * character and will automatically convert. Otherwise it will assume it
920 * is being passed a numeric unicode value. 913 * is being passed a numeric unicode value.
921 * 914 *
922 * Make sure input is of the correct type! 915 * Make sure input is of the correct type!
923 * 916 *
924 * @access public
925 * @param mixed $unicode unicode value or utf8 char 917 * @param mixed $unicode unicode value or utf8 char
918 *
926 * @return mixed the block name string or false if not found 919 * @return mixed the block name string or false if not found
927 * @throws PEAR_Error 920 * @throws Text_LanguageDetect_Exception
928 */ 921 */
929 function unicodeBlockName($unicode) { 922 public function unicodeBlockName($unicode)
923 {
930 if (is_string($unicode)) { 924 if (is_string($unicode)) {
931 // assume it is being passed a utf8 char, so convert it 925 // assume it is being passed a utf8 char, so convert it
932 926 if (self::utf8strlen($unicode) > 1) {
933 // input check 927 throw new Text_LanguageDetect_Exception(
934 if ($this->utf8strlen($unicode) > 1) { 928 'Pass a single char only to this method',
935 throw new Exception('Pass this function only a single char'); 929 Text_LanguageDetect_Exception::PARAM_TYPE
930 );
936 } 931 }
937
938 $unicode = $this->_utf8char2unicode($unicode); 932 $unicode = $this->_utf8char2unicode($unicode);
939 933
940 if ($unicode == -1) {
941 throw new Exception('Malformatted char');
942 }
943
944 // input check
945 } elseif (!is_int($unicode)) { 934 } elseif (!is_int($unicode)) {
946 throw new Exception('Input must be of type string or int.'); 935 throw new Text_LanguageDetect_Exception(
936 'Input must be of type string or int.',
937 Text_LanguageDetect_Exception::PARAM_TYPE
938 );
947 } 939 }
948 940
949 $blocks =& $this->_read_unicode_block_db(); 941 $blocks = $this->_read_unicode_block_db();
950 942
951 $result = $this->_unicode_block_name($unicode, $blocks); 943 $result = $this->_unicode_block_name($unicode, $blocks);
952 944
@@ -964,14 +956,17 @@ class Text_LanguageDetect
964 * the public interface for this function, which does input checks which 956 * the public interface for this function, which does input checks which
965 * this function omits for speed. 957 * this function omits for speed.
966 * 958 *
967 * @access protected 959 * @param int $unicode the unicode value
968 * @param int $unicode the unicode value 960 * @param array $blocks the block database
969 * @param array &$blocks the block database 961 * @param int $block_count the number of defined blocks in the database
970 * @param int $block_count the number of defined blocks in the database 962 *
971 * @see unicodeBlockName() 963 * @return mixed Block name, -1 if it failed
964 * @see unicodeBlockName()
965 * @access protected
972 */ 966 */
973 function _unicode_block_name($unicode, &$blocks, $block_count = -1) { 967 function _unicode_block_name($unicode, $blocks, $block_count = -1)
974 // for a reference, see 968 {
969 // for a reference, see
975 // http://www.unicode.org/Public/UNIDATA/Blocks.txt 970 // http://www.unicode.org/Public/UNIDATA/Blocks.txt
976 971
977 // assume that ascii characters are the most common 972 // assume that ascii characters are the most common
@@ -994,35 +989,36 @@ class Text_LanguageDetect
994 while ($low <= $high) { 989 while ($low <= $high) {
995 $mid = floor(($low + $high) / 2); 990 $mid = floor(($low + $high) / 2);
996 991
997 // if it's lower than the lower bound
998 if ($unicode < $blocks[$mid][0]) { 992 if ($unicode < $blocks[$mid][0]) {
993 // if it's lower than the lower bound
999 $high = $mid - 1; 994 $high = $mid - 1;
1000 995
1001 // if it's higher than the upper bound
1002 } elseif ($unicode > $blocks[$mid][1]) { 996 } elseif ($unicode > $blocks[$mid][1]) {
997 // if it's higher than the upper bound
1003 $low = $mid + 1; 998 $low = $mid + 1;
1004 999
1005 // found it
1006 } else { 1000 } else {
1001 // found it
1007 return $blocks[$mid]; 1002 return $blocks[$mid];
1008 } 1003 }
1009 } 1004 }
1010 1005
1011 // failed to find the block 1006 // failed to find the block
1012 return -1; 1007 return -1;
1013 1008
1014 // todo: differentiate when it's out of range or when it falls 1009 // todo: differentiate when it's out of range or when it falls
1015 // into an unassigned range? 1010 // into an unassigned range?
1016 } 1011 }
1017 1012
1018 /** 1013 /**
1019 * Brings up the unicode block database 1014 * Brings up the unicode block database
1020 * 1015 *
1021 * @access protected
1022 * @return array the database of unicode block definitions 1016 * @return array the database of unicode block definitions
1023 * @throws PEAR_Error 1017 * @throws Text_LanguageDetect_Exception
1018 * @access protected
1024 */ 1019 */
1025 function &_read_unicode_block_db() { 1020 function _read_unicode_block_db()
1021 {
1026 // since the unicode definitions are always going to be the same, 1022 // since the unicode definitions are always going to be the same,
1027 // might as well share the memory for the db with all other instances 1023 // might as well share the memory for the db with all other instances
1028 // of this class 1024 // of this class
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect
1037 1033
1038 /** 1034 /**
1039 * Calculate the similarities between the language models 1035 * Calculate the similarities between the language models
1040 * 1036 *
1041 * Use this function to see how similar languages are to each other. 1037 * Use this function to see how similar languages are to each other.
1042 * 1038 *
1043 * If passed 2 language names, will return just those languages compared. 1039 * If passed 2 language names, will return just those languages compared.
1044 * If passed 1 language name, will return that language compared to 1040 * If passed 1 language name, will return that language compared to
1045 * all others. 1041 * all others.
1046 * If passed none, will return an array of every language model compared 1042 * If passed none, will return an array of every language model compared
1047 * to every other one. 1043 * to every other one.
1048 * 1044 *
1049 * @access public 1045 * @param string $lang1 the name of the first language to be compared
1050 * @param string $lang1 the name of the first language to be compared 1046 * @param string $lang2 the name of the second language to be compared
1051 * @param string $lang2 the name of the second language to be compared 1047 *
1052 * @return array scores of every language compared 1048 * @return array scores of every language compared
1053 * or the score of just the provided languages 1049 * or the score of just the provided languages
1054 * or null if one of the supplied languages does not exist 1050 * or null if one of the supplied languages does not exist
1055 * @throws PEAR_Error 1051 * @throws Text_LanguageDetect_Exception
1056 */ 1052 */
1057 function languageSimilarity($lang1 = null, $lang2 = null) 1053 public function languageSimilarity($lang1 = null, $lang2 = null)
1058 { 1054 {
1059 if (!$this->_setup_ok($err)) { 1055 $lang1 = $this->_convertFromNameMode($lang1);
1060 return $err; 1056 $lang2 = $this->_convertFromNameMode($lang2);
1061 }
1062
1063 if ($lang1 != null) { 1057 if ($lang1 != null) {
1064 $lang1 = strtolower($lang1); 1058 $lang1 = strtolower($lang1);
1065 1059
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect
1069 } 1063 }
1070 1064
1071 if ($lang2 != null) { 1065 if ($lang2 != null) {
1072 1066 if (!isset($this->_lang_db[$lang2])) {
1073 // can't only set the second param 1067 // check if language model exists
1074 if ($lang1 == null) {
1075 return null;
1076 // check if language model exists
1077 } elseif (!isset($this->_lang_db[$lang2])) {
1078 return null; 1068 return null;
1079 } 1069 }
1080 1070
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect
1088 ) 1078 )
1089 ); 1079 );
1090 1080
1091
1092 // compare just $lang1 to all languages
1093 } else { 1081 } else {
1082 // compare just $lang1 to all languages
1094 $return_arr = array(); 1083 $return_arr = array();
1095 foreach ($this->_lang_db as $key => $value) { 1084 foreach ($this->_lang_db as $key => $value) {
1096 if ($key != $lang1) { // don't compare a language to itself 1085 if ($key != $lang1) {
1086 // don't compare a language to itself
1097 $return_arr[$key] = $this->_normalize_score( 1087 $return_arr[$key] = $this->_normalize_score(
1098 $this->_distance($this->_lang_db[$lang1], $value)); 1088 $this->_distance($this->_lang_db[$lang1], $value)
1089 );
1099 } 1090 }
1100 } 1091 }
1101 asort($return_arr); 1092 asort($return_arr);
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect
1104 } 1095 }
1105 1096
1106 1097
1107 // compare all languages to each other
1108 } else { 1098 } else {
1099 // compare all languages to each other
1109 $return_arr = array(); 1100 $return_arr = array();
1110 foreach (array_keys($this->_lang_db) as $lang1) { 1101 foreach (array_keys($this->_lang_db) as $lang1) {
1111 foreach (array_keys($this->_lang_db) as $lang2) { 1102 foreach (array_keys($this->_lang_db) as $lang2) {
1112
1113 // skip comparing languages to themselves 1103 // skip comparing languages to themselves
1114 if ($lang1 != $lang2) { 1104 if ($lang1 != $lang2) {
1115
1116 // don't re-calculate what's already been done
1117 if (isset($return_arr[$lang2][$lang1])) {
1118 1105
1119 $return_arr[$lang1][$lang2] = 1106 if (isset($return_arr[$lang2][$lang1])) {
1120 $return_arr[$lang2][$lang1]; 1107 // don't re-calculate what's already been done
1108 $return_arr[$lang1][$lang2]
1109 = $return_arr[$lang2][$lang1];
1121 1110
1122 // calculate
1123 } else { 1111 } else {
1124 1112 // calculate
1125 $return_arr[$lang1][$lang2] = 1113 $return_arr[$lang1][$lang2]
1126 $this->_normalize_score( 1114 = $this->_normalize_score(
1127 $this->_distance( 1115 $this->_distance(
1128 $this->_lang_db[$lang1], 1116 $this->_lang_db[$lang1],
1129 $this->_lang_db[$lang2] 1117 $this->_lang_db[$lang2]
1130 ) 1118 )
1131 ); 1119 );
1132 1120
1133 } 1121 }
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect
1150 * 1138 *
1151 * @access public 1139 * @access public
1152 * @return array language cluster data 1140 * @return array language cluster data
1153 * @throws PEAR_Error 1141 * @throws Text_LanguageDetect_Exception
1154 * @see languageSimilarity() 1142 * @see languageSimilarity()
1155 * @deprecated this function will eventually be removed and placed into 1143 * @deprecated this function will eventually be removed and placed into
1156 * the model generation class 1144 * the model generation class
1157 */ 1145 */
1158 function clusterLanguages() 1146 function clusterLanguages()
1159 { 1147 {
1160 // todo: set the maximum number of clusters 1148 // todo: set the maximum number of clusters
1161
1162 // setup check
1163 if (!$this->_setup_ok($err)) {
1164 return $err;
1165 }
1166
1167 // return cached result, if any 1149 // return cached result, if any
1168 if (isset($this->_clusters)) { 1150 if (isset($this->_clusters)) {
1169 return $this->_clusters; 1151 return $this->_clusters;
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect
1177 1159
1178 foreach ($langs as $lang) { 1160 foreach ($langs as $lang) {
1179 if (!isset($this->_lang_db[$lang])) { 1161 if (!isset($this->_lang_db[$lang])) {
1180 throw new Exception("missing $lang!\n"); 1162 throw new Text_LanguageDetect_Exception(
1163 "missing $lang!",
1164 Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
1165 );
1181 } 1166 }
1182 } 1167 }
1183 1168
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect
1186 $langs[$lang1] = $lang1; 1171 $langs[$lang1] = $lang1;
1187 unset($langs[$old_key]); 1172 unset($langs[$old_key]);
1188 } 1173 }
1189 1174
1175 $result_data = $really_map = array();
1176
1190 $i = 0; 1177 $i = 0;
1191 while (count($langs) > 2 && $i++ < 200) { 1178 while (count($langs) > 2 && $i++ < 200) {
1192 $highest_score = -1; 1179 $highest_score = -1;
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect
1194 $highest_key2 = ''; 1181 $highest_key2 = '';
1195 foreach ($langs as $lang1) { 1182 foreach ($langs as $lang1) {
1196 foreach ($langs as $lang2) { 1183 foreach ($langs as $lang2) {
1197 if ( $lang1 != $lang2 1184 if ($lang1 != $lang2
1198 && $arr[$lang1][$lang2] > $highest_score) { 1185 && $arr[$lang1][$lang2] > $highest_score
1186 ) {
1199 $highest_score = $arr[$lang1][$lang2]; 1187 $highest_score = $arr[$lang1][$lang2];
1200 $highest_key1 = $lang1; 1188 $highest_key1 = $lang1;
1201 $highest_key2 = $lang2; 1189 $highest_key2 = $lang2;
1202 } 1190 }
1203 } 1191 }
1204 } 1192 }
1205 1193
1206 if (!$highest_key1) { 1194 if (!$highest_key1) {
1207 // should not ever happen 1195 // should not ever happen
1208 throw new Exception("no highest key? (step: $i)"); 1196 throw new Text_LanguageDetect_Exception(
1197 "no highest key? (step: $i)",
1198 Text_LanguageDetect_Exception::NO_HIGHEST_KEY
1199 );
1209 } 1200 }
1210 1201
1211 if ($highest_score == 0) { 1202 if ($highest_score == 0) {
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect
1217 $sum1 = array_sum($arr[$highest_key1]); 1208 $sum1 = array_sum($arr[$highest_key1]);
1218 $sum2 = array_sum($arr[$highest_key2]); 1209 $sum2 = array_sum($arr[$highest_key2]);
1219 1210
1220 // use the score for the one that is most similar to the rest of 1211 // use the score for the one that is most similar to the rest of
1221 // the field as the score for the group 1212 // the field as the score for the group
1222 // todo: could try averaging or "centroid" method instead 1213 // todo: could try averaging or "centroid" method instead
1223 // seems like that might make more sense 1214 // seems like that might make more sense
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect
1248 $really_lang = $replaceme; 1239 $really_lang = $replaceme;
1249 while (isset($really_map[$really_lang])) { 1240 while (isset($really_map[$really_lang])) {
1250 $really_lang = $really_map[$really_lang]; 1241 $really_lang = $really_map[$really_lang];
1251 } 1242 }
1252 $really_map[$newkey] = $really_lang; 1243 $really_map[$newkey] = $really_lang;
1253 1244
1254 1245
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect
1259 $arr[$key1][$newkey] = $arr[$key1][$key2]; 1250 $arr[$key1][$newkey] = $arr[$key1][$key2];
1260 unset($arr[$key1][$key2]); 1251 unset($arr[$key1][$key2]);
1261 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] 1252 // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
1262 } 1253 }
1263 1254
1264 if ($key1 == $replaceme) { 1255 if ($key1 == $replaceme) {
1265 $arr[$newkey][$key2] = $arr[$key1][$key2]; 1256 $arr[$newkey][$key2] = $arr[$key1][$key2];
1266 unset($arr[$key1][$key2]); 1257 unset($arr[$key1][$key2]);
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect
1273 } 1264 }
1274 } 1265 }
1275 } 1266 }
1276 1267
1277 1268
1278 unset($langs[$highest_key1]); 1269 unset($langs[$highest_key1]);
1279 unset($langs[$highest_key2]); 1270 unset($langs[$highest_key2]);
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect
1293 } 1284 }
1294 1285
1295 $return_val = array( 1286 $return_val = array(
1296 'open_forks' => $langs, 1287 'open_forks' => $langs,
1297 // the top level of clusters 1288 // the top level of clusters
1298 // clusters that are mutually exclusive 1289 // clusters that are mutually exclusive
1299 // or specified by a specific maximum 1290 // or specified by a specific maximum
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect
1323 * use, and it may disappear or its functionality may change in future 1314 * use, and it may disappear or its functionality may change in future
1324 * releases without notice. 1315 * releases without notice.
1325 * 1316 *
1326 * This compares the sample text to the top level of clusters. If the 1317 * This compares the sample text to the top level of clusters. If the
1327 * sample is similar to the cluster it will drop down and compare it to the 1318 * sample is similar to the cluster it will drop down and compare it to the
1328 * languages in the cluster, and so on until it hits a leaf node. 1319 * languages in the cluster, and so on until it hits a leaf node.
1329 * 1320 *
1330 * this should find the language in considerably fewer compares 1321 * this should find the language in considerably fewer compares
1331 * (the equivalent of a binary search), however clusterLanguages() is costly 1322 * (the equivalent of a binary search), however clusterLanguages() is costly
1332 * and the loss of accuracy from this technique is significant. 1323 * and the loss of accuracy from this technique is significant.
1333 * 1324 *
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect
1337 * was very large, however in such cases some method of Bayesian inference 1328 * was very large, however in such cases some method of Bayesian inference
1338 * might be more helpful. 1329 * might be more helpful.
1339 * 1330 *
1340 * @see clusterLanguages() 1331 * @param string $str input string
1341 * @access public 1332 *
1342 * @param string $str input string 1333 * @return array language scores (only those compared)
1343 * @return array language scores (only those compared) 1334 * @throws Text_LanguageDetect_Exception
1344 * @throws PEAR_Error 1335 * @see clusterLanguages()
1345 */ 1336 */
1346 function clusteredSearch($str) 1337 public function clusteredSearch($str)
1347 { 1338 {
1348
1349 // input check 1339 // input check
1350 if (!Text_LanguageDetect_Parser::validateString($str)) { 1340 if (!Text_LanguageDetect_Parser::validateString($str)) {
1351 return array(); 1341 return array();
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect
1359 $dendogram_data = $result['fork_data']; 1349 $dendogram_data = $result['fork_data'];
1360 $dendogram_alias = $result['name_map']; 1350 $dendogram_alias = $result['name_map'];
1361 1351
1362 $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); 1352 $sample_obj = new Text_LanguageDetect_Parser($str);
1363 $sample_obj->prepareTrigram(); 1353 $sample_obj->prepareTrigram();
1364 $sample_obj->setPadStart(!$this->_perl_compatible); 1354 $sample_obj->setPadStart(!$this->_perl_compatible);
1365 $sample_obj->analyze(); 1355 $sample_obj->analyze();
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect
1372 } 1362 }
1373 1363
1374 $i = 0; // counts the number of steps 1364 $i = 0; // counts the number of steps
1375 1365
1376 foreach ($dendogram_start as $lang) { 1366 foreach ($dendogram_start as $lang) {
1377 if (isset($dendogram_alias[$lang])) { 1367 if (isset($dendogram_alias[$lang])) {
1378 $lang_key = $dendogram_alias[$lang]; 1368 $lang_key = $dendogram_alias[$lang];
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect
1382 1372
1383 $scores[$lang] = $this->_normalize_score( 1373 $scores[$lang] = $this->_normalize_score(
1384 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1374 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1385 $sample_count); 1375 $sample_count
1376 );
1386 1377
1387 $i++; 1378 $i++;
1388 } 1379 }
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect
1411 1402
1412 $scores[$lang] = $this->_normalize_score( 1403 $scores[$lang] = $this->_normalize_score(
1413 $this->_distance($this->_lang_db[$lang_key], $sample_result), 1404 $this->_distance($this->_lang_db[$lang_key], $sample_result),
1414 $sample_count); 1405 $sample_count
1406 );
1415 1407
1416 //todo: does not need to do same comparison again 1408 //todo: does not need to do same comparison again
1417 } 1409 }
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect
1428 1420
1429 $diff = $scores[$cur_key] - $scores[$loser_key]; 1421 $diff = $scores[$cur_key] - $scores[$loser_key];
1430 1422
1431 // $cur_key ({$dendogram_alias[$cur_key]}) wins 1423 // $cur_key ({$dendogram_alias[$cur_key]}) wins
1432 // over $loser_key ({$dendogram_alias[$loser_key]}) 1424 // over $loser_key ({$dendogram_alias[$loser_key]})
1433 // with a difference of $diff 1425 // with a difference of $diff
1434 } 1426 }
1435 1427
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect
1439 // which paths the algorithm decided to take along the tree 1431 // which paths the algorithm decided to take along the tree
1440 1432
1441 // but sometimes the last item is only the second highest 1433 // but sometimes the last item is only the second highest
1442 if ( ($this->_perl_compatible && (end($scores) > prev($scores))) 1434 if (($this->_perl_compatible && (end($scores) > prev($scores)))
1443 || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { 1435 || (!$this->_perl_compatible && (end($scores) < prev($scores)))
1444 1436 ) {
1445 $real_last_score = current($scores); 1437 $real_last_score = current($scores);
1446 $real_last_key = key($scores); 1438 $real_last_key = key($scores);
1447 1439
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect
1449 unset($scores[$real_last_key]); 1441 unset($scores[$real_last_key]);
1450 $scores[$real_last_key] = $real_last_score; 1442 $scores[$real_last_key] = $real_last_score;
1451 } 1443 }
1452 1444
1453 1445
1454 if (!$this->_perl_compatible) { 1446 if (!$this->_perl_compatible) {
1455 $scores = array_reverse($scores, true); 1447 $scores = array_reverse($scores, true);
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect
1464 * 1456 *
1465 * Returns the number of characters (not bytes) in a utf8 string 1457 * Returns the number of characters (not bytes) in a utf8 string
1466 * 1458 *
1467 * @static 1459 * @param string $str string to get the length of
1468 * @access public 1460 *
1469 * @param string $str string to get the length of 1461 * @return int number of chars
1470 * @return int number of chars
1471 */ 1462 */
1472 function utf8strlen($str) 1463 public static function utf8strlen($str)
1473 { 1464 {
1474 // utf8_decode() will convert unknown chars to '?', which is actually 1465 // utf8_decode() will convert unknown chars to '?', which is actually
1475 // ideal for counting. 1466 // ideal for counting.
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect
1482 /** 1473 /**
1483 * Returns the unicode value of a utf8 char 1474 * Returns the unicode value of a utf8 char
1484 * 1475 *
1485 * @access protected 1476 * @param string $char a utf8 (possibly multi-byte) char
1486 * @param string $char a utf8 (possibly multi-byte) char 1477 *
1487 * @return int unicode value or -1 if malformatted 1478 * @return int unicode value
1479 * @access protected
1480 * @link http://en.wikipedia.org/wiki/UTF-8
1488 */ 1481 */
1489 function _utf8char2unicode($char) { 1482 function _utf8char2unicode($char)
1490 1483 {
1491 // strlen() here will actually get the binary length of a single char 1484 // strlen() here will actually get the binary length of a single char
1492 switch (strlen($char)) { 1485 switch (strlen($char)) {
1493 1486 case 1:
1494 // for a reference, see http://en.wikipedia.org/wiki/UTF-8 1487 // normal ASCII-7 byte
1495 1488 // 0xxxxxxx --> 0xxxxxxx
1496 case 1: 1489 return ord($char{0});
1497 // normal ASCII-7 byte 1490
1498 // 0xxxxxxx --> 0xxxxxxx 1491 case 2:
1499 return ord($char{0}); 1492 // 2 byte unicode
1500 1493 // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
1501 case 2: 1494 $z = (ord($char{0}) & 0x000001F) << 6;
1502 // 2 byte unicode 1495 $x = (ord($char{1}) & 0x0000003F);
1503 // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx 1496 return ($z | $x);
1504 $z = (ord($char{0}) & 0x000001F) << 6; 1497
1505 $x = (ord($char{1}) & 0x0000003F); 1498 case 3:
1506 1499 // 3 byte unicode
1507 return ($z | $x); 1500 // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
1508 1501 $z = (ord($char{0}) & 0x0000000F) << 12;
1509 case 3: 1502 $x1 = (ord($char{1}) & 0x0000003F) << 6;
1510 // 3 byte unicode 1503 $x2 = (ord($char{2}) & 0x0000003F);
1511 // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx 1504 return ($z | $x1 | $x2);
1512 $z = (ord($char{0}) & 0x0000000F) << 12; 1505
1513 $x1 = (ord($char{1}) & 0x0000003F) << 6; 1506 case 4:
1514 $x2 = (ord($char{2}) & 0x0000003F); 1507 // 4 byte unicode
1515 1508 // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
1516 return ($z | $x1 | $x2); 1509 // 000zzzzz xxxxxxxx xxxxxxxx
1517 1510 $z1 = (ord($char{0}) & 0x00000007) << 18;
1518 case 4: 1511 $z2 = (ord($char{1}) & 0x0000003F) << 12;
1519 // 4 byte unicode 1512 $x1 = (ord($char{2}) & 0x0000003F) << 6;
1520 // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> 1513 $x2 = (ord($char{3}) & 0x0000003F);
1521 // 000zzzzz xxxxxxxx xxxxxxxx 1514 return ($z1 | $z2 | $x1 | $x2);
1522 $z1 = (ord($char{0}) & 0x00000007) << 18;
1523 $z2 = (ord($char{1}) & 0x0000003F) << 12;
1524 $x1 = (ord($char{2}) & 0x0000003F) << 6;
1525 $x2 = (ord($char{3}) & 0x0000003F);
1526
1527 return ($z1 | $z2 | $x1 | $x2);
1528
1529 default:
1530 // error: malformatted char?
1531 return -1;
1532 } 1515 }
1533 } 1516 }
1534 1517
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect
1536 * utf8-safe fast character iterator 1519 * utf8-safe fast character iterator
1537 * 1520 *
1538 * Will get the next character starting from $counter, which will then be 1521 * Will get the next character starting from $counter, which will then be
1539 * incremented. If a multi-byte char the bytes will be concatenated and 1522 * incremented. If a multi-byte char the bytes will be concatenated and
1540 * $counter will be incremented by the number of bytes in the char. 1523 * $counter will be incremented by the number of bytes in the char.
1541 * 1524 *
1542 * @access private 1525 * @param string $str the string being iterated over
1543 * @param string &$str the string being iterated over 1526 * @param int &$counter the iterator, will increment by reference
1544 * @param int &$counter the iterator, will increment by reference 1527 * @param bool $special_convert whether to do special conversions
1545 * @param bool $special_convert whether to do special conversions 1528 *
1546 * @return char the next (possibly multi-byte) char from $counter 1529 * @return char the next (possibly multi-byte) char from $counter
1530 * @access private
1547 */ 1531 */
1548 function _next_char(&$str, &$counter, $special_convert = false) 1532 static function _next_char($str, &$counter, $special_convert = false)
1549 { 1533 {
1550
1551 $char = $str{$counter++}; 1534 $char = $str{$counter++};
1552 $ord = ord($char); 1535 $ord = ord($char);
1553 1536
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect
1556 1539
1557 // normal ascii one byte char 1540 // normal ascii one byte char
1558 if ($ord <= 127) { 1541 if ($ord <= 127) {
1559
1560 // special conversions needed for this package 1542 // special conversions needed for this package
1561 // (that only apply to regular ascii characters) 1543 // (that only apply to regular ascii characters)
1562 // lower case, and convert all non-alphanumeric characters 1544 // lower case, and convert all non-alphanumeric characters
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect
1571 1553
1572 return $char; 1554 return $char;
1573 1555
1574 // multi-byte chars
1575 } elseif ($ord >> 5 == 6) { // two-byte char 1556 } elseif ($ord >> 5 == 6) { // two-byte char
1557 // multi-byte chars
1576 $nextchar = $str{$counter++}; // get next byte 1558 $nextchar = $str{$counter++}; // get next byte
1577 1559
1578 // lower-casing of non-ascii characters is still incomplete 1560 // lower-casing of non-ascii characters is still incomplete
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect
1582 if ($ord == 195) { 1564 if ($ord == 195) {
1583 $nextord = ord($nextchar); 1565 $nextord = ord($nextchar);
1584 $nextord_adj = $nextord + 64; 1566 $nextord_adj = $nextord + 64;
1585 // for a reference, see 1567 // for a reference, see
1586 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html 1568 // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
1587 1569
1588 // &Agrave; - &THORN; but not &times; 1570 // &Agrave; - &THORN; but not &times;
1589 if ( $nextord_adj >= 192 1571 if ($nextord_adj >= 192
1590 && $nextord_adj <= 222 1572 && $nextord_adj <= 222
1591 && $nextord_adj != 215) { 1573 && $nextord_adj != 215
1592 1574 ) {
1593 $nextchar = chr($nextord + 32); 1575 $nextchar = chr($nextord + 32);
1594 } 1576 }
1595 1577
1596 // lower case cyrillic alphabet
1597 } elseif ($ord == 208) { 1578 } elseif ($ord == 208) {
1579 // lower case cyrillic alphabet
1598 $nextord = ord($nextchar); 1580 $nextord = ord($nextchar);
1599 // if A - Pe 1581 // if A - Pe
1600 if ($nextord >= 144 && $nextord <= 159) { 1582 if ($nextord >= 144 && $nextord <= 159) {
1601 // lower case 1583 // lower case
1602 $nextchar = chr($nextord + 32); 1584 $nextchar = chr($nextord + 32);
1603 1585
1604 // if Er - Ya
1605 } elseif ($nextord >= 160 && $nextord <= 175) { 1586 } elseif ($nextord >= 160 && $nextord <= 175) {
1587 // if Er - Ya
1606 // lower case 1588 // lower case
1607 $char = chr(209); // == $ord++ 1589 $char = chr(209); // == $ord++
1608 $nextchar = chr($nextord - 32); 1590 $nextchar = chr($nextord - 32);
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect
1611 } 1593 }
1612 1594
1613 // tag on next byte 1595 // tag on next byte
1614 return $char . $nextchar; 1596 return $char . $nextchar;
1615
1616 } elseif ($ord >> 4 == 14) { // three-byte char 1597 } elseif ($ord >> 4 == 14) { // three-byte char
1617 1598
1618 // tag on next 2 bytes 1599 // tag on next 2 bytes
1619 return $char . $str{$counter++} . $str{$counter++}; 1600 return $char . $str{$counter++} . $str{$counter++};
1620 1601
1621 } elseif ($ord >> 3 == 30) { // four-byte char 1602 } elseif ($ord >> 3 == 30) { // four-byte char
1622 1603
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect
1628 } 1609 }
1629 } 1610 }
1630 1611
1631} 1612 /**
1613 * Converts a $lang input parameter from the configured mode
1614 * to the language name that is used internally.
1615 *
1616 * Works for strings and arrays.
1617 *
1618 * @param string|array $lang A language description ("english"/"en"/"eng")
1619 * @param boolean $convertKey If $lang is an array, setting $convertKey
1620 * converts the keys to the language name.
1621 *
1622 * @return string|array Language name
1623 */
function _convertFromNameMode($lang, $convertKey = false)
{
    // Name mode 0: the caller already uses the internal language names.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Mode 2 uses the two-letter lookup; every other non-zero mode uses
    // the three-letter lookup.
    $lookup = ($this->_name_mode == 2) ? 'code2ToName' : 'code3ToName';

    // A single code: convert it directly (unknown codes become '').
    if (is_string($lang)) {
        return (string)Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // A list/map of codes: convert either the keys or the values.
    $converted = array();
    foreach ($lang as $key => $val) {
        $name = (string)Text_LanguageDetect_ISO639::$lookup(
            $convertKey ? $key : $val
        );
        if ($convertKey) {
            $converted[$name] = $val;
        } else {
            $converted[$key] = $name;
        }
    }
    return $converted;
}
1632 1651
1633/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 1652 /**
1653 * Converts a $lang output parameter from the language name that is
1654 * used internally to the configured mode.
1655 *
1656 * Works for strings and arrays.
1657 *
1658 * @param string|array $lang A language description ("english"/"en"/"eng")
1659 * @param boolean $convertKey If $lang is an array, setting $convertKey
1660 * converts the keys to the configured mode.
1661 *
1662 * @return string|array Language name or code, depending on the configured mode
1663 */
function _convertToNameMode($lang, $convertKey = false)
{
    // Name mode 0: results are reported with the internal language names.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Mode 2 emits two-letter codes; every other non-zero mode emits
    // three-letter codes.
    $lookup = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    // A single language name: convert it directly (no string cast here,
    // so a failed lookup stays NULL).
    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // A list/map of names: convert either the keys or the values.
    $converted = array();
    foreach ($lang as $key => $val) {
        $code = Text_LanguageDetect_ISO639::$lookup($convertKey ? $key : $val);
        if ($convertKey) {
            $converted[$code] = $val;
        } else {
            $converted[$key] = $code;
        }
    }
    return $converted;
}
1691}
1634 1692
1635?> 1693/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
new file mode 100644
index 00000000..196d994f
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
1<?php
class Text_LanguageDetect_Exception extends Exception
{
    /** The database file could not be found. */
    const DB_NOT_FOUND = 10;

    /** The database file was found but is not readable. */
    const DB_NOT_READABLE = 11;

    /** The database file is empty. */
    const DB_EMPTY = 12;

    /** The database content is not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are activated. */
    const MAGIC_QUOTES = 14;

    /** A parameter of an invalid type was passed to a method. */
    const PARAM_TYPE = 20;

    /** A character in a parameter is invalid. */
    const INVALID_CHAR = 21;

    /** The language is not in the database. */
    const UNKNOWN_LANGUAGE = 30;

    /** An error occurred during block detection. */
    const BLOCK_DETECTION = 40;

    /** An error occurred while clustering languages. */
    const NO_HIGHEST_KEY = 50;
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
new file mode 100644
index 00000000..05b0590d
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
1<?php
2/**
3 * Part of Text_LanguageDetect
4 *
5 * PHP version 5
6 *
7 * @category Text
8 * @package Text_LanguageDetect
9 * @author Christian Weiske <cweiske@php.net>
10 * @copyright 2011 Christian Weiske <cweiske@php.net>
11 * @license http://www.debian.org/misc/bsd.license BSD
12 * @version SVN: $Id$
13 * @link http://pear.php.net/package/Text_LanguageDetect/
14 */
15
16/**
17 * Provides a mapping between the languages from lang.dat and the
18 * ISO 639-1 and ISO-639-2 codes.
19 *
20 * Note that this class contains only languages that exist in lang.dat.
21 *
22 * @category Text
23 * @package Text_LanguageDetect
24 * @author Christian Weiske <cweiske@php.net>
25 * @copyright 2011 Christian Weiske <cweiske@php.net>
26 * @license http://www.debian.org/misc/bsd.license BSD
27 * @link http://www.loc.gov/standards/iso639-2/php/code_list.php
28 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * The codes here mirror the values of $nameToCode3, so that a name
     * converted to a code can always be converted back.
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // 'ron' (not 'rom') is the code $nameToCode3 emits for romanian.
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * The lookup is case-insensitive. Languages stored with a NULL code
     * (no 2-letter code exists) also yield NULL.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Two-letter language code (e.g. "sv") or NULL if not found
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode2[$lang])) {
            return null;
        }
        return self::$nameToCode2[$lang];
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Three-letter language code (e.g. "swe") or NULL if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode3[$lang])) {
            return null;
        }
        return self::$nameToCode3[$lang];
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Two-letter language code (e.g. "sv")
     *
     * @return string|null English language name like "swedish" or NULL if not found
     */
    public static function code2ToName($code)
    {
        // Look up the lower-cased code; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code2ToName[$code])) {
            return null;
        }
        return self::$code2ToName[$code];
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Three-letter language code (e.g. "swe")
     *
     * @return string|null English language name like "swedish" or NULL if not found
     */
    public static function code3ToName($code)
    {
        // Look up the lower-cased code; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code3ToName[$code])) {
            return null;
        }
        return self::$code3ToName[$code];
    }
}
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
index 7f15fa98..fb0e1e20 100644
--- a/inc/3rdparty/libraries/language-detect/Parser.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -8,7 +8,7 @@
8 * @author Nicholas Pisarro 8 * @author Nicholas Pisarro
9 * @copyright 2006 9 * @copyright 2006
10 * @license BSD 10 * @license BSD
11 * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ 11 * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
12 * @link http://pear.php.net/package/Text_LanguageDetect/ 12 * @link http://pear.php.net/package/Text_LanguageDetect/
13 * @link http://langdetect.blogspot.com/ 13 * @link http://langdetect.blogspot.com/
14 */ 14 */
@@ -28,7 +28,7 @@
28 * @author Nicholas Pisarro 28 * @author Nicholas Pisarro
29 * @copyright 2006 29 * @copyright 2006
30 * @license BSD 30 * @license BSD
31 * @version release: 0.2.3 31 * @version release: 0.3.0
32 */ 32 */
33class Text_LanguageDetect_Parser extends Text_LanguageDetect 33class Text_LanguageDetect_Parser extends Text_LanguageDetect
34{ 34{
@@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
102 * @access private 102 * @access private
103 * @param string $string string to be parsed 103 * @param string $string string to be parsed
104 */ 104 */
105 function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) { 105 function Text_LanguageDetect_Parser($string) {
106 if (isset($db)) $this->_db_filename = $db;
107 if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
108 $this->_string = $string; 106 $this->_string = $string;
109 } 107 }
110 108
111 /** 109 /**
112 * Returns true if a string is suitable for parsing 110 * Returns true if a string is suitable for parsing
113 * 111 *
114 * @static
115 * @access public
116 * @param string $str input string to test 112 * @param string $str input string to test
117 * @return bool true if acceptable, false if not 113 * @return bool true if acceptable, false if not
118 */ 114 */
119 function validateString($str) { 115 public static function validateString($str) {
120 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { 116 if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
121 return true; 117 return true;
122 } else { 118 } else {
@@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
222 218
223 // unicode startup 219 // unicode startup
224 if ($this->_compile_unicode) { 220 if ($this->_compile_unicode) {
225 $blocks =& $this->_read_unicode_block_db(); 221 $blocks = $this->_read_unicode_block_db();
226
227 $block_count = count($blocks); 222 $block_count = count($blocks);
228 223
229 $skipped_count = 0; 224 $skipped_count = 0;
@@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
349 } 344 }
350} 345}
351 346
352/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ 347/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file
353
354?>
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php
index 2e8991cc..4fa3ba63 100644..100755
--- a/inc/3rdparty/libraries/readability/Readability.php
+++ b/inc/3rdparty/libraries/readability/Readability.php
@@ -1,1138 +1,1151 @@
1<?php 1<?php
2/** 2/**
3* Arc90's Readability ported to PHP for FiveFilters.org 3* Arc90's Readability ported to PHP for FiveFilters.org
4* Based on readability.js version 1.7.1 (without multi-page support) 4* Based on readability.js version 1.7.1 (without multi-page support)
5* Updated to allow HTML5 parsing with html5lib 5* Updated to allow HTML5 parsing with html5lib
6* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds 6* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
7* ------------------------------------------------------ 7* ------------------------------------------------------
8* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js 8* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
9* Arc90's project URL: http://lab.arc90.com/experiments/readability/ 9* Arc90's project URL: http://lab.arc90.com/experiments/readability/
10* JS Source: http://code.google.com/p/arc90labs-readability 10* JS Source: http://code.google.com/p/arc90labs-readability
11* Ported by: Keyvan Minoukadeh, http://www.keyvan.net 11* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
12* More information: http://fivefilters.org/content-only/ 12* More information: http://fivefilters.org/content-only/
13* License: Apache License, Version 2.0 13* License: Apache License, Version 2.0
14* Requires: PHP5 14* Requires: PHP5
15* Date: 2012-09-19 15* Date: 2012-09-19
16* 16*
17* Differences between the PHP port and the original 17* Differences between the PHP port and the original
18* ------------------------------------------------------ 18* ------------------------------------------------------
19* Arc90's Readability is designed to run in the browser. It works on the DOM 19* Arc90's Readability is designed to run in the browser. It works on the DOM
20* tree (the parsed HTML) after the page's CSS styles have been applied and 20* tree (the parsed HTML) after the page's CSS styles have been applied and
21* Javascript code executed. This PHP port does not run inside a browser. 21* Javascript code executed. This PHP port does not run inside a browser.
22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot
23* rely on CSS or Javascript support. As such, the results will not always 23* rely on CSS or Javascript support. As such, the results will not always
24* match Arc90's Readability. (For example, if a web page contains CSS style 24* match Arc90's Readability. (For example, if a web page contains CSS style
25* rules or Javascript code which hide certain HTML elements from display, 25* rules or Javascript code which hide certain HTML elements from display,
26* Arc90's Readability will dismiss those from consideration but our PHP port, 26* Arc90's Readability will dismiss those from consideration but our PHP port,
27* unable to understand CSS or Javascript, will not know any better.) 27* unable to understand CSS or Javascript, will not know any better.)
28* 28*
29* Another significant difference is that the aim of Arc90's Readability is 29* Another significant difference is that the aim of Arc90's Readability is
30* to re-present the main content block of a given web page so users can 30* to re-present the main content block of a given web page so users can
31* read it more easily in their browsers. Correct identification, clean up, 31* read it more easily in their browsers. Correct identification, clean up,
32* and separation of the content block is only a part of this process. 32* and separation of the content block is only a part of this process.
33* This PHP port is only concerned with this part, it does not include code 33* This PHP port is only concerned with this part, it does not include code
34* that relates to presentation in the browser - Arc90 already do 34* that relates to presentation in the browser - Arc90 already do
35* that extremely well, and for PDF output there's FiveFilters.org's 35* that extremely well, and for PDF output there's FiveFilters.org's
36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/. 36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
37* 37*
38* Finally, this class contains methods that might be useful for developers 38* Finally, this class contains methods that might be useful for developers
39* working on HTML document fragments. So without deviating too much from 39* working on HTML document fragments. So without deviating too much from
40* the original code (which I don't want to do because it makes debugging 40* the original code (which I don't want to do because it makes debugging
41* and updating more difficult), I've tried to make it a little more 41* and updating more difficult), I've tried to make it a little more
42* developer friendly. You should be able to use the methods here on 42* developer friendly. You should be able to use the methods here on
43* existing DOMElement objects without passing an entire HTML document to 43* existing DOMElement objects without passing an entire HTML document to
44* be parsed. 44* be parsed.
45*/ 45*/
46 46
47// This class allows us to do JavaScript-like assignments to innerHTML 47// This class allows us to do JavaScript-like assignments to innerHTML
48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); 48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
49 49
50// Alternative usage (for testing only!) 50// Alternative usage (for testing only!)
51// uncomment the lines below and call Readability.php in your browser 51// uncomment the lines below and call Readability.php in your browser
52// passing it the URL of the page you'd like content from, e.g.: 52// passing it the URL of the page you'd like content from, e.g.:
53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php 53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
54 54
55/* 55/*
56if (!isset($_GET['url']) || $_GET['url'] == '') { 56if (!isset($_GET['url']) || $_GET['url'] == '') {
57 die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); 57 die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
58} 58}
59$url = $_GET['url']; 59$url = $_GET['url'];
60if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; 60if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
61$html = file_get_contents($url); 61$html = file_get_contents($url);
62$r = new Readability($html, $url); 62$r = new Readability($html, $url);
63$r->init(); 63$r->init();
64echo $r->articleContent->innerHTML; 64echo $r->articleContent->innerHTML;
65*/ 65*/
66 66
67class Readability 67class Readability
68{ 68{
69 public $version = '1.7.1-without-multi-page'; 69 public $version = '1.7.1-without-multi-page';
70 public $convertLinksToFootnotes = false; 70 public $convertLinksToFootnotes = false;
71 public $revertForcedParagraphElements = true; 71 public $revertForcedParagraphElements = true;
72 public $articleTitle; 72 public $articleTitle;
73 public $articleContent; 73 public $articleContent;
74 public $dom; 74 public $dom;
75 public $url = null; // optional - URL where HTML was retrieved 75 public $url = null; // optional - URL where HTML was retrieved
76 public $debug = false; 76 public $debug = false;
77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19 77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19
78 protected $body = null; // 78 protected $body = null; //
79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later 79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. 80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
81 protected $success = false; // indicates whether we were able to extract or not 81 protected $success = false; // indicates whether we were able to extract or not
82 82
83 /**
84 * All of the regular expressions in use within readability.
85 * Defined up here so we don't instantiate them repeatedly in loops.
86 **/
87 public $regexps = array(
88 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
89 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
90 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
91 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
92 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
93 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
94 'replaceFonts' => '/<(\/?)font[^>]*>/i',
95 // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
96 'normalize' => '/\s{2,}/',
97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
100 );
101
102 /* constants */
103 const FLAG_STRIP_UNLIKELYS = 1;
104 const FLAG_WEIGHT_CLASSES = 2;
105 const FLAG_CLEAN_CONDITIONALLY = 4;
106
/**
 * Build a Readability instance from raw HTML.
 *
 * The markup is normalised with the 'replaceBrs' and 'replaceFonts'
 * patterns, converted to HTML entities, parsed into $this->dom, and
 * JSLikeHTMLElement is registered as the node class for DOMElement.
 *
 * @param string UTF-8 encoded HTML
 * @param string (optional) URL associated with the HTML (used for footnotes)
 * @param string parser used to turn the HTML into a DOMDocument
 *               (either 'libxml' or 'html5lib')
 */
function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;

    // Runs of <br> become paragraph breaks, <font> tags become <span>.
    $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    if (trim($html) == '') {
        $html = '<html></html>';
    }

    // Try html5lib only when asked for; if it is not requested, or it
    // returns a falsy result, use DOMDocument's own HTML loader instead.
    $parsedWithHtml5lib = ($parser == 'html5lib')
        && ($this->dom = HTML5_Parser::parse($html));
    if (!$parsedWithHtml5lib) {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($html);
    }

    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130
/**
 * Accessor for the article title element stored in $this->articleTitle.
 *
 * @return DOMElement
 */
public function getTitle()
{
    return $this->articleTitle;
}
138
/**
 * Accessor for the article content element stored in $this->articleContent.
 *
 * @return DOMElement
 */
public function getContent()
{
    return $this->articleContent;
}
146
/**
 * Runs readability.
 *
 * Steps:
 *  1. Strip scripts and prepare the document.
 *  2. Create the readOverlay / readInner wrapper elements.
 *  3. Extract the article title and content from the current DOM.
 *  4. Replace the body's content with the wrapped title and content.
 *  5. Post-process the content and publish title/content on the instance.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    // Without a document element there is nothing to work on.
    if (!isset($this->dom->documentElement)) {
        return false;
    }
    $this->removeScripts($this->dom);

    // Optimistic default; reset below when no article can be grabbed.
    $this->success = true;

    // Remember the first <body> element (and its HTML) if not set already.
    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        if ($this->bodyCache == null) {
            $this->bodyCache = $bodies->item(0)->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $bodies->item(0);
        }
    }

    $this->prepDocument();

    /* Build readability's DOM tree */
    $overlay = $this->dom->createElement('div');
    $innerDiv = $this->dom->createElement('div');
    $title = $this->getArticleTitle();
    $content = $this->grabArticle();

    // No article found: report failure and substitute an apology block.
    if (!$content) {
        $this->success = false;
        $content = $this->dom->createElement('div');
        $content->setAttribute('id', 'readability-content');
        $content->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    $overlay->setAttribute('id', 'readOverlay');
    $innerDiv->setAttribute('id', 'readInner');

    /* Glue the structure of our document together. */
    $innerDiv->appendChild($title);
    $innerDiv->appendChild($content);
    $overlay->appendChild($innerDiv);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($content);

    // Publish the results on the instance.
    $this->articleTitle = $title;
    $this->articleContent = $content;

    return $this->success;
}
219
220 /**
221 * Debug
222 */
protected function dbg($msg) {
    // Silent unless the debug flag is switched on.
    if (!$this->debug) {
        return;
    }

    // One message per line, prefixed with "* ".
    echo '* ' . $msg . "\n";
}
226
227 /**
228 * Run any post-process modifications to article content as necessary.
229 *
230 * @param DOMElement
231 * @return void
232 */
public function postProcessContent($articleContent) {
    // Footnote conversion is opt-in.
    if (!$this->convertLinksToFootnotes) {
        return;
    }

    // The URL is optional (constructor default is null). Normalise it to a
    // string so preg_match() never receives null as its subject, which is
    // deprecated as of PHP 8.1; an absent URL simply never matches.
    $sourceUrl = isset($this->url) ? (string) $this->url : '';

    // Pages whose URL contains wikipedia.org are left with inline links.
    if (preg_match('/wikipedia\.org/', $sourceUrl)) {
        return;
    }

    $this->addFootnotes($articleContent);
}
238
239 /**
240 * Get the article title as an H1.
241 *
242 * @return DOMElement
243 */
protected function getArticleTitle() {
    $curTitle = '';
    $origTitle = '';

    try {
        // item(0) is null when the document has no <title>; getInnerText()
        // returns '' in that case.
        $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    } catch(Exception $e) {
        // Best effort: fall through with empty titles.
    }

    if (preg_match('/ [\|\-] /', $curTitle))
    {
        // "Headline | Site" style: keep what precedes the last separator...
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        // ...unless that leaves fewer than three words, then keep what
        // follows the first separator instead.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    }
    else if (strpos($curTitle, ': ') !== false)
    {
        // "Site: Headline" style: keep what follows the last colon...
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        // ...or, if too short, everything after the first colon.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    }
    else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)
    {
        // Implausibly long or short <title>: use the page's h1 if it is unique.
        $hOnes = $this->dom->getElementsByTagName('h1');
        if($hOnes->length == 1)
        {
            $curTitle = $this->getInnerText($hOnes->item(0));
        }
    }

    $curTitle = trim($curTitle);

    // Four words or fewer after trimming: revert to the original title.
    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');

    // The title is plain text (it came from textContent), so attach it as a
    // text node. Assigning it through innerHTML would re-parse characters
    // such as "&" or "<" as markup and could alter or drop parts of the title.
    $titleText = (string) $curTitle;
    if ($titleText !== '') {
        $articleTitle->appendChild($this->dom->createTextNode($titleText));
    }

    return $articleTitle;
}
288
289 /**
290 * Prepare the HTML document for readability to scrape it.
291 * This includes things like stripping javascript, CSS, and handling terrible markup.
292 *
293 * @return void
294 **/
protected function prepDocument() {
    // Badly broken markup may leave us without a <body>; in that case make
    // one and hang it off the root element so later steps have a target.
    if ($this->body == null) {
        $freshBody = $this->dom->createElement('body');
        $this->body = $freshBody;
        $this->dom->documentElement->appendChild($freshBody);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    // Drop every <style> element in the document. The node list is live, so
    // walk it from the end to keep the remaining indexes valid.
    $styleTags = $this->dom->getElementsByTagName('style');
    $index = $styleTags->length;
    while (--$index >= 0) {
        $styleTag = $styleTags->item($index);
        $styleTag->parentNode->removeChild($styleTag);
    }

    // The JS original also rewrote double <br>s and <font> tags here via
    // innerHTML. In this PHP port that is done in the constructor, on the
    // raw HTML string, before it is parsed into a DOM tree.
}
320
321 /**
322 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
323 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
324 *
325 * @return void
326 **/
public function addFootnotes($articleContent) {
    // Container: <div id="readability-footnotes"><h3>References</h3><ol>...</ol></div>
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live list: the reference markers inserted below are <a> elements too
    // and will show up in it; they are skipped through their marker class.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++)
    {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        // Decide whether to skip before allocating any nodes for this link.
        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        // Clone before the original link's attributes are modified below.
        $footnoteLink = $articleLink->cloneNode(true);
        $refLink = $this->dom->createElement('a');
        $footnote = $this->dom->createElement('li');

        // Host shown next to the footnote: the link's own host, else the page's.
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        $linkCount++;

        /** Add a superscript reference after the article link */
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so a single
        // call places the marker directly after the link whether or not the
        // link is the last child. This replaces a loose == comparison between
        // DOM objects, which is not an identity check and could send the
        // marker to the end of the parent instead of next to its link.
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        // Back-reference from the footnote to the link in the article.
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // Footnote label: the link's title attribute if present, else its text.
        $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';

        $articleFootnotes->appendChild($footnote);
    }

    // Only attach the references section when at least one footnote exists.
    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387
388 /**
389 * Reverts P elements with class 'readability-styled'
390 * to text nodes - which is what they were before.
391 *
392 * @param DOMElement
393 * @return void
394 */
function revertReadabilityStyledElements($articleContent) {
    $ownerDoc = $articleContent->ownerDocument;
    $xpath = new DOMXPath($ownerDoc);

    // All <p class="readability-styled"> descendants of the article node;
    // grabArticle() creates these when it wraps bare text nodes.
    $styled = $xpath->query('.//p[@class="readability-styled"]', $articleContent);

    // Walk from the last match to the first, swapping each paragraph for a
    // text node carrying the paragraph's text content.
    $index = $styled->length;
    while ($index-- > 0) {
        $paragraph = $styled->item($index);
        $plainText = $ownerDoc->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407
408 /**
409 * Prepare the article node for display. Clean out any inline styles,
410 * iframes, forms, strip extraneous <p> tags, etc.
411 *
412 * @param DOMElement
413 * @return void
414 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Strip elements that are rarely part of the article itself. */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    // A lone h2 is most likely the page headline rather than a subheading;
    // we already have a title, so drop it (not done in light-clean mode).
    $loneH2 = !$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1);
    if ($loneH2) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Run these last: the removals above can change their verdicts. */
    foreach (array('table', 'ul', 'div') as $conditionalTag) {
        $this->cleanConditionally($articleContent, $conditionalTag);
    }

    /* Remove paragraphs that hold neither text nor any media element. */
    $paragraphs = $articleContent->getElementsByTagName('p');
    for ($idx = $paragraphs->length - 1; $idx >= 0; $idx--) {
        $paragraph = $paragraphs->item($idx);

        $hasMedia = false;
        foreach (array('img', 'embed', 'object', 'iframe') as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                $hasMedia = true;
                break;
            }
        }

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    // Drop a <br> that sits immediately in front of a <p>.
    try {
        $cleanedMarkup = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        $articleContent->innerHTML = $cleanedMarkup;
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466
467 /**
468 * Initialize a node with the readability object. Also checks the
469 * className/id for special names to add to its score.
470 *
471 * @param Element
472 * @return void
473 **/
protected function initializeNode($node) {
    // Starting content score per tag name; tags not listed start at 0.
    $baseScores = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    // Upper-cased so the lookup works regardless of how the parser reports tag names.
    $tag = strtoupper($node->tagName);
    $baseScore = isset($baseScores[$tag]) ? $baseScores[$tag] : 0;

    // The score lives in a 'readability' attribute on the node itself.
    $readability = $this->dom->createAttribute('readability');
    $readability->value = $baseScore;
    $node->setAttributeNode($readability);

    // Adjust by the class/id weight.
    $readability->value += $this->getClassWeight($node);
}
513
514 /***
515 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
516 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
517 *
518 * @return DOMElement
519 **/
protected function grabArticle($page=null) {
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) $page = $this->dom;
    $allElements = $page->getElementsByTagName('*');
    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * $allElements is a live list, so every removal/replacement below steps
     * $nodeIndex back by one to re-examine whatever now occupies that slot.
     **/
    $node = null;
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);
        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            )
            {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // The replacement <p> now sits at this index; stepping back
                    // makes the next iteration visit it and queue it for scoring
                    // through the P/TD/PRE branch above. The detached original
                    // div is NOT queued: it has no parent any more and the
                    // scoring loop would only skip it.
                    $nodeIndex--;
                }
                catch(Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            }
            else
            {
                /* EXPERIMENTAL */
                // Wrap each direct text-node child in an inline <p> so it can be
                // scored; revertReadabilityStyledElements() can undo this later.
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType == 3) { // XML_TEXT_NODE
                        $p = $this->dom->createElement('p');
                        // nodeValue is plain text; attach it as a text node so
                        // characters such as "&" or "<" are not re-parsed as markup.
                        $p->appendChild($this->dom->createTextNode($childNode->nodeValue));
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();
    for ($pt=0; $pt < count($nodesToScore); $pt++) {
        $parentNode = $nodesToScore[$pt]->parentNode;
        // Only element grandparents can carry a score attribute.
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodesToScore[$pt]);

        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if(strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability'))
        {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
        {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        $contentScore = 0;

        /* Add a point for the paragraph itself as a base. */
        $contentScore++;

        /* Add points for the comma-separated segments in this paragraph (number of commas + 1). */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    for ($c=0, $cl=count($candidates); $c < $cl; $c++)
    {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidates[$c]->getAttributeNode('readability');
        $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));

        $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);

        if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidates[$c];
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
    {
        $topCandidate = $this->dom->createElement('div');
        if ($page instanceof DOMDocument) {
            if (!isset($page->documentElement)) {
                // No root element at all: $topCandidate stays empty and detached.
            } else {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $this->reinitBody();
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
    // The top candidate can be detached (see the empty branch above); guard
    // the parent lookup instead of reading a property of null.
    $topCandidateParent = $topCandidate->parentNode;
    $siblingNodes = $topCandidateParent ? $topCandidateParent->childNodes : null;
    if (!isset($siblingNodes)) {
        // Stand-in with zero length so the loop below is simply skipped.
        $siblingNodes = new stdClass;
        $siblingNodes->length = 0;
    }

    for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
    {
        $siblingNode = $siblingNodes->item($s);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode === $topCandidate)
        {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
        }

        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
        {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25)
            {
                $append = true;
            }
            else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
            {
                $append = true;
            }
        }

        if ($append)
        {
            $this->dbg('Appending node: ' . $siblingNode->nodeName);

            $nodeToAppend = null;
            $sibNodeName = strtoupper($siblingNode->nodeName);
            if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                $nodeToAppend = $this->dom->createElement('div');
                try {
                    $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                    $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                }
                catch(Exception $e)
                {
                    $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                    $nodeToAppend = $siblingNode;
                    $s--;
                    $sl--;
                }
            } else {
                // The sibling itself is moved, which removes it from the live
                // sibling list; compensate the index and the cached length.
                $nodeToAppend = $siblingNode;
                $s--;
                $sl--;
            }

            /* To ensure a node does not interfere with readability styles, remove its classnames */
            $nodeToAppend->removeAttribute('class');

            /* Append sibling and subtract from our list because it removes the node when you append to another node */
            $articleContent->appendChild($nodeToAppend);
        }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250)
    {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        $this->reinitBody();

        // Relax one flag per retry, in this fixed order, then give up.
        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            return $this->grabArticle($this->body);
        }
        else {
            return false;
        }
    }
    return $articleContent;
}
818
819 /**
820 * Remove script tags from document
821 *
822 * @param DOMElement
823 * @return void
824 */
public function removeScripts($doc) {
    $scriptTags = $doc->getElementsByTagName('script');

    // The list is live: delete from the end so earlier indexes stay valid.
    $remaining = $scriptTags->length;
    while ($remaining > 0) {
        $remaining--;
        $script = $scriptTags->item($remaining);
        $script->parentNode->removeChild($script);
    }
}
832
833 /**
834 * Get the inner text of a node.
835 * This also strips out any excess whitespace to be found.
836 *
837 * @param DOMElement $
838 * @param boolean $normalizeSpaces (default: true)
839 * @return string
840 **/
public function getInnerText($e, $normalizeSpaces=true) {
    // Anything without text content (including null) yields an empty string.
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    // Optionally replace every match of the 'normalize' pattern with one space.
    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
856
857 /**
858 * Get the number of times a string $s appears in the node $e.
859 *
860 * @param DOMElement $e
861 * @param string - what to count. Default is ","
862 * @return number (integer)
863 **/
public function getCharCount($e, $s=',') {
    // Count against the space-normalised inner text of the node.
    $text = $this->getInnerText($e);

    return substr_count($text, $s);
}
867
868 /**
869 * Remove the style attribute on every $e and under.
870 *
871 * @param DOMElement $e
872 * @return void
873 */
public function cleanStyles($e) {
    // Non-objects (e.g. null) are ignored.
    if (is_object($e)) {
        // Visits every descendant element; $e's own style attribute is not touched here.
        foreach ($e->getElementsByTagName('*') as $descendant) {
            $descendant->removeAttribute('style');
        }
    }
}
881
882 /**
883 * Get the density of links as a percentage of the content
884 * This is the amount of text that is inside a link divided by the total text in the node.
885 *
886 * @param DOMElement $e
887 * @return number (float)
888 */
public function getLinkDensity($e) {
    $totalLength = strlen($this->getInnerText($e));

    // Sum the text length of every <a> below $e.
    $anchorLength = 0;
    foreach ($e->getElementsByTagName('a') as $anchor) {
        $anchorLength += strlen($this->getInnerText($anchor));
    }

    // No text at all: report a density of 0 rather than dividing by zero.
    if ($totalLength > 0) {
        return $anchorLength / $totalLength;
    }
    return 0;
}
903
904 /**
905 * Get an elements class/id weight. Uses regular expressions to tell if this
906 * element looks good or bad.
907 *
908 * @param DOMElement $e
909 * @return number (Integer)
910 */
public function getClassWeight($e) {
    // Class/id weighting can be switched off through its flag.
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;

    // The class and the id are judged independently by the same two patterns:
    // a 'negative' match costs 25 points, a 'positive' match earns 25.
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }
        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }

        if (preg_match($this->regexps['negative'], $attrValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $weight += 25;
        }
    }

    return $weight;
}
941
942 /**
943 * Remove extraneous break tags from a node.
944 *
945 * @param DOMElement $node
946 * @return void
947 */
public function killBreaks($node) {
    // Every match of the 'killBreaks' pattern in the node's markup is
    // replaced by a single <br />, then the markup is written back.
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
953
954 /**
955 * Clean a node of all elements of type "tag".
956 * (Unless it's a youtube/vimeo video. People love movies.)
957 *
958 * Updated 2012-09-18 to preserve youtube/vimeo iframes
959 *
960 * @param DOMElement $e
961 * @param string $tag
962 * @return void
963 */
public function clean($e, $tag) {
    $matches = $e->getElementsByTagName($tag);

    // Only these tags can carry an embedded video worth keeping.
    $mayBeVideo = in_array($tag, array('iframe', 'object', 'embed'));

    // Live list: iterate from the end so removals do not shift pending indexes.
    for ($idx = $matches->length - 1; $idx >= 0; $idx--) {
        $target = $matches->item($idx);

        if ($mayBeVideo) {
            // Join all attribute values, '|'-terminated, for a single pattern test.
            $attributeValues = '';
            foreach ($target->attributes as $attribute) {
                $attributeValues .= $attribute->value . '|';
            }

            // Keep the element when the 'video' pattern matches either its own
            // attributes or the markup nested inside it.
            if (preg_match($this->regexps['video'], $attributeValues)
                || preg_match($this->regexps['video'], $target->innerHTML)) {
                continue;
            }
        }

        $target->parentNode->removeChild($target);
    }
}
989
990 /**
991 * Clean an element of all tags of type "tag" if they look fishy.
992 * "Fishy" is an algorithm based on content length, classnames,
993 * link density, number of images & embeds, etc.
994 *
995 * @param DOMElement $e
996 * @param string $tag
997 * @return void
998 */
public function cleanConditionally($e, $tag) {
    // Conditional cleaning can be switched off through its flag.
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);
    $isListTag = ($tag == 'ul' || $tag == 'ol');

    // Live list: go backwards so removing an element never disturbs the
    // indexes still to be visited.
    for ($i = $tagsList->length - 1; $i >= 0; $i--) {
        $node = $tagsList->item($i);

        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        // A negative combined score is enough to discard the element outright.
        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        // Ten or more commas: treat as real prose and leave it alone.
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        /* Few commas: weigh the kinds of elements nested inside. */
        $p     = $node->getElementsByTagName('p')->length;
        $img   = $node->getElementsByTagName('img')->length;
        $li    = $node->getElementsByTagName('li')->length-100;
        $input = $node->getElementsByTagName('input')->length;
        $a     = $node->getElementsByTagName('a')->length;

        // Count only <embed>/<iframe> elements whose src matches the 'video' pattern.
        $embedCount = 0;
        foreach (array('embed', 'iframe') as $embedTag) {
            foreach ($node->getElementsByTagName($embedTag) as $embedNode) {
                if (preg_match($this->regexps['video'], $embedNode->getAttribute('src'))) {
                    $embedCount++;
                }
            }
        }

        $linkDensity = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        // The first matching rule supplies the reason for removal; null means keep.
        $reason = null;
        if ($this->lightClean) {
            $this->dbg('Light clean...');
            if (($img > $p) && ($img > 4)) {
                $reason = ' more than 4 images and more image elements than paragraph elements';
            } else if ($li > $p && !$isListTag) {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                $reason = ' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
            } else if ($embedCount > 3) {
                $reason = ' more than 3 embeds';
            }
        } else {
            $this->dbg('Standard clean...');
            if ($img > $p) {
                $reason = ' more image elements than paragraph elements';
            } else if ($li > $p && !$isListTag) {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 25 && ($img === 0 || $img > 2)) {
                $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($weight >= 25 && $linkDensity > 0.5) {
                $reason = ' weight above 25 but link density greater than 0.5';
            } else if (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
            }
        }

        if ($reason !== null) {
            $this->dbg($reason);
            $node->parentNode->removeChild($node);
        }
    }
}
1108
/**
 * Strip dubious <h1> and <h2> headings out of an element.
 *
 * A heading is removed when its class/id weight is negative or when more
 * than 33% of its text sits inside links.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headingTag) {
        $found = $e->getElementsByTagName($headingTag);
        // Visit from the end so removals never shift an index still to be visited.
        $pos = $found->length;
        while ($pos-- > 0) {
            $heading = $found->item($pos);
            $negativeWeight = $this->getClassWeight($heading) < 0;
            if ($negativeWeight || $this->getLinkDensity($heading) > 0.33) {
                $heading->parentNode->removeChild($heading);
            }
        }
    }
}
1125
/**
 * Report whether the given flag bit(s) are currently set in $this->flags.
 *
 * @param int $flag one of the FLAG_* constants
 * @return boolean
 */
public function flagIsActive($flag) {
    $matchingBits = $this->flags & $flag;
    return $matchingBits > 0;
}
1129
/**
 * Switch the given flag bit(s) on in $this->flags.
 *
 * @param int $flag one of the FLAG_* constants
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1133
/**
 * Switch the given flag bit(s) off in $this->flags.
 *
 * @param int $flag one of the FLAG_* constants
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1137
83 /** 1138 /**
84 * All of the regular expressions in use within readability. 1139 * Will recreate previously deleted body property
85 * Defined up here so we don't instantiate them repeatedly in loops. 1140 *
86 **/ 1141 * @return void
87 public $regexps = array( 1142 */
88 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', 1143 protected function reinitBody() {
89 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 1144 if (!isset($this->body->childNodes)) {
90 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', 1145 $this->body = $this->dom->createElement('body');
91 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
92 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
93 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
94 'replaceFonts' => '/<(\/?)font[^>]*>/i',
95 // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
96 'normalize' => '/\s{2,}/',
97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
100 );
101
102 /* Bit flags combined in $this->flags; tested with flagIsActive() and toggled with addFlag()/removeFlag(). */
// While set, grabArticle() removes nodes whose class/id match the 'unlikelyCandidates' pattern.
103 const FLAG_STRIP_UNLIKELYS = 1;
// While set, getClassWeight() scores class/id names; when cleared it returns 0.
104 const FLAG_WEIGHT_CLASSES = 2;
// Cleared by grabArticle() as its last retry step; where it is tested lies outside this excerpt.
105 const FLAG_CLEAN_CONDITIONALLY = 4;
106
/**
 * Create instance of Readability.
 *
 * The raw markup is pre-processed before parsing: runs of two or more <br>
 * become paragraph breaks, <font> tags become <span>, and every non-ASCII
 * character is rewritten as a numeric entity so the parser does not have to
 * guess the encoding.
 *
 * @param string $html   UTF-8 encoded string
 * @param string $url    (optional) URL associated with HTML (used for footnotes)
 * @param string $parser which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
 */
public function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;

    /* Turn all double br's into p's, then font tags into spans. */
    $substitutions = array(
        array($this->regexps['replaceBrs'], '</p><p>'),
        array($this->regexps['replaceFonts'], '<$1span>'),
    );
    foreach ($substitutions as $substitution) {
        $replaced = preg_replace($substitution[0], $substitution[1], $html);
        // preg_replace() returns null on a PCRE error; keep the markup we had
        // instead of silently throwing the whole document away.
        if ($replaced !== null) {
            $html = $replaced;
        }
    }

    // The 'HTML-ENTITIES' target of mb_convert_encoding() is deprecated as of
    // PHP 8.2. Encoding everything from U+0080 upwards as numeric entities
    // hands the parser the same characters without the deprecated codec.
    $html = mb_encode_numericentity((string) $html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    if (trim($html) == '') $html = '<html></html>';

    if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
        // all good
    } else {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        // Malformed markup is expected here; libxml warnings are deliberately silenced.
        @$this->dom->loadHTML($html);
    }
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130
/**
 * Return the title element stored by init().
 *
 * @return DOMElement
 */
public function getTitle() {
    $titleElement = $this->articleTitle;
    return $titleElement;
}
138
/**
 * Return the content element stored by init().
 *
 * @return DOMElement
 */
public function getContent() {
    $contentElement = $this->articleContent;
    return $contentElement;
}
146
/**
 * Runs readability.
 *
 * Steps:
 *  1. Strip scripts and prepare the document (prepDocument()).
 *  2. Work out the article title and grab the article content.
 *  3. Build an overlay > inner div holding title and content.
 *  4. Empty the body and put the overlay in its place.
 *  5. Post-process the content and store title/content on the instance.
 *
 * When no content can be extracted, a placeholder "Sorry" paragraph is used
 * as the content and the method returns false.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    if (!isset($this->dom->documentElement)) {
        return false;
    }
    $this->removeScripts($this->dom);

    // Optimistic default; flipped below when nothing could be extracted.
    $this->success = true;

    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        $firstBody = $bodies->item(0);
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    /* Build readability's DOM tree */
    $overlay = $this->dom->createElement('div');
    $innerDiv = $this->dom->createElement('div');
    $articleTitle = $this->getArticleTitle();
    $articleContent = $this->grabArticle();

    if (!$articleContent) {
        $this->success = false;
        $placeholder = $this->dom->createElement('div');
        $placeholder->setAttribute('id', 'readability-content');
        $placeholder->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
        $articleContent = $placeholder;
    }

    $overlay->setAttribute('id', 'readOverlay');
    $innerDiv->setAttribute('id', 'readInner');

    /* Glue the pieces together: overlay > inner div > (title, content). */
    $innerDiv->appendChild($articleTitle);
    $innerDiv->appendChild($articleContent);
    $overlay->appendChild($innerDiv);

    /* Swap the old body content for the overlay. */
    $body = $this->body;
    $body->innerHTML = '';
    $body->appendChild($overlay);
    $body->removeAttribute('style');

    $this->postProcessContent($articleContent);

    // Expose the results through getTitle() / getContent().
    $this->articleTitle = $articleTitle;
    $this->articleContent = $articleContent;

    return $this->success;
}
219
/**
 * Echo a debug line ("* message") when $this->debug is enabled.
 *
 * @param string $msg
 * @return void
 */
protected function dbg($msg) {
    if (!$this->debug) {
        return;
    }
    echo '* ', $msg, "\n";
}
226
/**
 * Apply optional post-processing to the extracted article.
 *
 * Currently this only converts links to footnotes, and only when
 * $this->convertLinksToFootnotes is on and the page URL does not match
 * wikipedia.org.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function postProcessContent($articleContent) {
    if (!$this->convertLinksToFootnotes) {
        return;
    }
    if (preg_match('/wikipedia\.org/', @$this->url)) {
        return;
    }
    $this->addFootnotes($articleContent);
}
238
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's <title> and tries to cut off site
 * names: text around a " | " or " - " separator, or around a colon. Titles
 * that are very long or very short fall back to the page's only <h1>, if
 * there is exactly one. If the result has four words or fewer, the original
 * <title> text is used unchanged.
 *
 * @return DOMElement a new, unattached <h1> holding the title as plain text
 */
protected function getArticleTitle() {
    // getInnerText() returns '' when there is no <title> element, so no
    // exception handling is needed around this lookup.
    $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    $curTitle = $origTitle;

    if (preg_match('/ [\|\-] /', $curTitle))
    {
        // Keep what precedes the last separator ...
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        // ... unless that leaves fewer than three words; then keep what follows the first one.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    }
    else if (strpos($curTitle, ': ') !== false)
    {
        // Keep what follows the last colon ...
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        // ... unless that is too short; then keep what follows the first colon.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    }
    else if (strlen($curTitle) > 150 || strlen($curTitle) < 15)
    {
        $hOnes = $this->dom->getElementsByTagName('h1');
        if ($hOnes->length == 1)
        {
            $curTitle = $this->getInnerText($hOnes->item(0));
        }
    }

    $curTitle = trim($curTitle);

    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    // $curTitle is decoded text. Assigning it through innerHTML would re-parse
    // any '<' or '&' it contains as markup, so attach it as a text node instead.
    $curTitle = (string) $curTitle;
    if ($curTitle !== '') {
        $articleTitle->appendChild($this->dom->createTextNode($curTitle));
    }

    return $articleTitle;
}
288
/**
 * Get the parsed document ready for scraping.
 *
 * Makes sure a body element exists (creating and attaching one when the
 * markup was too broken for the parser to produce it), tags it with the id
 * 'readabilityBody', and removes every <style> element from the document.
 *
 * The double-<br> and <font> substitutions of the JavaScript original are
 * done in the constructor, on the raw HTML, before it is parsed.
 *
 * @return void
 **/
protected function prepDocument() {
    if ($this->body == null)
    {
        $freshBody = $this->dom->createElement('body');
        $this->body = $freshBody;
        $this->dom->documentElement->appendChild($freshBody);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    /* Drop all style elements, last one first. */
    $styleTags = $this->dom->getElementsByTagName('style');
    $remaining = $styleTags->length;
    while ($remaining-- > 0)
    {
        $styleTag = $styleTags->item($remaining);
        $styleTag->parentNode->removeChild($styleTag);
    }
}
320
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * Each footnoted link gets a superscript "[n]" reference directly after it
 * and a matching entry in a "References" list, which is appended to
 * $articleContent when at least one link was footnoted. Links whose class
 * contains 'readability-DoNotFootnote', or whose text matches the
 * 'skipFootnoteLink' pattern, are left alone.
 *
 * @param DOMElement $articleContent
 * @return void
 **/
public function addFootnotes($articleContent) {
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    // The length is re-read on every pass. The reference links inserted below
    // carry the 'readability-DoNotFootnote' class, so they are skipped if the
    // loop reaches them.
    for ($i = 0; $i < $articleLinks->length; $i++)
    {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        // Decide whether to skip before allocating any nodes for this link.
        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        // Clone first: the copy must not pick up the style/name set on the original below.
        $footnoteLink = $articleLink->cloneNode(true);
        $refLink = $this->dom->createElement('a');
        $footnote = $this->dom->createElement('li');
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        $linkCount++;

        /** Add a superscript reference after the article link */
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so this one
        // call covers both "link is the last child" and "link has a next
        // sibling". The previous `lastChild == $articleLink` test used loose
        // object comparison, which is not an identity check for DOM nodes and
        // could send the marker to the end of the parent.
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // The footnote shows the link's title attribute when it has one, its text otherwise.
        $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387
/**
 * Turn every <p class="readability-styled"> below $articleContent back
 * into a plain text node carrying the paragraph's text content.
 *
 * @param DOMElement $articleContent
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Work from the last match towards the first.
    $remaining = $styled->length;
    while ($remaining-- > 0) {
        $paragraph = $styled->item($remaining);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407
/**
 * Clean the article node up for display: drop inline styles, collapse
 * breaks, remove forms/objects/iframes/spurious headers, conditionally
 * remove tables, lists and divs, and delete paragraphs that hold neither
 * text nor media.
 *
 * @param DOMElement $articleContent
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* First pass over the junk */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * A lone h2 is most likely the page header rather than a subheader;
     * we already have a header, so it goes (not in light-clean mode).
     ***/
    if (!$this->lightClean) {
        if ($articleContent->getElementsByTagName('h2')->length == 1) {
            $this->clean($articleContent, 'h2');
        }
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* These run last because the steps above may have removed junk that affects them */
    foreach (array('table', 'ul', 'div') as $conditionalTag) {
        $this->cleanConditionally($articleContent, $conditionalTag);
    }

    /* Remove paragraphs with no text and no media */
    $paragraphs = $articleContent->getElementsByTagName('p');
    $position = $paragraphs->length;
    while ($position-- > 0)
    {
        $paragraph = $paragraphs->item($position);

        $holdsMedia = false;
        foreach (array('img', 'embed', 'object', 'iframe') as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                $holdsMedia = true;
                break;
            }
        }

        if (!$holdsMedia && $this->getInnerText($paragraph, false) == '')
        {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        // Remove any <br> that directly precedes a paragraph.
        $markup = $articleContent->innerHTML;
        $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $markup);
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466
/**
 * Attach a 'readability' attribute (the content score) to a node.
 *
 * The starting score depends on the tag name (+5 for div, +3 for
 * pre/td/blockquote, -3 for list, address and form tags, -5 for headings
 * and th, 0 otherwise) plus the node's class/id weight.
 *
 * @param Element $node
 * @return void
 **/
protected function initializeNode($node) {
    $tagScores = array(
        'DIV' => 5,

        'PRE' => 3,
        'TD' => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS' => -3,
        'OL' => -3,
        'UL' => -3,
        'DL' => -3,
        'DD' => -3,
        'DT' => -3,
        'LI' => -3,
        'FORM' => -3,

        'H1' => -5,
        'H2' => -5,
        'H3' => -5,
        'H4' => -5,
        'H5' => -5,
        'H6' => -5,
        'TH' => -5,
    );

    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = 0; // this is our contentScore
    $node->setAttributeNode($scoreAttr);

    // Upper-cased so the lookup does not depend on how the parser reports tag names.
    $tag = strtoupper($node->tagName);
    $score = isset($tagScores[$tag]) ? $tagScores[$tag] : 0;
    $score += $this->getClassWeight($node);

    $scoreAttr->value = $score;
}
513
514 /***
515 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
516 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
517 *
 * When the extracted text is shorter than 250 characters, the method restores the cached body
 * markup, clears one flag (FLAG_STRIP_UNLIKELYS, then FLAG_WEIGHT_CLASSES, then
 * FLAG_CLEAN_CONDITIONALLY) and calls itself again on $this->body. Once no flag is left to
 * clear it returns false.
 *
 * @param DOMDocument|DOMElement|null $page node to search; $this->dom is used when empty
518 * @return DOMElement|false the 'readability-content' div, or false when no attempt yields enough text
519 **/
520 protected function grabArticle($page=null) {
521 $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
522 if (!$page) $page = $this->dom;
523 $allElements = $page->getElementsByTagName('*');
524 /**
525 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
526 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
527 *
528 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
529 * TODO: Shouldn't this be a reverse traversal?
530 **/
531 $node = null;
532 $nodesToScore = array();
533 for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
534 //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
535 //$node = $targetList->item($nodeIndex);
536 $tagName = strtoupper($node->tagName);
537 /* Remove unlikely candidates */
538 if ($stripUnlikelyCandidates) {
539 $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
540 if (
541 preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
542 !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
543 $tagName != 'BODY'
544 )
545 {
546 $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
547 //$nodesToRemove[] = $node;
548 $node->parentNode->removeChild($node);
// Step the index back so the element that moves into this slot is not skipped.
549 $nodeIndex--;
550 continue;
551 }
552 }
553
554 if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
555 $nodesToScore[] = $node;
556 }
557
558 /* Turn all divs that don't have children block level elements into p's */
559 if ($tagName == 'DIV') {
560 if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
561 //$this->dbg('Altering div to p');
562 $newNode = $this->dom->createElement('p');
563 try {
564 $newNode->innerHTML = $node->innerHTML;
565 //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
566 $node->parentNode->replaceChild($newNode, $node);
567 $nodeIndex--;
// NOTE(review): $node has just been replaced in the tree by $newNode; presumably the
// decremented index makes the loop revisit this slot and queue $newNode as a 'P'. Confirm.
568 $nodesToScore[] = $node; // or $newNode?
569 }
570 catch(Exception $e) {
571 $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
572 }
573 }
574 else
575 {
576 /* EXPERIMENTAL */
577 // TODO: change these p elements back to text nodes after processing
578 for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
579 $childNode = $node->childNodes->item($i);
580 if ($childNode->nodeType == 3) { // XML_TEXT_NODE
581 //$this->dbg('replacing text node with a p tag with the same content.');
582 $p = $this->dom->createElement('p');
583 $p->innerHTML = $childNode->nodeValue;
584 $p->setAttribute('style', 'display: inline;');
// This class is what revertReadabilityStyledElements() looks for to undo the wrapping.
585 $p->setAttribute('class', 'readability-styled');
586 $childNode->parentNode->replaceChild($p, $childNode);
587 }
588 }
589 }
590 }
591 }
592
593 /**
594 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
595 * Then add their score to their parent node.
596 *
597 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
598 **/
599 $candidates = array();
600 for ($pt=0; $pt < count($nodesToScore); $pt++) {
601 $parentNode = $nodesToScore[$pt]->parentNode;
602 // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
603 $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
604 $innerText = $this->getInnerText($nodesToScore[$pt]);
605
// Nodes without an element parent cannot pass a score upwards; skip them.
606 if (!$parentNode || !isset($parentNode->tagName)) {
607 continue;
608 }
609
610 /* If this paragraph is less than 25 characters, don't even count it. */
611 if(strlen($innerText) < 25) {
612 continue;
613 }
614
615 /* Initialize readability data for the parent. */
616 if (!$parentNode->hasAttribute('readability'))
617 {
618 $this->initializeNode($parentNode);
619 $candidates[] = $parentNode;
620 }
621
622 /* Initialize readability data for the grandparent. */
623 if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
624 {
625 $this->initializeNode($grandParentNode);
626 $candidates[] = $grandParentNode;
627 }
628
629 $contentScore = 0;
630
631 /* Add a point for the paragraph itself as a base. */
632 $contentScore++;
633
634 /* Add points for any commas within this paragraph */
// count(explode(',', ...)) is the number of commas plus one.
635 $contentScore += count(explode(',', $innerText));
636
637 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
638 $contentScore += min(floor(strlen($innerText) / 100), 3);
639
640 /* Add the score to the parent. The grandparent gets half. */
641 $parentNode->getAttributeNode('readability')->value += $contentScore;
642
643 if ($grandParentNode) {
644 $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
645 }
646 }
647
648 /**
649 * After we've calculated scores, loop through all of the possible candidate nodes we found
650 * and find the one with the highest score.
651 **/
652 $topCandidate = null;
653 for ($c=0, $cl=count($candidates); $c < $cl; $c++)
654 {
655 /**
656 * Scale the final candidates score based on link density. Good content should have a
657 * relatively small link density (5% or less) and be mostly unaffected by this operation.
658 **/
659 $readability = $candidates[$c]->getAttributeNode('readability');
660 $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
661
662 $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
663
// NOTE(review): the (int) cast drops any fractional part of the current top score before comparing.
664 if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
665 $topCandidate = $candidates[$c];
666 }
667 }
668
669 /**
670 * If we still have no top candidate, just use the body as a last resort.
671 * We also have to copy the body node so it is something we can modify.
672 **/
673 if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
674 {
675 $topCandidate = $this->dom->createElement('div');
676 if ($page instanceof DOMDocument) {
677 if (!isset($page->documentElement)) {
678 // we don't have a body either? what a mess! :)
679 } else {
680 $topCandidate->innerHTML = $page->documentElement->innerHTML;
681 $page->documentElement->innerHTML = '';
682 $page->documentElement->appendChild($topCandidate);
683 }
684 } else {
685 $topCandidate->innerHTML = $page->innerHTML;
686 $page->innerHTML = '';
687 $page->appendChild($topCandidate);
688 }
689 $this->initializeNode($topCandidate);
690 }
691
692 /**
693 * Now that we have the top candidate, look through its siblings for content that might also be related.
694 * Things like preambles, content split by ads that we removed, etc.
695 **/
696 $articleContent = $this->dom->createElement('div');
697 $articleContent->setAttribute('id', 'readability-content');
// Siblings need at least 20% of the top candidate's score, and never less than 10.
698 $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
699 $siblingNodes = $topCandidate->parentNode->childNodes;
// Stand-in with length 0 so the loop below is simply skipped when there is no sibling list.
700 if (!isset($siblingNodes)) {
701 $siblingNodes = new stdClass;
702 $siblingNodes->length = 0;
703 }
704
705 for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
706 {
707 $siblingNode = $siblingNodes->item($s);
708 $append = false;
709
710 $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
711
712 //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
713
714 if ($siblingNode === $topCandidate)
715 // or if ($siblingNode->isSameNode($topCandidate))
716 {
717 $append = true;
718 }
719
720 $contentBonus = 0;
721 /* Give a bonus if sibling nodes and top candidates have the example same classname */
722 if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
723 $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
724 }
725
726 if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
727 {
728 $append = true;
729 }
730
// Paragraph siblings are kept when long with few links, or short, link-free and sentence-like.
731 if (strtoupper($siblingNode->nodeName) == 'P') {
732 $linkDensity = $this->getLinkDensity($siblingNode);
733 $nodeContent = $this->getInnerText($siblingNode);
734 $nodeLength = strlen($nodeContent);
735
736 if ($nodeLength > 80 && $linkDensity < 0.25)
737 {
738 $append = true;
739 }
740 else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
741 {
742 $append = true;
743 }
744 }
745
746 if ($append)
747 {
748 $this->dbg('Appending node: ' . $siblingNode->nodeName);
749
750 $nodeToAppend = null;
751 $sibNodeName = strtoupper($siblingNode->nodeName);
752 if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
753 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
754
755 $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
756 $nodeToAppend = $this->dom->createElement('div');
757 try {
758 $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
759 $nodeToAppend->innerHTML = $siblingNode->innerHTML;
760 }
761 catch(Exception $e)
762 {
763 $this->dbg('Could not alter siblingNode to div, reverting back to original.');
764 $nodeToAppend = $siblingNode;
765 $s--;
766 $sl--;
767 }
768 } else {
// The sibling itself is moved, so the cursor and bound step back (see the appendChild note below).
769 $nodeToAppend = $siblingNode;
770 $s--;
771 $sl--;
772 }
773
774 /* To ensure a node does not interfere with readability styles, remove its classnames */
775 $nodeToAppend->removeAttribute('class');
776
777 /* Append sibling and subtract from our list because it removes the node when you append to another node */
778 $articleContent->appendChild($nodeToAppend);
779 }
780 }
781
782 /**
783 * So we have all of the content that we need. Now we clean it up for presentation.
784 **/
785 $this->prepArticle($articleContent);
786
787 /**
788 * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
789 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
790 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
791 * finding the -right- content.
792 **/
793 if (strlen($this->getInnerText($articleContent, false)) < 250)
794 {
795 // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
796 // in the meantime, we check and create an empty element if it's not there.
797 if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
798 $this->body->innerHTML = $this->bodyCache; 1146 $this->body->innerHTML = $this->bodyCache;
799
// Relax one flag per retry, in this fixed order, and start over on the restored body.
800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
802 return $this->grabArticle($this->body);
803 }
804 else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
805 $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
806 return $this->grabArticle($this->body);
807 }
808 else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
809 $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
810 return $this->grabArticle($this->body);
811 }
812 else {
813 return false;
814 }
815 }
816 return $articleContent;
817 }
818
/**
 * Delete every <script> element found under the given document or element.
 *
 * @param DOMElement $doc
 * @return void
 */
public function removeScripts($doc) {
    $scriptTags = $doc->getElementsByTagName('script');
    // Last to first, so removals never shift an index still to be visited.
    $remaining = $scriptTags->length;
    while ($remaining-- > 0)
    {
        $scriptTag = $scriptTags->item($remaining);
        $scriptTag->parentNode->removeChild($scriptTag);
    }
}
832
/**
 * Return the trimmed text content of a node.
 *
 * Yields '' when $e has no textContent property set or when it is empty.
 * With $normalizeSpaces on, the text is additionally passed through the
 * 'normalize' pattern, replacing each match with a single space.
 *
 * @param DOMElement $e
 * @param boolean $normalizeSpaces (default: true)
 * @return string
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $hasText = isset($e->textContent) && !($e->textContent == '');
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
856
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * @param DOMElement $e
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);
    return substr_count($text, $s);
}
867
/**
 * Remove the style attribute from every element returned by
 * getElementsByTagName('*') on $e. Does nothing when $e is not an object.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    if (is_object($e)) {
        $allBelow = $e->getElementsByTagName('*');
        foreach ($allBelow as $element) {
            $element->removeAttribute('style');
        }
    }
}
881
/**
 * Get the link density of a node: the length of the text inside its <a>
 * elements divided by the length of all its text. A node without text
 * has a density of 0.
 *
 * @param DOMElement $e
 * @return number (float)
 */
public function getLinkDensity($e) {
    $anchors = $e->getElementsByTagName('a');
    $totalLength = strlen($this->getInnerText($e));

    $anchorLength = 0;
    $anchorCount = $anchors->length;
    for ($pos = 0; $pos < $anchorCount; $pos++)
    {
        $anchorText = $this->getInnerText($anchors->item($pos));
        $anchorLength += strlen($anchorText);
    }

    if ($totalLength <= 0) {
        return 0;
    }
    return $anchorLength / $totalLength;
}
903
/**
 * Score an element by its class and id attributes.
 *
 * Each non-empty attribute loses 25 points when it matches
 * $this->regexps['negative'] and gains 25 when it matches
 * $this->regexps['positive']; both may apply to the same value.
 * Always 0 while FLAG_WEIGHT_CLASSES is not active.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;

    // Same rules for both attributes: class is examined first, then id.
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }
        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }
        if (preg_match($this->regexps['negative'], $attrValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $weight += 25;
        }
    }

    return $weight;
}
941
/**
 * Collapse extraneous break tags inside a node.
 *
 * Every match of $this->regexps['killBreaks'] in the node's markup is
 * replaced with a single '<br />'.
 * NOTE(review): relies on $node exposing a read/write innerHTML property,
 * which stock DOMElement does not have - presumably an extended element
 * class is registered elsewhere; confirm.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
953
/**
 * Remove every descendant of $e with the tag name $tag.
 *
 * Exception: for iframe, object and embed tags an element is kept when
 * $this->regexps['video'] matches either its concatenated attribute values
 * or its inner markup (youtube/vimeo style embeds).
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $candidates = $e->getElementsByTagName($tag);
    // Loose comparison on purpose, mirroring a chain of == checks.
    $mayBeVideo = in_array($tag, array('iframe', 'object', 'embed'));

    // Backwards, because the node list is live and shrinks as we remove.
    for ($idx = $candidates->length - 1; $idx >= 0; $idx--) {
        $node = $candidates->item($idx);

        if ($mayBeVideo) {
            // Join all attribute values so one regex pass covers them all.
            $attributeValues = '';
            foreach ($node->attributes as $attribute) {
                $attributeValues .= $attribute->value . '|';
            }

            // Keep the element if its attributes, or failing that its
            // contents, look like a video embed.
            if (preg_match($this->regexps['video'], $attributeValues)
                || preg_match($this->regexps['video'], $node->innerHTML)) {
                continue;
            }
        }

        $node->parentNode->removeChild($node);
    }
}
989
/**
 * Remove descendants of $e with tag name $tag when they look "fishy".
 *
 * An element is dropped outright when its class weight plus its stored
 * 'readability' score is negative. Otherwise, if its text holds fewer than
 * 10 commas, a set of heuristics (image, list-item, input, link and embed
 * counts, content length, link density) decides; $this->lightClean selects
 * the more lenient rule set. Does nothing unless FLAG_CLEAN_CONDITIONALLY
 * is active.
 *
 * TODO: Consider taking into account original contentScore here.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    // Traverse backwards so nodes can be removed without disturbing the
    // positions of the entries still to be visited.
    for ($i = $tagsList->length - 1; $i >= 0; $i--) {
        $node = $tagsList->item($i);

        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        // Comma-rich elements are treated as real content and left alone.
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        // Counts of typical elements embedded within this node.
        $p     = $node->getElementsByTagName('p')->length;
        $img   = $node->getElementsByTagName('img')->length;
        // The list-item count is offset by -100 before being compared
        // with the paragraph count below.
        $li    = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;
        $a     = $node->getElementsByTagName('a')->length;

        // Only embeds/iframes whose src matches the video pattern count.
        $embedCount = 0;
        foreach (array('embed', 'iframe') as $embedTag) {
            foreach ($node->getElementsByTagName($embedTag) as $embed) {
                if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                    $embedCount++;
                }
            }
        }

        $linkDensity   = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        // Checks shared verbatim by both rule sets.
        $tooManyListItems = ($li > $p && $tag != 'ul' && $tag != 'ol');
        $tooManyInputs    = ($input > floor($p/3));
        $lowWeightLinky   = ($weight < 25 && $linkDensity > 0.2);

        // First matching rule wins; $reason stays null when none applies.
        $reason = null;
        if ($this->lightClean) {
            $this->dbg('Light clean...');
            if (($img > $p) && ($img > 4)) {
                $reason = ' more than 4 images and more image elements than paragraph elements';
            } elseif ($tooManyListItems) {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } elseif ($tooManyInputs) {
                $reason = ' too many <input> elements';
            } elseif ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                $reason = ' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images';
            } elseif ($lowWeightLinky) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
            } elseif ($embedCount > 3) {
                $reason = ' more than 3 embeds';
            }
        } else {
            $this->dbg('Standard clean...');
            if ($img > $p) {
                $reason = ' more image elements than paragraph elements';
            } elseif ($tooManyListItems) {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } elseif ($tooManyInputs) {
                $reason = ' too many <input> elements';
            } elseif ($contentLength < 25 && ($img === 0 || $img > 2)) {
                $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
            } elseif ($lowWeightLinky) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } elseif ($weight >= 25 && $linkDensity > 0.5) {
                $reason = ' weight above 25 but link density greater than 0.5';
            } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
            }
        }

        if ($reason !== null) {
            $this->dbg($reason);
            $node->parentNode->removeChild($node);
        }
    }
}
1108
/**
 * Remove spurious headers from an element.
 *
 * Only <h1> and <h2> descendants are examined. A header is removed when
 * its class weight is negative or its link density exceeds 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        // Live list: go from the last header to the first while removing.
        $i = $headers->length;
        while ($i-- > 0) {
            $header = $headers->item($i);
            $isSpurious = $this->getClassWeight($header) < 0
                || $this->getLinkDensity($header) > 0.33;
            if ($isSpurious) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1125 1149
/**
 * Tell whether the given flag bit(s) are set in $this->flags.
 *
 * @param int $flag bit mask to test
 * @return boolean true when the masked value is greater than zero
 */
public function flagIsActive($flag) {
    $masked = $this->flags & $flag;
    return $masked > 0;
}
1129
/**
 * Switch the given flag bit(s) on in $this->flags.
 *
 * @param int $flag bit mask to set
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1133
/**
 * Switch the given flag bit(s) off in $this->flags.
 *
 * @param int $flag bit mask to clear
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1137}
1138?> \ No newline at end of file 1151?> \ No newline at end of file
diff --git a/inc/3rdparty/makefulltextfeed.php b/inc/3rdparty/makefulltextfeed.php
index 2852c4c2..a081f88b 100755
--- a/inc/3rdparty/makefulltextfeed.php
+++ b/inc/3rdparty/makefulltextfeed.php
@@ -3,8 +3,8 @@
3// Author: Keyvan Minoukadeh 3// Author: Keyvan Minoukadeh
4// Copyright (c) 2013 Keyvan Minoukadeh 4// Copyright (c) 2013 Keyvan Minoukadeh
5// License: AGPLv3 5// License: AGPLv3
6// Version: 3.1 6// Version: 3.2
7// Date: 2013-03-05 7// Date: 2013-05-13
8// More info: http://fivefilters.org/content-only/ 8// More info: http://fivefilters.org/content-only/
9// Help: http://help.fivefilters.org 9// Help: http://help.fivefilters.org
10 10
@@ -25,14 +25,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
25 25
26// Usage 26// Usage
27// ----- 27// -----
28// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org 28// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
29// The following options can be passed in the querystring: 29// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
30// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))
31// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)
32// * API key: key=[api key] (optional, refer to config.php)
33// * Max entries to process: max=[max number of items] (optional)
34 30
35error_reporting(E_ALL ^ E_NOTICE); 31//error_reporting(E_ALL ^ E_NOTICE);
36ini_set("display_errors", 1); 32ini_set("display_errors", 1);
37@set_time_limit(120); 33@set_time_limit(120);
38 34
@@ -55,42 +51,8 @@ if (get_magic_quotes_gpc()) {
55 51
56// set include path 52// set include path
57set_include_path(realpath(dirname(__FILE__).'/libraries').PATH_SEPARATOR.get_include_path()); 53set_include_path(realpath(dirname(__FILE__).'/libraries').PATH_SEPARATOR.get_include_path());
58// Autoloading of classes allows us to include files only when they're 54
59// needed. If we've got a cached copy, for example, only Zend_Cache is loaded. 55require_once dirname(__FILE__).'/makefulltextfeedHelpers.php';
/**
 * Class autoloader for the bundled libraries.
 *
 * Looks the class name up in a fixed class => file table and, when found,
 * requires the file from the 'libraries' directory next to this script.
 * Loading on demand means only the classes actually used get included.
 *
 * @param string $class_name
 * @return boolean true when a mapped file was required, false for unknown classes
 */
function autoload($class_name) {
    static $dir = null;
    static $mapping = array(
        // Include FeedCreator for RSS/Atom creation
        'FeedWriter' => 'feedwriter/FeedWriter.php',
        'FeedItem' => 'feedwriter/FeedItem.php',
        // Include ContentExtractor and Readability for identifying and extracting content from URLs
        'ContentExtractor' => 'content-extractor/ContentExtractor.php',
        'SiteConfig' => 'content-extractor/SiteConfig.php',
        'Readability' => 'readability/Readability.php',
        // Include Humble HTTP Agent to allow parallel requests and response caching
        'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
        'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
        'CookieJar' => 'humble-http-agent/CookieJar.php',
        // Include Zend Cache to improve performance (cache results)
        'Zend_Cache' => 'Zend/Cache.php',
        // Language detect
        'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
        // HTML5 Lib
        'HTML5_Parser' => 'html5/Parser.php',
        // htmLawed - used if XSS filter is enabled (xss_filter)
        'htmLawed' => 'htmLawed/htmLawed.php'
    );

    // Resolve the base directory once and reuse it on later calls.
    if ($dir === null) {
        $dir = dirname(__FILE__).'/libraries/';
    }

    // Unknown class: report failure so the next registered autoloader can try.
    if (!isset($mapping[$class_name])) {
        return false;
    }

    debug("** Loading class $class_name ({$mapping[$class_name]})");
    require $dir.$mapping[$class_name];
    return true;
}
92spl_autoload_register('autoload');
93require dirname(__FILE__).'/libraries/simplepie/autoloader.php';
94 56
95//////////////////////////////// 57////////////////////////////////
96// Load config file 58// Load config file
@@ -110,8 +72,8 @@ header('X-Robots-Tag: noindex, nofollow');
110//////////////////////////////// 72////////////////////////////////
111// Check if service is enabled 73// Check if service is enabled
112//////////////////////////////// 74////////////////////////////////
113if (!$options->enabled) { 75if (!$options->enabled) {
114 die('The full-text RSS service is currently disabled'); 76 die('The full-text RSS service is currently disabled');
115} 77}
116 78
117//////////////////////////////// 79////////////////////////////////
@@ -155,8 +117,8 @@ $options->smart_cache = $options->smart_cache && function_exists('apc_inc');
155//////////////////////////////// 117////////////////////////////////
156// Check for feed URL 118// Check for feed URL
157//////////////////////////////// 119////////////////////////////////
158if (!isset($_GET['url'])) { 120if (!isset($_GET['url'])) {
159 die('No URL supplied'); 121 die('No URL supplied');
160} 122}
161$url = trim($_GET['url']); 123$url = trim($_GET['url']);
162if (strtolower(substr($url, 0, 7)) == 'feed://') { 124if (strtolower(substr($url, 0, 7)) == 'feed://') {
@@ -195,10 +157,12 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
195 if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']); 157 if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
196 if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']); 158 if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
197 if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); 159 if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
198 if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']); 160 if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);
199 if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); 161 if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
200 if (isset($_GET['xss'])) $redirect .= '&xss'; 162 if (isset($_GET['xss'])) $redirect .= '&xss';
201 if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; 163 if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
164 if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
165 if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
202 if (isset($_GET['debug'])) $redirect .= '&debug'; 166 if (isset($_GET['debug'])) $redirect .= '&debug';
203 if ($debug_mode) { 167 if ($debug_mode) {
204 debug('Redirecting to hide access key, follow URL below to continue'); 168 debug('Redirecting to hide access key, follow URL below to continue');
@@ -211,7 +175,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
211 175
212/////////////////////////////////////////////// 176///////////////////////////////////////////////
213// Set timezone. 177// Set timezone.
214// Prevents warnings, but needs more testing - 178// Prevents warnings, but needs more testing -
215// perhaps if timezone is set in php.ini we 179// perhaps if timezone is set in php.ini we
216// don't need to set it at all... 180// don't need to set it at all...
217/////////////////////////////////////////////// 181///////////////////////////////////////////////
@@ -233,7 +197,7 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int
233} 197}
234$key_index = ($valid_key) ? (int)$_GET['key'] : 0; 198$key_index = ($valid_key) ? (int)$_GET['key'] : 0;
235if (!$valid_key && $options->key_required) { 199if (!$valid_key && $options->key_required) {
236 die('A valid key must be supplied'); 200 die('A valid key must be supplied');
237} 201}
238if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') { 202if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
239 die('The entered key is invalid'); 203 die('The entered key is invalid');
@@ -285,6 +249,28 @@ if ($options->favour_feed_titles == 'user') {
285} 249}
286 250
287/////////////////////////////////////////////// 251///////////////////////////////////////////////
252// Include full content in output?
253///////////////////////////////////////////////
254if ($options->content === 'user') {
255 if (isset($_GET['content']) && $_GET['content'] === '0') {
256 $options->content = false;
257 } else {
258 $options->content = true;
259 }
260}
261
262///////////////////////////////////////////////
263// Include summaries in output?
264///////////////////////////////////////////////
265if ($options->summary === 'user') {
266 if (isset($_GET['summary']) && $_GET['summary'] === '1') {
267 $options->summary = true;
268 } else {
269 $options->summary = false;
270 }
271}
272
273///////////////////////////////////////////////
288// Exclude items if extraction fails 274// Exclude items if extraction fails
289/////////////////////////////////////////////// 275///////////////////////////////////////////////
290if ($options->exclude_items_on_fail === 'user') { 276if ($options->exclude_items_on_fail === 'user') {
@@ -306,15 +292,6 @@ if ($options->detect_language === 'user') {
306 $detect_language = $options->detect_language; 292 $detect_language = $options->detect_language;
307} 293}
308 294
309if ($detect_language >= 2) {
310 $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',
311 'cebuano' => 'ceb', // ISO 639-2
312 'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',
313 'hawaiian' => 'haw', // ISO 639-2
314 'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',
315 'pidgin' => 'cpe', // ISO 639-2
316 'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');
317}
318$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); 295$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
319 296
320///////////////////////////////////// 297/////////////////////////////////////
@@ -364,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *');
364////////////////////////////////// 341//////////////////////////////////
365if ($options->caching) { 342if ($options->caching) {
366 debug('Caching is enabled...'); 343 debug('Caching is enabled...');
367 $cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); 344 $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
368 $check_cache = true; 345 $check_cache = true;
369 if ($options->apc && $options->smart_cache) { 346 if ($options->apc && $options->smart_cache) {
370 apc_add("cache.$cache_id", 0, 10*60); 347 apc_add("cache.$cache_id", 0, 10*60);
@@ -415,6 +392,7 @@ if (!$debug_mode) {
415////////////////////////////////// 392//////////////////////////////////
416// Set up HTTP agent 393// Set up HTTP agent
417////////////////////////////////// 394//////////////////////////////////
395global $http;
418$http = new HumbleHttpAgent(); 396$http = new HumbleHttpAgent();
419$http->debug = $debug_mode; 397$http->debug = $debug_mode;
420$http->userAgentMap = $options->user_agents; 398$http->userAgentMap = $options->user_agents;
@@ -478,29 +456,6 @@ if ($html_only || !$result) {
478 $isDummyFeed = true; 456 $isDummyFeed = true;
479 unset($feed, $result); 457 unset($feed, $result);
480 // create single item dummy feed object 458 // create single item dummy feed object
/**
 * Stand-in feed object that wraps one URL as its only item.
 * Offers the feed accessors with fixed answers, so a single page can be
 * handled like a one-entry feed.
 */
class DummySingleItemFeed {
    /** The single wrapped item (a DummySingleItem). */
    public $item;

    public function __construct($url) {
        $this->item = new DummySingleItem($url);
    }

    /** Feed title: always empty. */
    public function get_title() {
        return '';
    }

    /** Feed description, derived from the item URL. */
    public function get_description() {
        return 'Content extracted from '.$this->item->url;
    }

    /** Feed link: the item URL. */
    public function get_link() {
        return $this->item->url;
    }

    /** No language information available. */
    public function get_language() {
        return false;
    }

    /** No feed image available. */
    public function get_image_url() {
        return false;
    }

    /** Always the one wrapped item; $start and $max are ignored. */
    public function get_items($start=0, $max=1) {
        return array($this->item);
    }
}
/**
 * Stand-in feed item that only knows its URL.
 * Every other accessor returns a fixed "nothing here" value.
 */
class DummySingleItem {
    /** URL this item represents. */
    public $url;

    public function __construct($url) {
        $this->url = $url;
    }

    /** Permalink: the URL given at construction. */
    public function get_permalink() {
        return $this->url;
    }

    /** No title known. */
    public function get_title() {
        return null;
    }

    /** No date known; $format is ignored. */
    public function get_date($format='') {
        return false;
    }

    /** No author known; $key is ignored. */
    public function get_author($key=0) {
        return null;
    }

    /** No authors known. */
    public function get_authors() {
        return null;
    }

    /** Description: always empty. */
    public function get_description() {
        return '';
    }

    /** No enclosure; both arguments are ignored. */
    public function get_enclosure($key=0, $prefer=null) {
        return null;
    }

    /** No enclosures. */
    public function get_enclosures() {
        return null;
    }

    /** No categories. */
    public function get_categories() {
        return null;
    }
}
504 $feed = new DummySingleItemFeed($url); 459 $feed = new DummySingleItemFeed($url);
505} 460}
506 461
@@ -524,7 +479,7 @@ if ($img_url = $feed->get_image_url()) {
524//////////////////////////////////////////// 479////////////////////////////////////////////
525// Loop through feed items 480// Loop through feed items
526//////////////////////////////////////////// 481////////////////////////////////////////////
527$items = $feed->get_items(0, $max); 482$items = $feed->get_items(0, $max);
528// Request all feed items in parallel (if supported) 483// Request all feed items in parallel (if supported)
529$urls_sanitized = array(); 484$urls_sanitized = array();
530$urls = array(); 485$urls = array();
@@ -606,24 +561,43 @@ foreach ($items as $key => $item) {
606 $is_single_page = false; 561 $is_single_page = false;
607 if ($single_page_response = getSinglePage($item, $html, $effective_url)) { 562 if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
608 $is_single_page = true; 563 $is_single_page = true;
609 $html = $single_page_response['body'];
610 // remove strange things
611 $html = str_replace('</[>', '', $html);
612 $html = convert_to_utf8($html, $single_page_response['headers']);
613 $effective_url = $single_page_response['effective_url']; 564 $effective_url = $single_page_response['effective_url'];
614 debug("Retrieved single-page view from $effective_url"); 565 // check if action defined for returned Content-Type
566 $mime_info = get_mime_action_info($single_page_response['headers']);
567 if (isset($mime_info['action'])) {
568 if ($mime_info['action'] == 'exclude') {
569 continue; // skip this feed item entry
570 } elseif ($mime_info['action'] == 'link') {
571 if ($mime_info['type'] == 'image') {
572 $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
573 } else {
574 $html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
575 }
576 $extracted_title = $mime_info['name'];
577 $do_content_extraction = false;
578 }
579 }
580 if ($do_content_extraction) {
581 $html = $single_page_response['body'];
582 // remove strange things
583 $html = str_replace('</[>', '', $html);
584 $html = convert_to_utf8($html, $single_page_response['headers']);
585 debug("Retrieved single-page view from $effective_url");
586 }
615 unset($single_page_response); 587 unset($single_page_response);
616 } 588 }
589 }
590 if ($do_content_extraction) {
617 debug('--------'); 591 debug('--------');
618 debug('Attempting to extract content'); 592 debug('Attempting to extract content');
619 $extract_result = $extractor->process($html, $effective_url); 593 $extract_result = $extractor->process($html, $effective_url);
620 $readability = $extractor->readability; 594 $readability = $extractor->readability;
621 $content_block = ($extract_result) ? $extractor->getContent() : null; 595 $content_block = ($extract_result) ? $extractor->getContent() : null;
622 $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; 596 $extracted_title = ($extract_result) ? $extractor->getTitle() : '';
623 // Deal with multi-page articles 597 // Deal with multi-page articles
624 //die('Next: '.$extractor->getNextPageUrl()); 598 //die('Next: '.$extractor->getNextPageUrl());
625 $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); 599 $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());
626 if ($options->multipage && $is_multi_page) { 600 if ($options->multipage && $is_multi_page && $options->content) {
627 debug('--------'); 601 debug('--------');
628 debug('Attempting to process multi-page article'); 602 debug('Attempting to process multi-page article');
629 $multi_page_urls = array(); 603 $multi_page_urls = array();
@@ -636,7 +610,7 @@ foreach ($items as $key => $item) {
636 // check it's not what we have already! 610 // check it's not what we have already!
637 if (!in_array($next_page_url, $multi_page_urls)) { 611 if (!in_array($next_page_url, $multi_page_urls)) {
638 // it's not, so let's attempt to fetch it 612 // it's not, so let's attempt to fetch it
639 $multi_page_urls[] = $next_page_url; 613 $multi_page_urls[] = $next_page_url;
640 $_prev_ref = $http->referer; 614 $_prev_ref = $http->referer;
641 if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) { 615 if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) {
642 // make sure mime type is not something with a different action associated 616 // make sure mime type is not something with a different action associated
@@ -661,13 +635,15 @@ foreach ($items as $key => $item) {
661 // did we successfully deal with this multi-page article? 635 // did we successfully deal with this multi-page article?
662 if (empty($multi_page_content)) { 636 if (empty($multi_page_content)) {
663 debug('Failed to extract all parts of multi-page article, so not going to include them'); 637 debug('Failed to extract all parts of multi-page article, so not going to include them');
664 $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>'; 638 $_page = $readability->dom->createElement('p');
639 $_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
640 $multi_page_content[] = $_page;
665 } 641 }
666 foreach ($multi_page_content as $_page) { 642 foreach ($multi_page_content as $_page) {
667 $_page = $content_block->ownerDocument->importNode($_page, true); 643 $_page = $content_block->ownerDocument->importNode($_page, true);
668 $content_block->appendChild($_page); 644 $content_block->appendChild($_page);
669 } 645 }
670 unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url); 646 unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);
671 } 647 }
672 } 648 }
673 // use extracted title for both feed and item title if we're using single-item dummy feed 649 // use extracted title for both feed and item title if we're using single-item dummy feed
@@ -695,7 +671,11 @@ foreach ($items as $key => $item) {
695 $html .= $item->get_description(); 671 $html .= $item->get_description();
696 } else { 672 } else {
697 $readability->clean($content_block, 'select'); 673 $readability->clean($content_block, 'select');
698 if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block); 674 // get base URL
675 $base_url = get_base_url($readability->dom);
676 if (!$base_url) $base_url = $effective_url;
677 // rewrite URLs
678 if ($options->rewrite_relative_urls) makeAbsolute($base_url, $content_block);
699 // footnotes 679 // footnotes
700 if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) { 680 if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
701 $readability->addFootnotes($content_block); 681 $readability->addFootnotes($content_block);
@@ -714,7 +694,7 @@ foreach ($items as $key => $item) {
714 } else { 694 } else {
715 $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML 695 $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
716 } 696 }
717 unset($content_block); 697 //unset($content_block);
718 // post-processing cleanup 698 // post-processing cleanup
719 $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html); 699 $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
720 if ($links == 'remove') { 700 if ($links == 'remove') {
@@ -727,130 +707,155 @@ foreach ($items as $key => $item) {
727 } 707 }
728 } 708 }
729 709
730 if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment 710 if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
731 $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); 711 $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
712 } else {
713 $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
714 }
715 // filter xss?
716 if ($xss_filter) {
717 debug('Filtering HTML to remove XSS');
718 $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
719 }
720
721 // add content
722 if ($options->summary === true) {
723 // get summary
724 $summary = '';
725 if (!$do_content_extraction) {
726 $summary = $html;
732 } else { 727 } else {
733 $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); 728 // Try to get first few paragraphs
734 } 729 if (isset($content_block) && ($content_block instanceof DOMElement)) {
735 // filter xss? 730 $_paras = $content_block->getElementsByTagName('p');
736 if ($xss_filter) { 731 foreach ($_paras as $_para) {
737 debug('Filtering HTML to remove XSS'); 732 $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';
738 $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); 733 if (strlen($summary) > 200) break;
739 }
740 $newitem->setDescription($html);
741
742 // set date
743 if ((int)$item->get_date('U') > 0) {
744 $newitem->setDate((int)$item->get_date('U'));
745 } elseif ($extractor->getDate()) {
746 $newitem->setDate($extractor->getDate());
747 }
748
749 // add authors
750 if ($authors = $item->get_authors()) {
751 foreach ($authors as $author) {
752 // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
753 if ($author->get_name() !== null) {
754 $newitem->addElement('dc:creator', $author->get_name());
755 } elseif ($author->get_email() !== null) {
756 $newitem->addElement('dc:creator', $author->get_email());
757 } 734 }
735 } else {
736 $summary = $html;
758 } 737 }
759 } elseif ($authors = $extractor->getAuthors()) { 738 }
760 //TODO: make sure the list size is reasonable 739 unset($_paras, $_para);
761 foreach ($authors as $author) { 740 $summary = get_excerpt($summary);
762 // TODO: xpath often selects authors from other articles linked from the page. 741 $newitem->setDescription($summary);
763 // for now choose first item 742 if ($options->content) $newitem->setElement('content:encoded', $html);
764 $newitem->addElement('dc:creator', $author); 743 } else {
765 break; 744 if ($options->content) $newitem->setDescription($html);
745 }
746
747 // set date
748 if ((int)$item->get_date('U') > 0) {
749 $newitem->setDate((int)$item->get_date('U'));
750 } elseif ($extractor->getDate()) {
751 $newitem->setDate($extractor->getDate());
752 }
753
754 // add authors
755 if ($authors = $item->get_authors()) {
756 foreach ($authors as $author) {
757 // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
758 if ($author->get_name() !== null) {
759 $newitem->addElement('dc:creator', $author->get_name());
760 } elseif ($author->get_email() !== null) {
761 $newitem->addElement('dc:creator', $author->get_email());
766 } 762 }
767 } 763 }
768 764 } elseif ($authors = $extractor->getAuthors()) {
769 // add language 765 //TODO: make sure the list size is reasonable
770 if ($detect_language) { 766 foreach ($authors as $author) {
771 $language = $extractor->getLanguage(); 767 // TODO: xpath often selects authors from other articles linked from the page.
772 if (!$language) $language = $feed->get_language(); 768 // for now choose first item
773 if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { 769 $newitem->addElement('dc:creator', $author);
774 try { 770 break;
775 if ($use_cld) { 771 }
776 // Use PHP-CLD extension 772 }
777 $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error 773
778 $res = $php_cld($text_sample); 774 // add language
779 if (is_array($res) && count($res) > 0) { 775 if ($detect_language) {
780 $language = $res[0]['code']; 776 $language = $extractor->getLanguage();
781 } 777 if (!$language) $language = $feed->get_language();
782 } else { 778 if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
783 //die('what'); 779 try {
784 // Use PEAR's Text_LanguageDetect 780 if ($use_cld) {
785 if (!isset($l)) { 781 // Use PHP-CLD extension
786 $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat'); 782 $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
787 } 783 $res = $php_cld($text_sample);
788 $l_result = $l->detect($text_sample, 1); 784 if (is_array($res) && count($res) > 0) {
789 if (count($l_result) > 0) { 785 $language = $res[0]['code'];
790 $language = $language_codes[key($l_result)]; 786 }
791 } 787 } else {
788 //die('what');
789 // Use PEAR's Text_LanguageDetect
790 if (!isset($l)) {
791 $l = new Text_LanguageDetect();
792 $l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")
793 }
794 $l_result = $l->detect($text_sample, 1);
795 if (count($l_result) > 0) {
796 $language = key($l_result);
792 } 797 }
793 } catch (Exception $e) {
794 //die('error: '.$e);
795 // do nothing
796 } 798 }
797 } 799 } catch (Exception $e) {
798 if ($language && (strlen($language) < 7)) { 800 //die('error: '.$e);
799 $newitem->addElement('dc:language', $language); 801 // do nothing
800 } 802 }
801 } 803 }
802 804 if ($language && (strlen($language) < 7)) {
803 // add MIME type (if it appeared in our exclusions lists) 805 $newitem->addElement('dc:language', $language);
804 if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
805 // add effective URL (URL after redirects)
806 if (isset($effective_url)) {
807 //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
808 //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi)
809 //temporary measure: use utf8_encode()
810 $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
811 } else {
812 $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
813 } 806 }
814 807 }
815 // add categories 808
816 if ($categories = $item->get_categories()) { 809 // add MIME type (if it appeared in our exclusions lists)
817 foreach ($categories as $category) { 810 if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
818 if ($category->get_label() !== null) { 811 // add effective URL (URL after redirects)
819 $newitem->addElement('category', $category->get_label()); 812 if (isset($effective_url)) {
820 } 813 //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
814 //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi)
815 //temporary measure: use utf8_encode()
816 $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
817 } else {
818 $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
819 }
820
821 // add categories
822 if ($categories = $item->get_categories()) {
823 foreach ($categories as $category) {
824 if ($category->get_label() !== null) {
825 $newitem->addElement('category', $category->get_label());
821 } 826 }
822 } 827 }
823 828 }
824 // check for enclosures 829
825 if ($options->keep_enclosures) { 830 // check for enclosures
826 if ($enclosures = $item->get_enclosures()) { 831 if ($options->keep_enclosures) {
827 foreach ($enclosures as $enclosure) { 832 if ($enclosures = $item->get_enclosures()) {
828 // thumbnails 833 foreach ($enclosures as $enclosure) {
829 foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { 834 // thumbnails
830 $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); 835 foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
831 } 836 $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
832 if (!$enclosure->get_link()) continue;
833 $enc = array();
834 // Media RSS spec ($enc): http://search.yahoo.com/mrss
835 // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
836 $enc['url'] = $enclosure->get_link();
837 if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
838 if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
839 if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
840 if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
841 if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
842 if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
843 if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
844 if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
845 if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
846 if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
847 if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
848 if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
849 $newitem->addElement('media:content', '', $enc);
850 } 837 }
838 if (!$enclosure->get_link()) continue;
839 $enc = array();
840 // Media RSS spec ($enc): http://search.yahoo.com/mrss
841 // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
842 $enc['url'] = $enclosure->get_link();
843 if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
844 if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
845 if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
846 if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
847 if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
848 if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
849 if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
850 if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
851 if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
852 if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
853 if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
854 if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
855 $newitem->addElement('media:content', '', $enc);
851 } 856 }
852 } 857 }
853 /* } */ 858 }
854 $output->addItem($newitem); 859 $output->addItem($newitem);
855 unset($html); 860 unset($html);
856 $item_count++; 861 $item_count++;
@@ -887,7 +892,7 @@ if (!$debug_mode) {
887 } 892 }
888 if ($add_to_cache) { 893 if ($add_to_cache) {
889 ob_start(); 894 ob_start();
890 $output->genarateFeed(); 895 $output->genarateFeed(false);
891 $output = ob_get_contents(); 896 $output = ob_get_contents();
892 ob_end_clean(); 897 ob_end_clean();
893 if ($html_only && $item_count == 0) { 898 if ($html_only && $item_count == 0) {
@@ -898,299 +903,8 @@ if (!$debug_mode) {
898 } 903 }
899 echo $output; 904 echo $output;
900 } else { 905 } else {
901 $output->genarateFeed(); 906 $output->genarateFeed(false);
902 } 907 }
903 if ($callback) echo ');'; 908 if ($callback) echo ');';
904} 909}
905 910
906///////////////////////////////
907// HELPER FUNCTIONS
908///////////////////////////////
909
/**
 * Decide whether a URL may be processed, based on the global $options.
 *
 * When $options->allowed_urls is non-empty it works as a whitelist: the URL
 * must contain at least one entry. Otherwise $options->blocked_urls works as
 * a blacklist: the URL must contain none of its entries. Entries are matched
 * as case-insensitive substrings.
 *
 * @param string $url URL to test
 * @return bool true if the URL is permitted, false otherwise
 */
function url_allowed($url) {
	global $options;
	$whitelist_mode = !empty($options->allowed_urls);
	$patterns = $whitelist_mode ? $options->allowed_urls : $options->blocked_urls;
	foreach ($patterns as $pattern) {
		if (stristr($url, $pattern) !== false) {
			// a hit permits the URL in whitelist mode and rejects it in blacklist mode
			return $whitelist_mode;
		}
	}
	// no entry matched: rejected by a whitelist, accepted by a blacklist
	return !$whitelist_mode;
}
930
931//////////////////////////////////////////////
932// Convert $html to UTF8
933// (uses HTTP headers and HTML to find encoding)
934// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
935//////////////////////////////////////////////
/**
 * Convert an HTML document to UTF-8.
 *
 * The source encoding is read from the last Content-Type header found in
 * $header (the last one, to cope with redirects). If that yields nothing (or
 * the bogus value 'none'), an XML declaration or <meta> element in the first
 * 50000 bytes of $html is consulted instead. Documents with no detectable
 * encoding are treated as UTF-8 and returned unconverted; everything else is
 * converted with SimplePie_Misc::change_encoding().
 *
 * Before conversion, bytes 130-159 (the Windows-1252 "smart" punctuation
 * range) are replaced with HTML entities, but only when the document declares
 * ISO-8859-1, or declares nothing and is not well-formed UTF-8. Doing the
 * byte-wise replacement on valid UTF-8 would destroy multi-byte characters
 * whose continuation bytes fall in that range (e.g. U+2019 = E2 80 99).
 *
 * @param string $html document body
 * @param string|array|null $header raw HTTP response headers (an array is joined with "\n")
 * @return string the document, converted to UTF-8 where an encoding was found
 */
function convert_to_utf8($html, $header=null)
{
	$encoding = null;
	if ($html || $header) {
		if (is_array($header)) $header = implode("\n", $header);
		if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
			// error parsing the response
			debug('Could not find Content-Type header in HTTP response');
		} else {
			$match = end($match); // get last matched element (in case of redirects)
			if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
		}
		// TODO: check to see if encoding is supported (can we convert it?)
		// If it's not, result will be empty string.
		// For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
		// Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
		if (!$encoding || $encoding == 'none') {
			// search for encoding in HTML - only look at the first 50000 characters
			// Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
			// TODO: improve this so it looks at smaller chunks first
			$html_head = substr($html, 0, 50000);
			if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
				$encoding = trim($match[1], '"\'');
			} elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
				$encoding = trim($match[1]);
			} elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
				foreach ($match[1] as $_test) {
					if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
						$encoding = trim($_m[1]);
						break;
					}
				}
			}
		}
		if (isset($encoding)) $encoding = trim($encoding);
		// trim is important here!
		if ($encoding) {
			$replace_smart_quotes = (strtolower($encoding) == 'iso-8859-1');
		} else {
			// No declared encoding: the document is about to be treated as UTF-8,
			// so only touch it if it is NOT already well-formed UTF-8.
			// preg_match() with the /u modifier fails on malformed UTF-8 input.
			$replace_smart_quotes = (preg_match('//u', (string)$html) !== 1);
		}
		if ($replace_smart_quotes) {
			// replace MS Word smart quotes (Windows-1252 bytes 130-159)
			$trans = array(
				chr(130) => '&sbquo;',  // Single Low-9 Quotation Mark
				chr(131) => '&fnof;',   // Latin Small Letter F With Hook
				chr(132) => '&bdquo;',  // Double Low-9 Quotation Mark
				chr(133) => '&hellip;', // Horizontal Ellipsis
				chr(134) => '&dagger;', // Dagger
				chr(135) => '&Dagger;', // Double Dagger
				chr(136) => '&circ;',   // Modifier Letter Circumflex Accent
				chr(137) => '&permil;', // Per Mille Sign
				chr(138) => '&Scaron;', // Latin Capital Letter S With Caron
				chr(139) => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
				chr(140) => '&OElig;',  // Latin Capital Ligature OE
				chr(145) => '&lsquo;',  // Left Single Quotation Mark
				chr(146) => '&rsquo;',  // Right Single Quotation Mark
				chr(147) => '&ldquo;',  // Left Double Quotation Mark
				chr(148) => '&rdquo;',  // Right Double Quotation Mark
				chr(149) => '&bull;',   // Bullet
				chr(150) => '&ndash;',  // En Dash
				chr(151) => '&mdash;',  // Em Dash
				chr(152) => '&tilde;',  // Small Tilde
				chr(153) => '&trade;',  // Trade Mark Sign
				chr(154) => '&scaron;', // Latin Small Letter S With Caron
				chr(155) => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
				chr(156) => '&oelig;',  // Latin Small Ligature OE
				chr(159) => '&Yuml;'    // Latin Capital Letter Y With Diaeresis
			);
			$html = strtr($html, $trans);
		}
		if (!$encoding) {
			debug('No character encoding found, so treating as UTF-8');
			$encoding = 'utf-8';
		} else {
			debug('Character encoding: '.$encoding);
			if (strtolower($encoding) != 'utf-8') {
				debug('Converting to UTF-8');
				$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
			}
		}
	}
	return $html;
}
1022
/**
 * Rewrite relative link and image URLs inside a DOM element to absolute ones.
 *
 * Processes the href of every <a> and the src of every <img> found below
 * $elem, and $elem itself when it is one of those tags.
 *
 * @param string $base URL that relative references are resolved against
 * @param DOMElement $elem root element to process (attributes are modified in place)
 * @return void
 */
function makeAbsolute($base, $elem) {
	$base_iri = new SimplePie_IRI($base);
	// remove '//' in URL path (used to prevent URLs from resolving properly)
	// TODO: check if this is still the case
	if (isset($base_iri->path)) {
		$base_iri->path = preg_replace('!//+!', '/', $base_iri->path);
	}
	$url_attributes = array('a'=>'href', 'img'=>'src');
	foreach ($url_attributes as $tag_name => $attr_name) {
		$nodes = $elem->getElementsByTagName($tag_name);
		// walk the node list from the last node to the first
		$index = $nodes->length;
		while ($index-- > 0) {
			makeAbsoluteAttr($base_iri, $nodes->item($index), $attr_name);
		}
		// the root element is not part of its own descendant list
		if (strtolower($elem->tagName) == $tag_name) {
			makeAbsoluteAttr($base_iri, $elem, $attr_name);
		}
	}
}
/**
 * Make the URL held in one attribute of an element absolute.
 *
 * Does nothing if the attribute is missing, if the URL already starts with
 * http:// or https://, or if SimplePie_IRI::absolutize() returns a falsy value.
 *
 * @param SimplePie_IRI|string $base base passed through to SimplePie_IRI::absolutize()
 * @param DOMElement $e element whose attribute is rewritten in place
 * @param string $attr attribute name, e.g. 'href' or 'src'
 * @return void
 */
function makeAbsoluteAttr($base, $e, $attr) {
	if (!$e->hasAttribute($attr)) return;
	// Trim leading and trailing white space. I don't really like this but
	// unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
	$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
	$url = str_replace(' ', '%20', $url);
	// Anchored at the start (as in makeAbsoluteStr): a relative URL that merely
	// contains "http://", e.g. "/out?to=http://other.org/", must still be resolved.
	if (preg_match('!^https?://!i', $url)) return;
	if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
		$e->setAttribute($attr, $absolute);
	}
}
/**
 * Resolve a URL string against a base URL.
 *
 * @param string $base base URL
 * @param string $url URL to resolve; returned untouched if it already starts with http:// or https://
 * @return mixed the absolute URL, or false if SimplePie_IRI::absolutize() returns a falsy value
 */
function makeAbsoluteStr($base, $url) {
	$base_iri = new SimplePie_IRI($base);
	// remove '//' in URL path (causes URLs not to resolve properly)
	if (isset($base_iri->path)) {
		$base_iri->path = preg_replace('!//+!', '/', $base_iri->path);
	}
	// already absolute
	if (preg_match('!^https?://!i', $url)) return $url;
	$resolved = SimplePie_IRI::absolutize($base_iri, $url);
	return $resolved ? $resolved : false;
}
1065// returns single page response, or false if not found
/**
 * Try to fetch the single-page version of an article.
 *
 * Uses the site config built by the global $extractor to obtain XPath
 * expressions (single_page_link, or single_page_link_in_feed which is
 * evaluated against the feed item's description instead of the page HTML),
 * takes the first URL they yield, resolves it against $url and requests it
 * through the global $http agent with the referer temporarily set to that URL.
 *
 * @param object $item feed item; only get_description() is called on it
 * @param string $html HTML of the page at $url
 * @param string $url URL of the current page
 * @return mixed the HTTP response array (status code below 300), or false if not found
 */
function getSinglePage($item, $html, $url) {
	global $http, $extractor;
	debug('Looking for site config files to see if single page link exists');
	$site_config = $extractor->buildSiteConfig($url, $html);
	$splink = null;
	if (!empty($site_config->single_page_link)) {
		$splink = $site_config->single_page_link;
	} elseif (!empty($site_config->single_page_link_in_feed)) {
		// single page link xpath is targeted at feed
		$splink = $site_config->single_page_link_in_feed;
		// so let's replace HTML with feed item description
		$html = $item->get_description();
	}
	if (!isset($splink)) return false;

	// Build DOM tree from HTML
	$readability = new Readability($html, $url);
	$xpath = new DOMXPath($readability->dom);
	// Loop through single_page_link xpath expressions until one yields a URL
	$single_page_url = null;
	foreach ($splink as $pattern) {
		$result = @$xpath->evaluate($pattern, $readability->dom);
		if (is_string($result)) {
			$single_page_url = trim($result);
			break;
		}
		if (!($result instanceof DOMNodeList) || !($result->length > 0)) continue;
		foreach ($result as $node) {
			if ($node instanceof DOMElement && $node->hasAttribute('href')) {
				$single_page_url = $node->getAttribute('href');
				break 2;
			}
			if ($node instanceof DOMAttr && $node->value) {
				$single_page_url = $node->value;
				break 2;
			}
		}
	}
	if (!isset($single_page_url)) return false;

	// If we've got URL, resolve against $url
	$single_page_url = makeAbsoluteStr($url, $single_page_url);
	// give up if it could not be resolved or is what we have already
	if (!$single_page_url || $single_page_url == $url) return false;

	// fetch it with the referer swapped, restoring the previous referer afterwards
	$_prev_ref = $http->referer;
	$http->referer = $single_page_url;
	$response = $http->get($single_page_url, true);
	$found = ($response && $response['status_code'] < 300);
	$http->referer = $_prev_ref;
	return $found ? $response : false;
}
1119
1120// based on content-type http header, decide what to do
1121// param: HTTP headers string
1122// return: array with keys: 'mime', 'type', 'subtype', 'action', 'name'
1123// e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
/**
 * Work out what to do with a response based on its Content-Type header.
 *
 * @param string $headers raw HTTP headers
 * @return array empty if no Content-Type header was found; otherwise keys
 *               'mime', 'type' and 'subtype' (all lower-cased), plus 'action'
 *               and 'name' when the global $options->content_type_exc has an
 *               entry for the full MIME type or, failing that, the bare type
 */
function get_mime_action_info($headers) {
	global $options;
	$info = array();
	if (!preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $headers, $parts)) {
		return $info;
	}
	// $parts[1] = full mime type (image/jpeg), [2] = first part (image), [3] = last part (jpeg)
	list(, $mime, $type, $subtype) = $parts;
	$info['mime'] = strtolower(trim($mime));
	$info['type'] = strtolower(trim($type));
	$info['subtype'] = strtolower(trim($subtype));
	// the full mime type is tried before the bare type
	foreach (array($info['mime'], $info['type']) as $candidate) {
		if (!isset($options->content_type_exc[$candidate])) continue;
		$info['action'] = $options->content_type_exc[$candidate]['action'];
		$info['name'] = $options->content_type_exc[$candidate]['name'];
		break;
	}
	return $info;
}
1146
/**
 * Strip Google Analytics tracking parameters (utm_*) from a URL.
 *
 * The remaining query string stays well formed: removing the first parameter
 * no longer leaves a dangling '&' without a '?', a separator left at the end
 * of the query is dropped, and a '#fragment' following the parameter is kept.
 *
 * @param string $url URL to clean
 * @return string the URL without utm_* parameters
 */
function remove_url_cruft($url) {
	// regex adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
	// https://gist.github.com/758177
	// Remove each "utm_x=value" pair (and the '&' after it) that follows '?' or '&';
	// the value stops at the next '&' or at the start of the fragment.
	$stripped = preg_replace('/(?<=[?&])utm_[a-z]+=[^&#]*&?/', '', $url);
	if ($stripped === null || $stripped === $url) {
		// nothing removed (or regex failure): hand back the input untouched
		return $url;
	}
	// drop a '?' or '&' left dangling at the end of the query
	return preg_replace('/[?&](?=#|$)/', '', $stripped);
}
1153
/**
 * Fill the {url} and {effective-url} placeholders in a template string.
 *
 * {url} becomes the permalink of the global $item and {effective-url} the
 * global $effective_url; both values are passed through htmlspecialchars().
 *
 * @param string $string template text
 * @return string the text with placeholders replaced (returned as-is when it equals '')
 */
function make_substitutions($string) {
	if ($string == '') return $string;
	global $item, $effective_url;
	$placeholders = array('{url}', '{effective-url}');
	$values = array(
		htmlspecialchars($item->get_permalink()),
		htmlspecialchars($effective_url)
	);
	// str_replace() applies the pairs one after another, in array order
	return str_replace($placeholders, $values, $string);
}
1161
/**
 * Return the shared Zend_Cache instance, creating it on first use.
 *
 * The file backend stores entries under $options->cache_dir, in the
 * 'rss-with-key' sub-directory when the global $valid_key is truthy and in
 * 'rss' otherwise.
 *
 * @return object the cache object produced by Zend_Cache::factory()
 */
function get_cache() {
	global $options, $valid_key;
	static $cache = null;
	if ($cache !== null) return $cache;

	$frontendOptions = array(
		'lifetime' => 10*60, // cache lifetime of 10 minutes
		'automatic_serialization' => false,
		'write_control' => false,
		'automatic_cleaning_factor' => $options->cache_cleanup,
		'ignore_user_abort' => false
	);
	// directory where to put the cache files
	$cache_subdir = ($valid_key) ? '/rss-with-key/' : '/rss/';
	$backendOptions = array(
		'cache_dir' => $options->cache_dir.$cache_subdir,
		'file_locking' => false,
		'read_control' => true,
		'read_control_type' => 'strlen',
		'hashed_directory_level' => $options->cache_directory_level,
		'hashed_directory_perm' => 0777,
		'cache_file_perm' => 0664,
		'file_name_prefix' => 'ff'
	);
	// getting a Zend_Cache_Core object
	$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
	return $cache;
}
1188
/**
 * Print a debug line and flush it out immediately.
 *
 * Only produces output when the global $debug_mode is truthy.
 *
 * @param string $msg message to print (emitted as "* <msg>\n")
 * @return void
 */
function debug($msg) {
	global $debug_mode;
	if (!$debug_mode) return;
	echo '* ', $msg, "\n";
	// push the line through PHP's output buffer and the system buffer
	ob_flush();
	flush();
}
diff --git a/inc/3rdparty/makefulltextfeedHelpers.php b/inc/3rdparty/makefulltextfeedHelpers.php
new file mode 100755
index 00000000..ac872ab8
--- /dev/null
+++ b/inc/3rdparty/makefulltextfeedHelpers.php
@@ -0,0 +1,389 @@
1<?php
2
3// Autoloading of classes allows us to include files only when they're
4// needed. If we've got a cached copy, for example, only Zend_Cache is loaded.
/**
 * Class autoloader for the bundled libraries.
 *
 * Looks the class name up in a fixed class-to-file map and requires the file
 * from the 'libraries' directory next to this script.
 *
 * @param string $class_name name of the class being loaded
 * @return bool true if a mapped file was required, false if the class is not in the map
 */
function autoload($class_name) {
	static $dir = null;
	static $mapping = array(
		// Include FeedCreator for RSS/Atom creation
		'FeedWriter' => 'feedwriter/FeedWriter.php',
		'FeedItem' => 'feedwriter/FeedItem.php',
		// Include ContentExtractor and Readability for identifying and extracting content from URLs
		'ContentExtractor' => 'content-extractor/ContentExtractor.php',
		'SiteConfig' => 'content-extractor/SiteConfig.php',
		'Readability' => 'readability/Readability.php',
		// Include Humble HTTP Agent to allow parallel requests and response caching
		'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
		'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
		'CookieJar' => 'humble-http-agent/CookieJar.php',
		// Include Zend Cache to improve performance (cache results)
		'Zend_Cache' => 'Zend/Cache.php',
		// Language detect
		'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
		// HTML5 Lib
		'HTML5_Parser' => 'html5/Parser.php',
		// htmLawed - used if XSS filter is enabled (xss_filter)
		'htmLawed' => 'htmLawed/htmLawed.php'
	);
	// not one of ours: report failure so other registered autoloaders can try
	if (!isset($mapping[$class_name])) {
		return false;
	}
	// resolve the libraries directory once and remember it
	if ($dir === null) {
		$dir = dirname(__FILE__).'/libraries/';
	}
	debug("** Loading class $class_name ({$mapping[$class_name]})");
	require $dir.$mapping[$class_name];
	return true;
}
37spl_autoload_register('autoload');
38require dirname(__FILE__).'/libraries/simplepie/autoloader.php';
39
40
/**
 * Stand-in feed object wrapping a single URL.
 *
 * Holds exactly one DummySingleItem built from the URL and answers the feed
 * accessors with fixed values.
 */
class DummySingleItemFeed {
	// the only item of this feed
	public $item;

	public function __construct($url) {
		$this->item = new DummySingleItem($url);
	}

	public function get_title() {
		return '';
	}

	public function get_description() {
		return 'Content extracted from '.$this->item->url;
	}

	public function get_link() {
		return $this->item->url;
	}

	public function get_language() {
		return false;
	}

	public function get_image_url() {
		return false;
	}

	// $start and $max are accepted but ignored: the single item is always returned
	public function get_items($start=0, $max=1) {
		return array($this->item);
	}
}
/**
 * Stand-in feed item for a single URL.
 *
 * Only the permalink carries data; every other accessor returns a fixed
 * empty value (null, false or '').
 */
class DummySingleItem {
	// URL this item stands for
	public $url;

	public function __construct($url) {
		$this->url = $url;
	}

	public function get_permalink() {
		return $this->url;
	}

	public function get_title() {
		return null;
	}

	public function get_date($format='') {
		return false;
	}

	public function get_author($key=0) {
		return null;
	}

	public function get_authors() {
		return null;
	}

	public function get_description() {
		return '';
	}

	public function get_enclosure($key=0, $prefer=null) {
		return null;
	}

	public function get_enclosures() {
		return null;
	}

	public function get_categories() {
		return null;
	}
}
64
65///////////////////////////////
66// HELPER FUNCTIONS
67///////////////////////////////
68
69// Adapted from WordPress
70// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
/**
 * Build a plain-text excerpt of at most $num_words words.
 *
 * Tags are stripped, runs of newlines, tabs and spaces are collapsed to a
 * single space, and $more is appended when the text had to be cut.
 *
 * @param string $text source text (may contain HTML)
 * @param int $num_words maximum number of words to keep
 * @param string|null $more suffix appended to a truncated excerpt; null means '&hellip;'
 * @return string the excerpt
 */
function get_excerpt($text, $num_words=55, $more=null) {
	if (null === $more) $more = '&hellip;';
	$text = strip_tags($text);
	//TODO: Check if word count is based on single characters (East Asian characters)
	// Ask for one piece more than needed: its presence tells us the text is longer.
	$words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
	$sep = ' ';
	if (count($words_array) > $num_words) {
		array_pop($words_array);
		$text = implode($sep, $words_array).$more;
	} else {
		$text = implode($sep, $words_array);
	}
	// trim whitespace at beginning or end of string
	// See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
	$trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
	// With the /u modifier preg_replace() returns null for text that is not
	// valid UTF-8; fall back to a byte-wise trim instead of losing the excerpt.
	return ($trimmed === null) ? trim($text) : $trimmed;
}
100
/**
 * Decide whether a URL may be processed, based on the global $options.
 *
 * When $options->allowed_urls is non-empty it works as a whitelist: the URL
 * must contain at least one entry. Otherwise $options->blocked_urls works as
 * a blacklist: the URL must contain none of its entries. Entries are matched
 * as case-insensitive substrings.
 *
 * @param string $url URL to test
 * @return bool true if the URL is permitted, false otherwise
 */
function url_allowed($url) {
	global $options;
	$whitelist_mode = !empty($options->allowed_urls);
	$patterns = $whitelist_mode ? $options->allowed_urls : $options->blocked_urls;
	foreach ($patterns as $pattern) {
		if (stristr($url, $pattern) !== false) {
			// a hit permits the URL in whitelist mode and rejects it in blacklist mode
			return $whitelist_mode;
		}
	}
	// no entry matched: rejected by a whitelist, accepted by a blacklist
	return !$whitelist_mode;
}
121
122//////////////////////////////////////////////
123// Convert $html to UTF8
124// (uses HTTP headers and HTML to find encoding)
125// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
126//////////////////////////////////////////////
/**
 * Convert an HTML document to UTF-8.
 *
 * The source encoding is read from the last Content-Type header found in
 * $header (the last one, to cope with redirects). If that yields nothing (or
 * the bogus value 'none'), an XML declaration or <meta> element in the first
 * 50000 bytes of $html is consulted instead. Documents with no detectable
 * encoding are treated as UTF-8 and returned unconverted; everything else is
 * converted with SimplePie_Misc::change_encoding().
 *
 * Before conversion, bytes 130-159 (the Windows-1252 "smart" punctuation
 * range) are replaced with HTML entities, but only when the document declares
 * ISO-8859-1, or declares nothing and is not well-formed UTF-8. Doing the
 * byte-wise replacement on valid UTF-8 would destroy multi-byte characters
 * whose continuation bytes fall in that range (e.g. U+2019 = E2 80 99).
 *
 * @param string $html document body
 * @param string|array|null $header raw HTTP response headers (an array is joined with "\n")
 * @return string the document, converted to UTF-8 where an encoding was found
 */
function convert_to_utf8($html, $header=null)
{
	$encoding = null;
	if ($html || $header) {
		if (is_array($header)) $header = implode("\n", $header);
		if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
			// error parsing the response
			debug('Could not find Content-Type header in HTTP response');
		} else {
			$match = end($match); // get last matched element (in case of redirects)
			if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
		}
		// TODO: check to see if encoding is supported (can we convert it?)
		// If it's not, result will be empty string.
		// For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
		// Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
		if (!$encoding || $encoding == 'none') {
			// search for encoding in HTML - only look at the first 50000 characters
			// Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
			// TODO: improve this so it looks at smaller chunks first
			$html_head = substr($html, 0, 50000);
			if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
				$encoding = trim($match[1], '"\'');
			} elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
				$encoding = trim($match[1]);
			} elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
				foreach ($match[1] as $_test) {
					if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
						$encoding = trim($_m[1]);
						break;
					}
				}
			}
		}
		if (isset($encoding)) $encoding = trim($encoding);
		// trim is important here!
		if ($encoding) {
			$replace_smart_quotes = (strtolower($encoding) == 'iso-8859-1');
		} else {
			// No declared encoding: the document is about to be treated as UTF-8,
			// so only touch it if it is NOT already well-formed UTF-8.
			// preg_match() with the /u modifier fails on malformed UTF-8 input.
			$replace_smart_quotes = (preg_match('//u', (string)$html) !== 1);
		}
		if ($replace_smart_quotes) {
			// replace MS Word smart quotes (Windows-1252 bytes 130-159)
			$trans = array(
				chr(130) => '&sbquo;',  // Single Low-9 Quotation Mark
				chr(131) => '&fnof;',   // Latin Small Letter F With Hook
				chr(132) => '&bdquo;',  // Double Low-9 Quotation Mark
				chr(133) => '&hellip;', // Horizontal Ellipsis
				chr(134) => '&dagger;', // Dagger
				chr(135) => '&Dagger;', // Double Dagger
				chr(136) => '&circ;',   // Modifier Letter Circumflex Accent
				chr(137) => '&permil;', // Per Mille Sign
				chr(138) => '&Scaron;', // Latin Capital Letter S With Caron
				chr(139) => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
				chr(140) => '&OElig;',  // Latin Capital Ligature OE
				chr(145) => '&lsquo;',  // Left Single Quotation Mark
				chr(146) => '&rsquo;',  // Right Single Quotation Mark
				chr(147) => '&ldquo;',  // Left Double Quotation Mark
				chr(148) => '&rdquo;',  // Right Double Quotation Mark
				chr(149) => '&bull;',   // Bullet
				chr(150) => '&ndash;',  // En Dash
				chr(151) => '&mdash;',  // Em Dash
				chr(152) => '&tilde;',  // Small Tilde
				chr(153) => '&trade;',  // Trade Mark Sign
				chr(154) => '&scaron;', // Latin Small Letter S With Caron
				chr(155) => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
				chr(156) => '&oelig;',  // Latin Small Ligature OE
				chr(159) => '&Yuml;'    // Latin Capital Letter Y With Diaeresis
			);
			$html = strtr($html, $trans);
		}
		if (!$encoding) {
			debug('No character encoding found, so treating as UTF-8');
			$encoding = 'utf-8';
		} else {
			debug('Character encoding: '.$encoding);
			if (strtolower($encoding) != 'utf-8') {
				debug('Converting to UTF-8');
				$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
			}
		}
	}
	return $html;
}
205
/**
 * Convert the link and image URLs inside a DOM element to absolute URLs.
 *
 * Every <a href> and <img src> found beneath $elem is handed to
 * makeAbsoluteAttr(); $elem itself is processed as well when its own tag name
 * is 'a' or 'img'.
 *
 * @param string $base base URL, parsed here into a SimplePie_IRI
 * @param DOMElement $elem element to process (must expose tagName and
 *                         getElementsByTagName())
 */
function makeAbsolute($base, $elem) {
	$baseIri = new SimplePie_IRI($base);
	// remove '//' in URL path (used to prevent URLs from resolving properly)
	// TODO: check if this is still the case
	if (isset($baseIri->path)) {
		$baseIri->path = preg_replace('!//+!', '/', $baseIri->path);
	}
	$targets = array('a'=>'href', 'img'=>'src');
	foreach ($targets as $tagName => $attrName) {
		$nodes = $elem->getElementsByTagName($tagName);
		// walk the node list from last to first, as the original loop did
		$idx = $nodes->length;
		while (--$idx >= 0) {
			makeAbsoluteAttr($baseIri, $nodes->item($idx), $attrName);
		}
		// the container itself may be a link or an image
		if (strtolower($elem->tagName) == $tagName) {
			makeAbsoluteAttr($baseIri, $elem, $attrName);
		}
	}
}
/**
 * Resolve a single URL attribute of an element against a base IRI.
 *
 * The attribute is left untouched when it is missing, when its value already
 * starts with http:// or https://, or when resolution fails.
 *
 * @param SimplePie_IRI|string $base base passed straight to SimplePie_IRI::absolutize()
 * @param DOMElement $e element carrying the attribute
 * @param string $attr attribute name, e.g. 'href' or 'src'
 */
function makeAbsoluteAttr($base, $e, $attr) {
	if (!$e->hasAttribute($attr)) {
		return;
	}
	// Trim leading and trailing white space. I don't really like this but
	// unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
	$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
	$url = str_replace(' ', '%20', $url);
	// The scheme test must be anchored: a relative URL that merely contains
	// an absolute one (e.g. "/out?u=http://example.com/") still has to be
	// resolved against the base. This matches the check in makeAbsoluteStr().
	if (preg_match('!^https?://!i', $url)) {
		return;
	}
	if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
		$e->setAttribute($attr, $absolute);
	}
}
/**
 * Resolve a URL string against a base URL.
 *
 * @param string $base base URL, parsed here into a SimplePie_IRI
 * @param string $url URL to resolve
 * @return mixed $url unchanged when it already starts with http:// or
 *               https://, otherwise the truthy result of
 *               SimplePie_IRI::absolutize(), or false when resolution fails
 */
function makeAbsoluteStr($base, $url) {
	$baseIri = new SimplePie_IRI($base);
	// remove '//' in URL path (causes URLs not to resolve properly)
	if (isset($baseIri->path)) {
		$baseIri->path = preg_replace('!//+!', '/', $baseIri->path);
	}
	// already absolute: nothing to resolve
	if (preg_match('!^https?://!i', $url)) {
		return $url;
	}
	$resolved = SimplePie_IRI::absolutize($baseIri, $url);
	return $resolved ? $resolved : false;
}
/**
 * Look for a "single page" version of an article and fetch it.
 *
 * The site config built for $url supplies XPath expressions, either in
 * single_page_link (evaluated against $html) or in single_page_link_in_feed
 * (evaluated against the feed item's description). The first expression that
 * yields a URL wins; that URL is resolved against $url and fetched through
 * the global $http agent, with the referer temporarily set to the
 * single-page URL itself.
 *
 * @param object $item feed item; only get_description() is called on it
 *                     (NOTE(review): presumably a SimplePie item - confirm)
 * @param string $html HTML of the page fetched from $url
 * @param string $url URL the HTML came from
 * @return array|false response array from $http->get() whose 'status_code'
 *                     is below 300, or false if no single page was found
 */
function getSinglePage($item, $html, $url) {
	global $http, $extractor;
	debug('Looking for site config files to see if single page link exists');
	$site_config = $extractor->buildSiteConfig($url, $html);
	$splink = null;
	if (!empty($site_config->single_page_link)) {
		$splink = $site_config->single_page_link;
	} elseif (!empty($site_config->single_page_link_in_feed)) {
		// single page link xpath is targeted at feed
		$splink = $site_config->single_page_link_in_feed;
		// so let's replace HTML with feed item description
		$html = $item->get_description();
	}
	if (!isset($splink)) {
		return false;
	}

	// Build DOM tree from HTML
	$readability = new Readability($html, $url);
	$xpath = new DOMXPath($readability->dom);
	// Loop through single_page_link xpath expressions
	$single_page_url = null;
	foreach ($splink as $pattern) {
		$elems = @$xpath->evaluate($pattern, $readability->dom);
		if (is_string($elems)) {
			// A string expression that matches nothing evaluates to ''.
			// Treat that as "no match" and keep trying the remaining
			// patterns instead of giving up on the first one.
			$candidate = trim($elems);
			if ($candidate !== '') {
				$single_page_url = $candidate;
				break;
			}
		} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
			// $node, not $item: do not clobber the feed item parameter
			foreach ($elems as $node) {
				if ($node instanceof DOMElement && $node->hasAttribute('href')) {
					$single_page_url = $node->getAttribute('href');
					break 2;
				} elseif ($node instanceof DOMAttr && $node->value) {
					$single_page_url = $node->value;
					break 2;
				}
			}
		}
	}
	if (!isset($single_page_url)) {
		return false;
	}

	// If we've got URL, resolve against $url
	$single_page_url = makeAbsoluteStr($url, $single_page_url);
	// check it's not what we have already!
	if (!$single_page_url || $single_page_url == $url) {
		return false;
	}

	// it's not, so let's try to fetch it...
	$_prev_ref = $http->referer;
	$http->referer = $single_page_url;
	$response = $http->get($single_page_url, true);
	// restore the referer whatever the outcome of the request
	$http->referer = $_prev_ref;
	if ($response && $response['status_code'] < 300) {
		return $response;
	}
	return false;
}
302
/**
 * Based on the Content-Type HTTP header, decide what to do with a response.
 *
 * The first Content-Type header found in $headers is parsed, and the global
 * $options->content_type_exc map is consulted: first for the full mime type
 * (e.g. image/jpeg), then for the bare type (e.g. image).
 *
 * @param string $headers HTTP headers string
 * @return array empty when no Content-Type header is found; otherwise the
 *               keys 'mime', 'type' and 'subtype' (all lower-cased), plus
 *               'action' and 'name' when a matching rule exists, e.g.
 *               array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
 */
function get_mime_action_info($headers) {
	global $options;
	// check if action defined for returned Content-Type
	$info = array();
	// The subtype class includes '.' so that vendor/personal tree types such
	// as application/vnd.ms-excel are captured whole rather than cut at the dot.
	if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+\.]+))!im', $headers, $match)) {
		// look for full mime type (e.g. image/jpeg) or just type (e.g. image)
		// match[1] = full mime type, e.g. image/jpeg
		// match[2] = first part, e.g. image
		// match[3] = last part, e.g. jpeg
		$info['mime'] = strtolower(trim($match[1]));
		$info['type'] = strtolower(trim($match[2]));
		$info['subtype'] = strtolower(trim($match[3]));
		foreach (array($info['mime'], $info['type']) as $_mime) {
			if (isset($options->content_type_exc[$_mime])) {
				$info['action'] = $options->content_type_exc[$_mime]['action'];
				$info['name'] = $options->content_type_exc[$_mime]['name'];
				break;
			}
		}
	}
	return $info;
}
329
/**
 * Strip Google Analytics campaign parameters (utm_*) from a URL.
 *
 * Idea adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
 * and https://gist.github.com/758177, but the query string is rebuilt
 * parameter by parameter so that:
 *  - removing the first parameter does not leave the rest attached with '&'
 *    and no '?' (e.g. "/p?utm_source=x&id=5" becomes "/p?id=5");
 *  - a trailing "#fragment" is never swallowed together with a utm_ value.
 * URLs that contain no utm_ parameter are returned unchanged.
 *
 * @param string $url
 * @return string
 */
function remove_url_cruft($url) {
	// set the fragment aside so it can't be mistaken for part of a value
	$fragment = '';
	$hashPos = strpos($url, '#');
	if ($hashPos !== false) {
		$fragment = substr($url, $hashPos);
		$url = substr($url, 0, $hashPos);
	}
	$queryPos = strpos($url, '?');
	if ($queryPos === false) {
		return $url.$fragment;
	}
	$params = explode('&', substr($url, $queryPos + 1));
	$kept = array();
	foreach ($params as $param) {
		// remove google analytics for the time being
		if (!preg_match('/^utm_[a-z]+=/', $param)) {
			$kept[] = $param;
		}
	}
	if (count($kept) === count($params)) {
		// nothing to strip: hand back the URL exactly as received
		return $url.$fragment;
	}
	$path = substr($url, 0, $queryPos);
	$query = implode('&', $kept);
	return $path.(($query !== '') ? '?'.$query : '').$fragment;
}
336
/**
 * Expand the {url} and {effective-url} placeholders in a string.
 *
 * {url} becomes the HTML-escaped permalink of the global $item and
 * {effective-url} the HTML-escaped global $effective_url. A string that
 * compares equal to '' is returned as-is without touching either global.
 *
 * @param string $string template text
 * @return string
 */
function make_substitutions($string) {
	if ($string == '') {
		return $string;
	}
	global $item, $effective_url;
	// str_replace() applies array pairs one after another, in order
	$placeholders = array('{url}', '{effective-url}');
	$values = array(
		htmlspecialchars($item->get_permalink()),
		htmlspecialchars($effective_url)
	);
	return str_replace($placeholders, $values, $string);
}
344
/**
 * Return the shared Zend_Cache_Core instance, creating it on first use.
 *
 * The instance is kept in a static variable, so the configuration below is
 * only read on the first call. Cache files go into a sub-directory of
 * $options->cache_dir chosen by the global $valid_key flag.
 *
 * @return Zend_Cache_Core
 */
function get_cache() {
	global $options, $valid_key;
	static $cache = null;
	if ($cache !== null) {
		return $cache;
	}
	// directory where to put the cache files
	$cacheDir = ($valid_key) ? $options->cache_dir.'/rss-with-key/' : $options->cache_dir.'/rss/';
	$backendOptions = array(
		'cache_dir' => $cacheDir,
		'file_locking' => false,
		'read_control' => true,
		'read_control_type' => 'strlen',
		'hashed_directory_level' => $options->cache_directory_level,
		'hashed_directory_perm' => 0777,
		'cache_file_perm' => 0664,
		'file_name_prefix' => 'ff'
	);
	$frontendOptions = array(
		'lifetime' => 10*60, // cache lifetime of 10 minutes
		'automatic_serialization' => false,
		'write_control' => false,
		'automatic_cleaning_factor' => $options->cache_cleanup,
		'ignore_user_abort' => false
	);
	// getting a Zend_Cache_Core object
	$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
	return $cache;
}
371
/**
 * Print a debug line and push it to the client straight away.
 *
 * Does nothing unless the global $debug_mode is truthy. The message is
 * written as "* <msg>\n".
 *
 * @param string $msg message to print
 */
function debug($msg) {
	global $debug_mode;
	if ($debug_mode) {
		echo '* ',$msg,"\n";
		// ob_flush() raises a notice when no output buffer is active,
		// so only call it when there is a buffer to flush
		if (ob_get_level() > 0) {
			ob_flush();
		}
		flush();
	}
}
380
/**
 * Read the href of the document's <base> element, if there is one.
 *
 * @param DOMDocument $dom document to inspect
 * @return string|false value of //head/base/@href, or false when the XPath
 *                      lookup yields an empty string or fails
 */
function get_base_url($dom) {
	$finder = new DOMXPath($dom);
	// string() yields '' when no <base href> exists
	$href = @$finder->evaluate('string(//head/base/@href)', $dom);
	return ($href !== '') ? $href : false;
}
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php
index 43b94e57..9b73b105 100644..100755
--- a/inc/3rdparty/simple_html_dom.php
+++ b/inc/3rdparty/simple_html_dom.php
@@ -34,7 +34,7 @@
34 * @author S.C. Chen <me578022@gmail.com> 34 * @author S.C. Chen <me578022@gmail.com>
35 * @author John Schlick 35 * @author John Schlick
36 * @author Rus Carroll 36 * @author Rus Carroll
37 * @version 1.5 ($Rev: 202 $) 37 * @version 1.5 ($Rev: 210 $)
38 * @package PlaceLocalInclude 38 * @package PlaceLocalInclude
39 * @subpackage simple_html_dom 39 * @subpackage simple_html_dom
40 */ 40 */
@@ -269,7 +269,10 @@ class simple_html_dom_node
269 { 269 {
270 return $this->children; 270 return $this->children;
271 } 271 }
272 if (isset($this->children[$idx])) return $this->children[$idx]; 272 if (isset($this->children[$idx]))
273 {
274 return $this->children[$idx];
275 }
273 return null; 276 return null;
274 } 277 }
275 278
@@ -330,14 +333,14 @@ class simple_html_dom_node
330 function find_ancestor_tag($tag) 333 function find_ancestor_tag($tag)
331 { 334 {
332 global $debug_object; 335 global $debug_object;
333 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 336 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
334 337
335 // Start by including ourselves in the comparison. 338 // Start by including ourselves in the comparison.
336 $returnDom = $this; 339 $returnDom = $this;
337 340
338 while (!is_null($returnDom)) 341 while (!is_null($returnDom))
339 { 342 {
340 if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $returnDom->tag); } 343 if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }
341 344
342 if ($returnDom->tag == $tag) 345 if ($returnDom->tag == $tag)
343 { 346 {
@@ -374,7 +377,7 @@ class simple_html_dom_node
374 $text = " with text: " . $this->text; 377 $text = " with text: " . $this->text;
375 } 378 }
376 } 379 }
377 $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text); 380 $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
378 } 381 }
379 382
380 if ($this->tag==='root') return $this->innertext(); 383 if ($this->tag==='root') return $this->innertext();
@@ -532,7 +535,9 @@ class simple_html_dom_node
532 foreach ($head as $k=>$v) 535 foreach ($head as $k=>$v)
533 { 536 {
534 if (!isset($found_keys[$k])) 537 if (!isset($found_keys[$k]))
538 {
535 $found_keys[$k] = 1; 539 $found_keys[$k] = 1;
540 }
536 } 541 }
537 } 542 }
538 543
@@ -554,7 +559,7 @@ class simple_html_dom_node
554 protected function seek($selector, &$ret, $lowercase=false) 559 protected function seek($selector, &$ret, $lowercase=false)
555 { 560 {
556 global $debug_object; 561 global $debug_object;
557 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 562 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
558 563
559 list($tag, $key, $val, $exp, $no_key) = $selector; 564 list($tag, $key, $val, $exp, $no_key) = $selector;
560 565
@@ -615,7 +620,7 @@ class simple_html_dom_node
615 // this is a normal search, we want the value of that attribute of the tag. 620 // this is a normal search, we want the value of that attribute of the tag.
616 $nodeKeyValue = $node->attr[$key]; 621 $nodeKeyValue = $node->attr[$key];
617 } 622 }
618 if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} 623 if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
619 624
620 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. 625 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
621 if ($lowercase) { 626 if ($lowercase) {
@@ -623,7 +628,7 @@ class simple_html_dom_node
623 } else { 628 } else {
624 $check = $this->match($exp, $val, $nodeKeyValue); 629 $check = $this->match($exp, $val, $nodeKeyValue);
625 } 630 }
626 if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));} 631 if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}
627 632
628 // handle multiple class 633 // handle multiple class
629 if (!$check && strcasecmp($key, 'class')===0) { 634 if (!$check && strcasecmp($key, 'class')===0) {
@@ -645,12 +650,12 @@ class simple_html_dom_node
645 unset($node); 650 unset($node);
646 } 651 }
647 // It's passed by reference so this is actually what this function returns. 652 // It's passed by reference so this is actually what this function returns.
648 if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);} 653 if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
649 } 654 }
650 655
651 protected function match($exp, $pattern, $value) { 656 protected function match($exp, $pattern, $value) {
652 global $debug_object; 657 global $debug_object;
653 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 658 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
654 659
655 switch ($exp) { 660 switch ($exp) {
656 case '=': 661 case '=':
@@ -672,7 +677,7 @@ class simple_html_dom_node
672 677
673 protected function parse_selector($selector_string) { 678 protected function parse_selector($selector_string) {
674 global $debug_object; 679 global $debug_object;
675 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 680 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
676 681
677 // pattern of CSS selectors, modified from mootools 682 // pattern of CSS selectors, modified from mootools
678 // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does. 683 // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
@@ -683,7 +688,7 @@ class simple_html_dom_node
683// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 688// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
684 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 689 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
685 preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); 690 preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
686 if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);} 691 if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}
687 692
688 $selectors = array(); 693 $selectors = array();
689 $result = array(); 694 $result = array();
@@ -718,12 +723,14 @@ class simple_html_dom_node
718 return $selectors; 723 return $selectors;
719 } 724 }
720 725
721 function __get($name) { 726 function __get($name)
727 {
722 if (isset($this->attr[$name])) 728 if (isset($this->attr[$name]))
723 { 729 {
724 return $this->convert_text($this->attr[$name]); 730 return $this->convert_text($this->attr[$name]);
725 } 731 }
726 switch ($name) { 732 switch ($name)
733 {
727 case 'outertext': return $this->outertext(); 734 case 'outertext': return $this->outertext();
728 case 'innertext': return $this->innertext(); 735 case 'innertext': return $this->innertext();
729 case 'plaintext': return $this->text(); 736 case 'plaintext': return $this->text();
@@ -732,22 +739,30 @@ class simple_html_dom_node
732 } 739 }
733 } 740 }
734 741
735 function __set($name, $value) { 742 function __set($name, $value)
736 switch ($name) { 743 {
744 global $debug_object;
745 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
746
747 switch ($name)
748 {
737 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 749 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
738 case 'innertext': 750 case 'innertext':
739 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; 751 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
740 return $this->_[HDOM_INFO_INNER] = $value; 752 return $this->_[HDOM_INFO_INNER] = $value;
741 } 753 }
742 if (!isset($this->attr[$name])) { 754 if (!isset($this->attr[$name]))
755 {
743 $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 756 $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
744 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 757 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
745 } 758 }
746 $this->attr[$name] = $value; 759 $this->attr[$name] = $value;
747 } 760 }
748 761
749 function __isset($name) { 762 function __isset($name)
750 switch ($name) { 763 {
764 switch ($name)
765 {
751 case 'outertext': return true; 766 case 'outertext': return true;
752 case 'innertext': return true; 767 case 'innertext': return true;
753 case 'plaintext': return true; 768 case 'plaintext': return true;
@@ -765,7 +780,7 @@ class simple_html_dom_node
765 function convert_text($text) 780 function convert_text($text)
766 { 781 {
767 global $debug_object; 782 global $debug_object;
768 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 783 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
769 784
770 $converted_text = $text; 785 $converted_text = $text;
771 786
@@ -777,7 +792,7 @@ class simple_html_dom_node
777 $sourceCharset = strtoupper($this->dom->_charset); 792 $sourceCharset = strtoupper($this->dom->_charset);
778 $targetCharset = strtoupper($this->dom->_target_charset); 793 $targetCharset = strtoupper($this->dom->_target_charset);
779 } 794 }
780 if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} 795 if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
781 796
782 if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) 797 if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
783 { 798 {
@@ -1045,10 +1060,10 @@ class simple_html_dom
1045 1060
1046 // prepare 1061 // prepare
1047 $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1062 $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1048 // strip out comments
1049 $this->remove_noise("'<!--(.*?)-->'is");
1050 // strip out cdata 1063 // strip out cdata
1051 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1064 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1065 // strip out comments
1066 $this->remove_noise("'<!--(.*?)-->'is");
1052 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1067 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1053 // Script tags removal now preceeds style tag removal. 1068 // Script tags removal now preceeds style tag removal.
1054 // strip out <script> tags 1069 // strip out <script> tags
@@ -1078,10 +1093,15 @@ class simple_html_dom
1078 // load html from file 1093 // load html from file
1079 function load_file() 1094 function load_file()
1080 { 1095 {
1096 //external error: NOT related to dom loading
1097 $extError=error_get_last();
1098
1081 $args = func_get_args(); 1099 $args = func_get_args();
1082 $this->load(call_user_func_array('file_get_contents', $args), true); 1100 $this->load(call_user_func_array('file_get_contents', $args), true);
1101
1083 // Throw an error if we can't properly load the dom. 1102 // Throw an error if we can't properly load the dom.
1084 if (($error=error_get_last())!==null) { 1103 $error=error_get_last();
1104 if ($error!==$extError) {
1085 $this->clear(); 1105 $this->clear();
1086 return false; 1106 return false;
1087 } 1107 }
@@ -1198,22 +1218,22 @@ class simple_html_dom
1198 if ($success) 1218 if ($success)
1199 { 1219 {
1200 $charset = $matches[1]; 1220 $charset = $matches[1];
1201 if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);} 1221 if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
1202 } 1222 }
1203 1223
1204 } 1224 }
1205 1225
1206 if (empty($charset)) 1226 if (empty($charset))
1207 { 1227 {
1208 $el = $this->root->find('meta[http-equiv=Content-Type]',0); 1228 $el = $this->root->find('meta[http-equiv=Content-Type]',0, true);
1209 if (!empty($el)) 1229 if (!empty($el))
1210 { 1230 {
1211 $fullvalue = $el->content; 1231 $fullvalue = $el->content;
1212 if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);} 1232 if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}
1213 1233
1214 if (!empty($fullvalue)) 1234 if (!empty($fullvalue))
1215 { 1235 {
1216 $success = preg_match('/charset=(.+)/', $fullvalue, $matches); 1236 $success = preg_match('/charset=(.+)/i', $fullvalue, $matches);
1217 if ($success) 1237 if ($success)
1218 { 1238 {
1219 $charset = $matches[1]; 1239 $charset = $matches[1];
@@ -1221,7 +1241,7 @@ class simple_html_dom
1221 else 1241 else
1222 { 1242 {
1223 // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 1243 // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
1224 if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} 1244 if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
1225 $charset = 'ISO-8859-1'; 1245 $charset = 'ISO-8859-1';
1226 } 1246 }
1227 } 1247 }
@@ -1231,14 +1251,19 @@ class simple_html_dom
1231 // If we couldn't find a charset above, then lets try to detect one based on the text we got... 1251 // If we couldn't find a charset above, then lets try to detect one based on the text we got...
1232 if (empty($charset)) 1252 if (empty($charset))
1233 { 1253 {
1234 // Have php try to detect the encoding from the text given to us. 1254 // Use this in case mb_detect_charset isn't installed/loaded on this machine.
1235 $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); 1255 $charset = false;
1236 if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);} 1256 if (function_exists('mb_detect_encoding'))
1257 {
1258 // Have php try to detect the encoding from the text given to us.
1259 $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );
1260 if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
1261 }
1237 1262
1238 // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... 1263 // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
1239 if ($charset === false) 1264 if ($charset === false)
1240 { 1265 {
1241 if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');} 1266 if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
1242 $charset = 'UTF-8'; 1267 $charset = 'UTF-8';
1243 } 1268 }
1244 } 1269 }
@@ -1246,11 +1271,11 @@ class simple_html_dom
1246 // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. 1271 // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
1247 if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) 1272 if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
1248 { 1273 {
1249 if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');} 1274 if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
1250 $charset = 'CP1252'; 1275 $charset = 'CP1252';
1251 } 1276 }
1252 1277
1253 if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);} 1278 if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}
1254 1279
1255 return $this->_charset = $charset; 1280 return $this->_charset = $charset;
1256 } 1281 }
@@ -1616,14 +1641,14 @@ class simple_html_dom
1616 protected function remove_noise($pattern, $remove_tag=false) 1641 protected function remove_noise($pattern, $remove_tag=false)
1617 { 1642 {
1618 global $debug_object; 1643 global $debug_object;
1619 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1644 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1620 1645
1621 $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); 1646 $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
1622 1647
1623 for ($i=$count-1; $i>-1; --$i) 1648 for ($i=$count-1; $i>-1; --$i)
1624 { 1649 {
1625 $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); 1650 $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
1626 if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); } 1651 if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }
1627 $idx = ($remove_tag) ? 0 : 1; 1652 $idx = ($remove_tag) ? 0 : 1;
1628 $this->noise[$key] = $matches[$i][$idx][0]; 1653 $this->noise[$key] = $matches[$i][$idx][0];
1629 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 1654 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
@@ -1641,7 +1666,7 @@ class simple_html_dom
1641 function restore_noise($text) 1666 function restore_noise($text)
1642 { 1667 {
1643 global $debug_object; 1668 global $debug_object;
1644 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1669 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1645 1670
1646 while (($pos=strpos($text, '___noise___'))!==false) 1671 while (($pos=strpos($text, '___noise___'))!==false)
1647 { 1672 {
@@ -1649,7 +1674,7 @@ class simple_html_dom
1649 if (strlen($text) > $pos+15) 1674 if (strlen($text) > $pos+15)
1650 { 1675 {
1651 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; 1676 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
1652 if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); } 1677 if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }
1653 1678
1654 if (isset($this->noise[$key])) 1679 if (isset($this->noise[$key]))
1655 { 1680 {
@@ -1674,7 +1699,7 @@ class simple_html_dom
1674 function search_noise($text) 1699 function search_noise($text)
1675 { 1700 {
1676 global $debug_object; 1701 global $debug_object;
1677 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1702 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1678 1703
1679 foreach($this->noise as $noiseElement) 1704 foreach($this->noise as $noiseElement)
1680 { 1705 {
diff --git a/inc/3rdparty/site_config/custom/dailymotion.com.txt b/inc/3rdparty/site_config/custom/dailymotion.com.txt
new file mode 100755
index 00000000..0cad808f
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/dailymotion.com.txt
@@ -0,0 +1,12 @@
1title: //title
2body: //iframe
3
4replace_string(<![CDATA[): _
5replace_string(]]>): _
6
7single_page_link: //link[@type='application/xml+oembed']
8
9prune: no
10tidy: no
11
12http://www.dailymotion.com/video/x1vk5oh_before-they-were-on-game-of-thrones_people
diff --git a/inc/3rdparty/site_config/custom/index.php b/inc/3rdparty/site_config/custom/index.php
new file mode 100644
index 00000000..a3d5f739
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/index.php
@@ -0,0 +1,3 @@
1<?php
2// this is here to prevent directory listing over the web
3?> \ No newline at end of file
diff --git a/inc/3rdparty/site_config/custom/mobile.lemondeinformatique.fr.txt b/inc/3rdparty/site_config/custom/mobile.lemondeinformatique.fr.txt
new file mode 100644
index 00000000..24aec5c3
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/mobile.lemondeinformatique.fr.txt
@@ -0,0 +1,6 @@
1title: //h2
2body: div[@id='illustration'] | //p
3prune: no
4tidy: no
5
6test_url: http://mobile.lemondeinformatique.fr/actualites/lire-les-datacenters-d-apple-google-et-facebook-eco-responsables-selon-greenpeace-le-monde-informatique-57122.html
diff --git a/inc/3rdparty/site_config/custom/ted.com.txt b/inc/3rdparty/site_config/custom/ted.com.txt
new file mode 100755
index 00000000..4940d2bc
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/ted.com.txt
@@ -0,0 +1,11 @@
1title: //title
2body: //div[@class='talk-article__body talk-transcript__body'] | //div[@class='media__image media__image--thumb talk-link__image']
3
4strip_id_or_class: talk-transcript__para__time
5
6single_page_link: //a[@id='hero-transcript-link']
7
8#prune: no
9tidy: no
10
11test_url: http://www.ted.com/talks/andrew_solomon_how_the_worst_moments_in_our_lives_make_us_who_we_are
diff --git a/inc/3rdparty/site_config/index.php b/inc/3rdparty/site_config/index.php
index a1b767fd..76ca8b3c 100644
--- a/inc/3rdparty/site_config/index.php
+++ b/inc/3rdparty/site_config/index.php
@@ -1,3 +1,2 @@
1<?php 1<?php
2// this is here to prevent directory listing over the web 2// this is here to prevent directory listing over the web \ No newline at end of file
3?> \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/.about.com.txt b/inc/3rdparty/site_config/standard/.about.com.txt
new file mode 100644
index 00000000..e1ebaee3
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/.about.com.txt
@@ -0,0 +1,14 @@
1body: //div[@id='articlebody']
2title: //h1
3author: //p[@id='by']//a
4
5next_page_link: //span[@class='next']/a
6# Not the same as below!
7
8prune: yes
9tidy: no
10
11# Annoying 'next' links plainly inside the article body
12strip: //*[text()[contains(.,'Next: ')]]
13
14test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm
diff --git a/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt
new file mode 100644
index 00000000..24c949e9
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt
@@ -0,0 +1,9 @@
1title: //div[@id='header']//h1[1]
2
3body: //div[@id='content']
4
5strip_id_or_class: toc
6
7prune: no
8
9test_url: http://moo.nac.uci.edu/~hjm/HOWTO_move_data.html
diff --git a/inc/3rdparty/site_config/standard/politico.com.txt b/inc/3rdparty/site_config/standard/politico.com.txt
index 121fd5b9..c5302d1b 100644..100755
--- a/inc/3rdparty/site_config/standard/politico.com.txt
+++ b/inc/3rdparty/site_config/standard/politico.com.txt
@@ -4,10 +4,14 @@ body://div[contains(@class,"story-text")]
4# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] 4# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"]
5 5
6next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a 6next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a
7next_page_link://div[contains(@class,"pagination")]/ol/li[contains(@class, "current")]/following-sibling::node()/a
7date://meta[@name="publish_date"]/@content 8date://meta[@name="publish_date"]/@content
8 9
9strip://div[contains(@class, "breadcrumbs")] 10strip://div[contains(@class, "breadcrumbs")]
10strip://a[contains(@class, "hidden")] 11strip://a[contains(@class, "hidden")]
11strip://div[contains(@class, "story-embed")] 12strip://div[contains(@class, "story-embed")]
12strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. 13strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/..
14strip://div[contains(@class, "story-interrupt")]
15strip://footer[contains(@class, "author-bio")]
16
13test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file 17test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/version.txt b/inc/3rdparty/site_config/standard/version.txt
index bf0d87ab..eaf01ebd 100644
--- a/inc/3rdparty/site_config/standard/version.txt
+++ b/inc/3rdparty/site_config/standard/version.txt
@@ -1 +1 @@
4 \ No newline at end of file 2013-05-12T22:53:07Z \ No newline at end of file