diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
commit | a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch) | |
tree | 80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc | |
parent | 96834a47b09985e1c82b82857fc108f20e8b8f2b (diff) | |
parent | 8038b38802769031e050c753fc0a388a2276629e (diff) | |
download | wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.gz wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.zst wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.zip |
Merge pull request #712 from wallabag/dev1.7.0
1.7, call me "Premium version"
Diffstat (limited to 'inc')
38 files changed, 11458 insertions, 4308 deletions
diff --git a/inc/3rdparty/config.php b/inc/3rdparty/config.php index e618117b..ec680d86 100755 --- a/inc/3rdparty/config.php +++ b/inc/3rdparty/config.php | |||
@@ -19,7 +19,7 @@ if (!isset($options)) $options = new stdClass(); | |||
19 | // Enable service | 19 | // Enable service |
20 | // ---------------------- | 20 | // ---------------------- |
21 | // Set this to false if you want to disable the service. | 21 | // Set this to false if you want to disable the service. |
22 | // If set to false, no feed is produced and users will | 22 | // If set to false, no feed is produced and users will |
23 | // be told that the service is disabled. | 23 | // be told that the service is disabled. |
24 | $options->enabled = true; | 24 | $options->enabled = true; |
25 | 25 | ||
@@ -43,10 +43,64 @@ $options->default_entries = 5; | |||
43 | // ---------------------- | 43 | // ---------------------- |
44 | // The maximum number of feed items to process when no access key is supplied. | 44 | // The maximum number of feed items to process when no access key is supplied. |
45 | // This limits the user-supplied &max=x value. For example, if the user | 45 | // This limits the user-supplied &max=x value. For example, if the user |
46 | // asks for 20 items to be processed (&max=20), if max_entries is set to | 46 | // asks for 20 items to be processed (&max=20), if max_entries is set to |
47 | // 10, only 10 will be processed. | 47 | // 10, only 10 will be processed. |
48 | $options->max_entries = 10; | 48 | $options->max_entries = 10; |
49 | 49 | ||
50 | // Full content | ||
51 | // ---------------------- | ||
52 | // By default Full-Text RSS includes the extracted content in the output. | ||
53 | // You can exclude this from the output by passing '&content=0' in the querystring. | ||
54 | // | ||
55 | // Possible values... | ||
56 | // Always include: true | ||
57 | // Never include: false | ||
58 | // Include unless user overrides (&content=0): 'user' (default) | ||
59 | // | ||
60 | // Note: currently this does not disable full content extraction. It simply omits it | ||
61 | // from the output. | ||
62 | $options->content = 'user'; | ||
63 | |||
64 | // Excerpts | ||
65 | // ---------------------- | ||
66 | // By default Full-Text RSS does not include excerpts in the output. | ||
67 | // You can enable this by passing '&summary=1' in the querystring. | ||
68 | // This will include a plain text excerpt from the extracted content. | ||
69 | // | ||
70 | // Possible values... | ||
71 | // Always include: true (recommended for new users) | ||
72 | // Never include: false | ||
73 | // Don't include unless user overrides (&summary=1): 'user' (default) | ||
74 | // | ||
75 | // Important: if both content and excerpts are requested, the excerpt will be | ||
76 | // placed in the description element and the full content inside content:encoded. | ||
77 | // If excerpts are not requested, the full content will go inside the description element. | ||
78 | // | ||
79 | // Why are we not returning both excerpts and content by default? | ||
80 | // Mainly for backward compatibility. | ||
81 | // Excerpts should appear in the feed item's description element. Previous versions | ||
82 | // of Full-Text RSS did not return excerpts, so the description element was always | ||
83 | // used for the full content (as recommended by the RSS advisory). When returning both, | ||
84 | // we need somewhere else to place the content (content:encoded). | ||
85 | // Having both enabled should not create any problems for news readers, but it may create | ||
86 | // problems for developers upgrading from one of our earlier versions who may now find | ||
87 | // their applications are returning excerpts instead of the full content they were | ||
88 | // expecting. To avoid such surprises for users who are upgrading Full-Text RSS, | ||
89 | // excerpts must be explicitly requested in the querystring by default. | ||
90 | // | ||
91 | // Why not use a different element name for excerpts? | ||
92 | // According to the RSS advisory: | ||
93 | // "Publishers who employ summaries should store the summary in description and | ||
94 | // the full content in content:encoded, ordering description first within the item. | ||
95 | // On items with no summary, the full content should be stored in description." | ||
96 | // See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded | ||
97 | // | ||
98 | // For more consistent element naming, we recommend new users set this option to true. | ||
99 | // The full content can still be excluded via the querystring, but the element names | ||
100 | // will not change: when $options->summary = true, the description element will always | ||
101 | // be reserved for the excerpt and content:encoded always for full content. | ||
102 | $options->summary = 'user'; | ||
103 | |||
50 | // Rewrite relative URLs | 104 | // Rewrite relative URLs |
51 | // ---------------------- | 105 | // ---------------------- |
52 | // With this enabled relative URLs found in the extracted content | 106 | // With this enabled relative URLs found in the extracted content |
@@ -67,7 +121,7 @@ $options->exclude_items_on_fail = 'user'; | |||
67 | // Enable multi-page support | 121 | // Enable multi-page support |
68 | // ------------------------- | 122 | // ------------------------- |
69 | // If enabled, we will try to follow next page links on multi-page articles. | 123 | // If enabled, we will try to follow next page links on multi-page articles. |
70 | // Currently this only happens for sites where next_page_link has been defined | 124 | // Currently this only happens for sites where next_page_link has been defined |
71 | // in a site config file. | 125 | // in a site config file. |
72 | $options->multipage = true; | 126 | $options->multipage = true; |
73 | 127 | ||
@@ -125,10 +179,10 @@ $options->detect_language = 1; | |||
125 | 179 | ||
126 | // Registration key | 180 | // Registration key |
127 | // --------------- | 181 | // --------------- |
128 | // The registration key is optional. It is not required to use Full-Text RSS, | 182 | // The registration key is optional. It is not required to use Full-Text RSS, |
129 | // and does not affect the normal operation of Full-Text RSS. It is currently | 183 | // and does not affect the normal operation of Full-Text RSS. It is currently |
130 | // only used on admin pages which help you update site patterns with the | 184 | // only used on admin pages which help you update site patterns with the |
131 | // latest version offered by FiveFilters.org. For these admin-related | 185 | // latest version offered by FiveFilters.org. For these admin-related |
132 | // tasks to complete, we will require a valid registration key. | 186 | // tasks to complete, we will require a valid registration key. |
133 | // If you would like one, you can purchase the latest version of Full-Text RSS | 187 | // If you would like one, you can purchase the latest version of Full-Text RSS |
134 | // at http://fivefilters.org/content-only/ | 188 | // at http://fivefilters.org/content-only/ |
@@ -144,12 +198,12 @@ $options->registration_key = ''; | |||
144 | // ---------------------- | 198 | // ---------------------- |
145 | // Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials. | 199 | // Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials. |
146 | // To use these pages, enter a password here and you'll be prompted for it when you try to access those pages. | 200 | // To use these pages, enter a password here and you'll be prompted for it when you try to access those pages. |
147 | // If no password or username is set, pages requiring admin privileges will be inaccessible. | 201 | // If no password or username is set, pages requiring admin privileges will be inaccessible. |
148 | // The default username is 'admin'. | 202 | // The default username is 'admin'. |
149 | // If overriding with an environment variable, separate username and password with a colon, e.g.: | 203 | // If overriding with an environment variable, separate username and password with a colon, e.g.: |
150 | // ftr_admin_credentials: admin:my-secret-password | 204 | // ftr_admin_credentials: admin:my-secret-password |
151 | // Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password'); | 205 | // Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password'); |
152 | $options->admin_credentials = array('username'=>'admin', 'password'=>'admin'); | 206 | $options->admin_credentials = array('username'=>'admin', 'password'=>''); |
153 | 207 | ||
154 | // URLs to allow | 208 | // URLs to allow |
155 | // ---------------------- | 209 | // ---------------------- |
@@ -178,12 +232,12 @@ $options->key_required = false; | |||
178 | // ---------------------- | 232 | // ---------------------- |
179 | // By default, when processing feeds, we assume item titles in the feed | 233 | // By default, when processing feeds, we assume item titles in the feed |
180 | // have not been truncated. So after processing web pages, the extracted titles | 234 | // have not been truncated. So after processing web pages, the extracted titles |
181 | // are not used in the generated feed. If you prefer to have extracted titles in | 235 | // are not used in the generated feed. If you prefer to have extracted titles in |
182 | // the feed you can either set this to false, in which case we will always favour | 236 | // the feed you can either set this to false, in which case we will always favour |
183 | // extracted titles. Alternatively, if set to 'user' (default) we'll use the | 237 | // extracted titles. Alternatively, if set to 'user' (default) we'll use the |
184 | // extracted title if you pass '&use_extracted_title' in the querystring. | 238 | // extracted title if you pass '&use_extracted_title' in the querystring. |
185 | // Possible values: | 239 | // Possible values: |
186 | // * Favour feed titles: true | 240 | // * Favour feed titles: true |
187 | // * Favour extracted titles: false | 241 | // * Favour extracted titles: false |
188 | // * Favour feed titles with user override: 'user' (default) | 242 | // * Favour feed titles with user override: 'user' (default) |
189 | // Note: this has no effect when the input URL is to a web page - in these cases | 243 | // Note: this has no effect when the input URL is to a web page - in these cases |
@@ -192,17 +246,17 @@ $options->favour_feed_titles = 'user'; | |||
192 | 246 | ||
193 | // Access keys (password protected access) | 247 | // Access keys (password protected access) |
194 | // ------------------------------------ | 248 | // ------------------------------------ |
195 | // NOTE: You do not need an API key from fivefilters.org to run your own | 249 | // NOTE: You do not need an API key from fivefilters.org to run your own |
196 | // copy of the code. This is here if you'd like to restrict access to | 250 | // copy of the code. This is here if you'd like to restrict access to |
197 | // _your_ copy. | 251 | // _your_ copy. |
198 | // Keys let you group users - those with a key and those without - and | 252 | // Keys let you group users - those with a key and those without - and |
199 | // restrict access to the service to those without a key. | 253 | // restrict access to the service to those without a key. |
200 | // If you want everyone to access the service in the same way, you can | 254 | // If you want everyone to access the service in the same way, you can |
201 | // leave the array below empty and ignore the access key options further down. | 255 | // leave the array below empty and ignore the access key options further down. |
202 | // The options further down let you control how the service should behave | 256 | // The options further down let you control how the service should behave |
203 | // in each mode. | 257 | // in each mode. |
204 | // Note: Explicitly including the index number (1 and 2 in the examples below) | 258 | // Note: Explicitly including the index number (1 and 2 in the examples below) |
205 | // is highly recommended (when generating feeds, we encode the key and | 259 | // is highly recommended (when generating feeds, we encode the key and |
206 | // refer to it by index number and hash). | 260 | // refer to it by index number and hash). |
207 | $options->api_keys = array(); | 261 | $options->api_keys = array(); |
208 | // Example: | 262 | // Example: |
@@ -232,13 +286,13 @@ $options->max_entries_with_key = 10; | |||
232 | // filter the resulting HTML for XSS attacks, making it redundant for | 286 | // filter the resulting HTML for XSS attacks, making it redundant for |
233 | // Full-Text RSS to do the same. Similarly with frameworks/CMS which display | 287 | // Full-Text RSS to do the same. Similarly with frameworks/CMS which display |
234 | // feed content - the content should be treated like any other user-submitted content. | 288 | // feed content - the content should be treated like any other user-submitted content. |
235 | // | 289 | // |
236 | // If you are writing an application yourself which is processing feeds generated by | 290 | // If you are writing an application yourself which is processing feeds generated by |
237 | // Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks | 291 | // Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks |
238 | // or enable this option. This might be useful if you are processing our generated | 292 | // or enable this option. This might be useful if you are processing our generated |
239 | // feeds with JavaScript on the client side - although there's client side xss | 293 | // feeds with JavaScript on the client side - although there's client side xss |
240 | // filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer | 294 | // filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer |
241 | // | 295 | // |
242 | // If enabled, we'll pass retrieved HTML content through htmLawed with | 296 | // If enabled, we'll pass retrieved HTML content through htmLawed with |
243 | // safe flag on and style attributes denied, see | 297 | // safe flag on and style attributes denied, see |
244 | // http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6 | 298 | // http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6 |
@@ -253,8 +307,8 @@ $options->xss_filter = 'user'; | |||
253 | // Allowed parsers | 307 | // Allowed parsers |
254 | // ---------------------- | 308 | // ---------------------- |
255 | // Full-Text RSS attempts to use PHP's libxml extension to process HTML. | 309 | // Full-Text RSS attempts to use PHP's libxml extension to process HTML. |
256 | // While fast, on some sites it may not always produce good results. | 310 | // While fast, on some sites it may not always produce good results. |
257 | // For these sites, you can specify an alternative HTML parser: | 311 | // For these sites, you can specify an alternative HTML parser: |
258 | // parser: html5lib | 312 | // parser: html5lib |
259 | // The html5lib parser is bundled with Full-Text RSS. | 313 | // The html5lib parser is bundled with Full-Text RSS. |
260 | // see http://code.google.com/p/html5lib/ | 314 | // see http://code.google.com/p/html5lib/ |
@@ -273,7 +327,7 @@ $options->cors = false; | |||
273 | 327 | ||
274 | // Use APC user cache? | 328 | // Use APC user cache? |
275 | // ---------------------- | 329 | // ---------------------- |
276 | // If enabled we will store site config files (when requested | 330 | // If enabled we will store site config files (when requested |
277 | // for the first time) in APC's user cache. Keys prefixed with 'sc.' | 331 | // for the first time) in APC's user cache. Keys prefixed with 'sc.' |
278 | // This improves performance by reducing disk access. | 332 | // This improves performance by reducing disk access. |
279 | // Note: this has no effect if APC is unavailable on your server. | 333 | // Note: this has no effect if APC is unavailable on your server. |
@@ -346,7 +400,7 @@ $options->rewrite_url = array( | |||
346 | // Valid actions: | 400 | // Valid actions: |
347 | // * 'exclude' - exclude this item from the result | 401 | // * 'exclude' - exclude this item from the result |
348 | // * 'link' - create HTML link to the item | 402 | // * 'link' - create HTML link to the item |
349 | $options->content_type_exc = array( | 403 | $options->content_type_exc = array( |
350 | 'application/pdf' => array('action'=>'link', 'name'=>'PDF'), | 404 | 'application/pdf' => array('action'=>'link', 'name'=>'PDF'), |
351 | 'image' => array('action'=>'link', 'name'=>'Image'), | 405 | 'image' => array('action'=>'link', 'name'=>'Image'), |
352 | 'audio' => array('action'=>'link', 'name'=>'Audio'), | 406 | 'audio' => array('action'=>'link', 'name'=>'Audio'), |
@@ -375,13 +429,13 @@ $options->cache_cleanup = 100; | |||
375 | /// DO NOT CHANGE ANYTHING BELOW THIS /////////// | 429 | /// DO NOT CHANGE ANYTHING BELOW THIS /////////// |
376 | ///////////////////////////////////////////////// | 430 | ///////////////////////////////////////////////// |
377 | 431 | ||
378 | if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1'); | 432 | if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2'); |
379 | 433 | ||
380 | if (basename(__FILE__) == 'config.php') { | 434 | if (basename(__FILE__) == 'config.php') { |
381 | if (file_exists(dirname(__FILE__).'/custom_config.php')) { | 435 | if (file_exists(dirname(__FILE__).'/custom_config.php')) { |
382 | require_once dirname(__FILE__).'/custom_config.php'; | 436 | require_once dirname(__FILE__).'/custom_config.php'; |
383 | } | 437 | } |
384 | 438 | ||
385 | // check for environment variables - often used on cloud platforms | 439 | // check for environment variables - often used on cloud platforms |
386 | // environment variables should be prefixed with 'ftr_', e.g. | 440 | // environment variables should be prefixed with 'ftr_', e.g. |
387 | // ftr_max_entries: 1 | 441 | // ftr_max_entries: 1 |
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php new file mode 100644 index 00000000..376b6133 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php | |||
@@ -0,0 +1,266 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * This should be a complete list of all HTML entities, mapped to their UTF-8 character codes. | ||
4 | * | ||
5 | * @author A. Grandt | ||
6 | * @copyright A. Grandt 2009-2013 | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 3.00 | ||
9 | */ | ||
10 | global $htmlEntities; | ||
11 | $htmlEntities = array(); | ||
12 | |||
13 | $htmlEntities["&quot;"] ="\x22"; // " ((double) quotation mark) | ||
14 | $htmlEntities["&amp;"] ="\x26"; // & (ampersand) | ||
15 | $htmlEntities["&apos;"] ="\x27"; // ' (apostrophe = apostrophe-quote) | ||
16 | $htmlEntities["&lt;"] ="\x3C"; // < (less-than sign) | ||
17 | $htmlEntities["&gt;"] ="\x3E"; // > (greater-than sign) | ||
18 | $htmlEntities[" "] ="\xC2\xA0"; //   (non-breaking space) | ||
19 | $htmlEntities["¡"] ="\xC2\xA1"; // ¡ (inverted exclamation mark) | ||
20 | $htmlEntities["¢"] ="\xC2\xA2"; // ¢ (cent) | ||
21 | $htmlEntities["£"] ="\xC2\xA3"; // £ (pound) | ||
22 | $htmlEntities["¤"] ="\xC2\xA4"; // ¤ (currency) | ||
23 | $htmlEntities["¥"] ="\xC2\xA5"; // ¥ (yen) | ||
24 | $htmlEntities["¦"] ="\xC2\xA6"; // ¦ (broken vertical bar) | ||
25 | $htmlEntities["§"] ="\xC2\xA7"; // § (section) | ||
26 | $htmlEntities["¨"] ="\xC2\xA8"; // ¨ (spacing diaeresis) | ||
27 | $htmlEntities["©"] ="\xC2\xA9"; // © (copyright) | ||
28 | $htmlEntities["ª"] ="\xC2\xAA"; // ª (feminine ordinal indicator) | ||
29 | $htmlEntities["«"] ="\xC2\xAB"; // « (angle quotation mark (left)) | ||
30 | $htmlEntities["¬"] ="\xC2\xAC"; // ¬ (negation) | ||
31 | $htmlEntities["­"] ="\xC2\xAD"; // ­ (soft hyphen) | ||
32 | $htmlEntities["®"] ="\xC2\xAE"; // ® (registered trademark) | ||
33 | $htmlEntities["¯"] ="\xC2\xAF"; // ¯ (spacing macron) | ||
34 | $htmlEntities["°"] ="\xC2\xB0"; // ° (degree) | ||
35 | $htmlEntities["±"] ="\xC2\xB1"; // ± (plus-or-minus) | ||
36 | $htmlEntities["²"] ="\xC2\xB2"; // ² (superscript 2) | ||
37 | $htmlEntities["³"] ="\xC2\xB3"; // ³ (superscript 3) | ||
38 | $htmlEntities["´"] ="\xC2\xB4"; // ´ (spacing acute) | ||
39 | $htmlEntities["µ"] ="\xC2\xB5"; // µ (micro) | ||
40 | $htmlEntities["¶"] ="\xC2\xB6"; // ¶ (paragraph) | ||
41 | $htmlEntities["·"] ="\xC2\xB7"; // · (middle dot) | ||
42 | $htmlEntities["¸"] ="\xC2\xB8"; // ¸ (spacing cedilla) | ||
43 | $htmlEntities["¹"] ="\xC2\xB9"; // ¹ (superscript 1) | ||
44 | $htmlEntities["º"] ="\xC2\xBA"; // º (masculine ordinal indicator) | ||
45 | $htmlEntities["»"] ="\xC2\xBB"; // » (angle quotation mark (right)) | ||
46 | $htmlEntities["¼"] ="\xC2\xBC"; // ¼ (fraction 1/4) | ||
47 | $htmlEntities["½"] ="\xC2\xBD"; // ½ (fraction 1/2) | ||
48 | $htmlEntities["¾"] ="\xC2\xBE"; // ¾ (fraction 3/4) | ||
49 | $htmlEntities["¿"] ="\xC2\xBF"; // ¿ (inverted question mark) | ||
50 | $htmlEntities["À"] ="\xC3\x80"; // À (capital a, grave accent) | ||
51 | $htmlEntities["Á"] ="\xC3\x81"; // Á (capital a, acute accent) | ||
52 | $htmlEntities["Â"] ="\xC3\x82"; // Â (capital a, circumflex accent) | ||
53 | $htmlEntities["Ã"] ="\xC3\x83"; // Ã (capital a, tilde) | ||
54 | $htmlEntities["Ä"] ="\xC3\x84"; // Ä (capital a, umlaut mark) | ||
55 | $htmlEntities["Å"] ="\xC3\x85"; // Å (capital a, ring) | ||
56 | $htmlEntities["Æ"] ="\xC3\x86"; // Æ (capital ae) | ||
57 | $htmlEntities["Ç"] ="\xC3\x87"; // Ç (capital c, cedilla) | ||
58 | $htmlEntities["È"] ="\xC3\x88"; // È (capital e, grave accent) | ||
59 | $htmlEntities["É"] ="\xC3\x89"; // É (capital e, acute accent) | ||
60 | $htmlEntities["Ê"] ="\xC3\x8A"; // Ê (capital e, circumflex accent) | ||
61 | $htmlEntities["Ë"] ="\xC3\x8B"; // Ë (capital e, umlaut mark) | ||
62 | $htmlEntities["Ì"] ="\xC3\x8C"; // Ì (capital i, grave accent) | ||
63 | $htmlEntities["Í"] ="\xC3\x8D"; // Í (capital i, acute accent) | ||
64 | $htmlEntities["Î"] ="\xC3\x8E"; // Î (capital i, circumflex accent) | ||
65 | $htmlEntities["Ï"] ="\xC3\x8F"; // Ï (capital i, umlaut mark) | ||
66 | $htmlEntities["Ð"] ="\xC3\x90"; // Ð (capital eth, Icelandic) | ||
67 | $htmlEntities["Ñ"] ="\xC3\x91"; // Ñ (capital n, tilde) | ||
68 | $htmlEntities["Ò"] ="\xC3\x92"; // Ò (capital o, grave accent) | ||
69 | $htmlEntities["Ó"] ="\xC3\x93"; // Ó (capital o, acute accent) | ||
70 | $htmlEntities["Ô"] ="\xC3\x94"; // Ô (capital o, circumflex accent) | ||
71 | $htmlEntities["Õ"] ="\xC3\x95"; // Õ (capital o, tilde) | ||
72 | $htmlEntities["Ö"] ="\xC3\x96"; // Ö (capital o, umlaut mark) | ||
73 | $htmlEntities["×"] ="\xC3\x97"; // × (multiplication) | ||
74 | $htmlEntities["Ø"] ="\xC3\x98"; // Ø (capital o, slash) | ||
75 | $htmlEntities["Ù"] ="\xC3\x99"; // Ù (capital u, grave accent) | ||
76 | $htmlEntities["Ú"] ="\xC3\x9A"; // Ú (capital u, acute accent) | ||
77 | $htmlEntities["Û"] ="\xC3\x9B"; // Û (capital u, circumflex accent) | ||
78 | $htmlEntities["Ü"] ="\xC3\x9C"; // Ü (capital u, umlaut mark) | ||
79 | $htmlEntities["Ý"] ="\xC3\x9D"; // Ý (capital y, acute accent) | ||
80 | $htmlEntities["Þ"] ="\xC3\x9E"; // Þ (capital THORN, Icelandic) | ||
81 | $htmlEntities["ß"] ="\xC3\x9F"; // ß (small sharp s, German) | ||
82 | $htmlEntities["à"] ="\xC3\xA0"; // à (small a, grave accent) | ||
83 | $htmlEntities["á"] ="\xC3\xA1"; // á (small a, acute accent) | ||
84 | $htmlEntities["â"] ="\xC3\xA2"; // â (small a, circumflex accent) | ||
85 | $htmlEntities["ã"] ="\xC3\xA3"; // ã (small a, tilde) | ||
86 | $htmlEntities["ä"] ="\xC3\xA4"; // ä (small a, umlaut mark) | ||
87 | $htmlEntities["å"] ="\xC3\xA5"; // å (small a, ring) | ||
88 | $htmlEntities["æ"] ="\xC3\xA6"; // æ (small ae) | ||
89 | $htmlEntities["ç"] ="\xC3\xA7"; // ç (small c, cedilla) | ||
90 | $htmlEntities["è"] ="\xC3\xA8"; // è (small e, grave accent) | ||
91 | $htmlEntities["é"] ="\xC3\xA9"; // é (small e, acute accent) | ||
92 | $htmlEntities["ê"] ="\xC3\xAA"; // ê (small e, circumflex accent) | ||
93 | $htmlEntities["ë"] ="\xC3\xAB"; // ë (small e, umlaut mark) | ||
94 | $htmlEntities["ì"] ="\xC3\xAC"; // ì (small i, grave accent) | ||
95 | $htmlEntities["í"] ="\xC3\xAD"; // í (small i, acute accent) | ||
96 | $htmlEntities["î"] ="\xC3\xAE"; // î (small i, circumflex accent) | ||
97 | $htmlEntities["ï"] ="\xC3\xAF"; // ï (small i, umlaut mark) | ||
98 | $htmlEntities["ð"] ="\xC3\xB0"; // ð (small eth, Icelandic) | ||
99 | $htmlEntities["ñ"] ="\xC3\xB1"; // ñ (small n, tilde) | ||
100 | $htmlEntities["ò"] ="\xC3\xB2"; // ò (small o, grave accent) | ||
101 | $htmlEntities["ó"] ="\xC3\xB3"; // ó (small o, acute accent) | ||
102 | $htmlEntities["ô"] ="\xC3\xB4"; // ô (small o, circumflex accent) | ||
103 | $htmlEntities["õ"] ="\xC3\xB5"; // õ (small o, tilde) | ||
104 | $htmlEntities["ö"] ="\xC3\xB6"; // ö (small o, umlaut mark) | ||
105 | $htmlEntities["÷"] ="\xC3\xB7"; // ÷ (division) | ||
106 | $htmlEntities["ø"] ="\xC3\xB8"; // ø (small o, slash) | ||
107 | $htmlEntities["ù"] ="\xC3\xB9"; // ù (small u, grave accent) | ||
108 | $htmlEntities["ú"] ="\xC3\xBA"; // ú (small u, acute accent) | ||
109 | $htmlEntities["û"] ="\xC3\xBB"; // û (small u, circumflex accent) | ||
110 | $htmlEntities["ü"] ="\xC3\xBC"; // ü (small u, umlaut mark) | ||
111 | $htmlEntities["ý"] ="\xC3\xBD"; // ý (small y, acute accent) | ||
112 | $htmlEntities["þ"] ="\xC3\xBE"; // þ (small thorn, Icelandic) | ||
113 | $htmlEntities["ÿ"] ="\xC3\xBF"; // ÿ (small y, umlaut mark) | ||
114 | $htmlEntities["Œ"] ="\xC5\x92"; // Œ (capital ligature OE) | ||
115 | $htmlEntities["œ"] ="\xC5\x93"; // œ (small ligature oe) | ||
116 | $htmlEntities["Š"] ="\xC5\xA0"; // Š (capital S with caron) | ||
117 | $htmlEntities["š"] ="\xC5\xA1"; // š (small S with caron) | ||
118 | $htmlEntities["Ÿ"] ="\xC5\xB8"; // Ÿ (capital Y with diaeresis) | ||
119 | $htmlEntities["ƒ"] ="\xC6\x92"; // ƒ (f with hook) | ||
120 | $htmlEntities["ˆ"] ="\xCB\x86"; // ˆ (modifier letter circumflex accent) | ||
121 | $htmlEntities["˜"] ="\xCB\x9C"; // ˜ (small tilde) | ||
122 | $htmlEntities["Α"] ="\xCE\x91"; // Α (Alpha) | ||
123 | $htmlEntities["Β"] ="\xCE\x92"; // Β (Beta) | ||
124 | $htmlEntities["Γ"] ="\xCE\x93"; // Γ (Gamma) | ||
125 | $htmlEntities["Δ"] ="\xCE\x94"; // Δ (Delta) | ||
126 | $htmlEntities["Ε"] ="\xCE\x95"; // Ε (Epsilon) | ||
127 | $htmlEntities["Ζ"] ="\xCE\x96"; // Ζ (Zeta) | ||
128 | $htmlEntities["Η"] ="\xCE\x97"; // Η (Eta) | ||
129 | $htmlEntities["Θ"] ="\xCE\x98"; // Θ (Theta) | ||
130 | $htmlEntities["Ι"] ="\xCE\x99"; // Ι (Iota) | ||
131 | $htmlEntities["Κ"] ="\xCE\x9A"; // Κ (Kappa) | ||
132 | $htmlEntities["Λ"] ="\xCE\x9B"; // Λ (Lambda) | ||
133 | $htmlEntities["Μ"] ="\xCE\x9C"; // Μ (Mu) | ||
134 | $htmlEntities["Ν"] ="\xCE\x9D"; // Ν (Nu) | ||
135 | $htmlEntities["Ξ"] ="\xCE\x9E"; // Ξ (Xi) | ||
136 | $htmlEntities["Ο"] ="\xCE\x9F"; // Ο (Omicron) | ||
137 | $htmlEntities["Π"] ="\xCE\xA0"; // Π (Pi) | ||
138 | $htmlEntities["Ρ"] ="\xCE\xA1"; // Ρ (Rho) | ||
139 | $htmlEntities["Σ"] ="\xCE\xA3"; // Σ (Sigma) | ||
140 | $htmlEntities["Τ"] ="\xCE\xA4"; // Τ (Tau) | ||
141 | $htmlEntities["Υ"] ="\xCE\xA5"; // Υ (Upsilon) | ||
142 | $htmlEntities["Φ"] ="\xCE\xA6"; // Φ (Phi) | ||
143 | $htmlEntities["Χ"] ="\xCE\xA7"; // Χ (Chi) | ||
144 | $htmlEntities["Ψ"] ="\xCE\xA8"; // Ψ (Psi) | ||
145 | $htmlEntities["Ω"] ="\xCE\xA9"; // Ω (Omega) | ||
146 | $htmlEntities["α"] ="\xCE\xB1"; // α (alpha) | ||
147 | $htmlEntities["β"] ="\xCE\xB2"; // β (beta) | ||
148 | $htmlEntities["γ"] ="\xCE\xB3"; // γ (gamma) | ||
149 | $htmlEntities["δ"] ="\xCE\xB4"; // δ (delta) | ||
150 | $htmlEntities["ε"] ="\xCE\xB5"; // ε (epsilon) | ||
151 | $htmlEntities["ζ"] ="\xCE\xB6"; // ζ (zeta) | ||
152 | $htmlEntities["η"] ="\xCE\xB7"; // η (eta) | ||
153 | $htmlEntities["θ"] ="\xCE\xB8"; // θ (theta) | ||
154 | $htmlEntities["ι"] ="\xCE\xB9"; // ι (iota) | ||
155 | $htmlEntities["κ"] ="\xCE\xBA"; // κ (kappa) | ||
156 | $htmlEntities["λ"] ="\xCE\xBB"; // λ (lambda) | ||
157 | $htmlEntities["μ"] ="\xCE\xBC"; // μ (mu) | ||
158 | $htmlEntities["ν"] ="\xCE\xBD"; // ν (nu) | ||
159 | $htmlEntities["ξ"] ="\xCE\xBE"; // ξ (xi) | ||
160 | $htmlEntities["ο"] ="\xCE\xBF"; // ο (omicron) | ||
161 | $htmlEntities["π"] ="\xCF\x80"; // π (pi) | ||
162 | $htmlEntities["ρ"] ="\xCF\x81"; // ρ (rho) | ||
163 | $htmlEntities["ς"] ="\xCF\x82"; // ς (sigmaf) | ||
164 | $htmlEntities["σ"] ="\xCF\x83"; // σ (sigma) | ||
165 | $htmlEntities["τ"] ="\xCF\x84"; // τ (tau) | ||
166 | $htmlEntities["υ"] ="\xCF\x85"; // υ (upsilon) | ||
167 | $htmlEntities["φ"] ="\xCF\x86"; // φ (phi) | ||
168 | $htmlEntities["χ"] ="\xCF\x87"; // χ (chi) | ||
169 | $htmlEntities["ψ"] ="\xCF\x88"; // ψ (psi) | ||
170 | $htmlEntities["ω"] ="\xCF\x89"; // ω (omega) | ||
171 | $htmlEntities["ϑ"] ="\xCF\x91"; // ϑ (theta symbol) | ||
172 | $htmlEntities["ϒ"] ="\xCF\x92"; // ϒ (upsilon symbol) | ||
173 | $htmlEntities["ϖ"] ="\xCF\x96"; // ϖ (pi symbol) | ||
174 | $htmlEntities[" "] ="\xE2\x80\x82"; //   (en space) | ||
175 | $htmlEntities[" "] ="\xE2\x80\x83"; //   (em space) | ||
176 | $htmlEntities[" "] ="\xE2\x80\x89"; //   (thin space) | ||
177 | $htmlEntities["‌"] ="\xE2\x80\x8C"; // ‌ (zero width non-joiner) | ||
178 | $htmlEntities["‍"] ="\xE2\x80\x8D"; // ‍ (zero width joiner) | ||
179 | $htmlEntities["‎"] ="\xE2\x80\x8E"; // ‎ (left-to-right mark) | ||
180 | $htmlEntities["‏"] ="\xE2\x80\x8F"; // ‏ (right-to-left mark) | ||
181 | $htmlEntities["–"] ="\xE2\x80\x93"; // – (en dash) | ||
182 | $htmlEntities["—"] ="\xE2\x80\x94"; // — (em dash) | ||
183 | $htmlEntities["‘"] ="\xE2\x80\x98"; // ‘ (left single quotation mark) | ||
184 | $htmlEntities["’"] ="\xE2\x80\x99"; // ’ (right single quotation mark) | ||
185 | $htmlEntities["‚"] ="\xE2\x80\x9A"; // ‚ (single low-9 quotation mark) | ||
186 | $htmlEntities["“"] ="\xE2\x80\x9C"; // “ (left double quotation mark) | ||
187 | $htmlEntities["”"] ="\xE2\x80\x9D"; // ” (right double quotation mark) | ||
188 | $htmlEntities["„"] ="\xE2\x80\x9E"; // „ (double low-9 quotation mark) | ||
189 | $htmlEntities["†"] ="\xE2\x80\xA0"; // † (dagger) | ||
190 | $htmlEntities["‡"] ="\xE2\x80\xA1"; // ‡ (double dagger) | ||
191 | $htmlEntities["•"] ="\xE2\x80\xA2"; // • (bullet) | ||
192 | $htmlEntities["…"] ="\xE2\x80\xA6"; // … (horizontal ellipsis) | ||
193 | $htmlEntities["‰"] ="\xE2\x80\xB0"; // ‰ (per mille) | ||
194 | $htmlEntities["′"] ="\xE2\x80\xB2"; // ′ (minutes or prime) | ||
195 | $htmlEntities["″"] ="\xE2\x80\xB3"; // ″ (seconds or Double Prime) | ||
196 | $htmlEntities["‹"] ="\xE2\x80\xB9"; // ‹ (single left angle quotation) | ||
197 | $htmlEntities["›"] ="\xE2\x80\xBA"; // › (single right angle quotation) | ||
198 | $htmlEntities["‾"] ="\xE2\x80\xBE"; // ‾ (overline) | ||
199 | $htmlEntities["⁄"] ="\xE2\x81\x84"; // ⁄ (fraction slash) | ||
200 | $htmlEntities["€"] ="\xE2\x82\xAC"; // € (euro) | ||
201 | $htmlEntities["ℑ"] ="\xE2\x84\x91"; // ℑ (blackletter capital I) | ||
202 | $htmlEntities["℘"] ="\xE2\x84\x98"; // ℘ (script capital P) | ||
203 | $htmlEntities["ℜ"] ="\xE2\x84\x9C"; // ℜ (blackletter capital R) | ||
204 | $htmlEntities["™"] ="\xE2\x84\xA2"; // ™ (trademark) | ||
205 | $htmlEntities["ℵ"] ="\xE2\x84\xB5"; // ℵ (alef) | ||
206 | $htmlEntities["←"] ="\xE2\x86\x90"; // ← (left arrow) | ||
207 | $htmlEntities["↑"] ="\xE2\x86\x91"; // ↑ (up arrow) | ||
208 | $htmlEntities["→"] ="\xE2\x86\x92"; // → (right arrow) | ||
209 | $htmlEntities["↓"] ="\xE2\x86\x93"; // ↓ (down arrow) | ||
210 | $htmlEntities["↔"] ="\xE2\x86\x94"; // ↔ (left right arrow) | ||
211 | $htmlEntities["↵"] ="\xE2\x86\xB5"; // ↵ (carriage return arrow) | ||
212 | $htmlEntities["⇐"] ="\xE2\x87\x90"; // ⇐ (left double arrow) | ||
213 | $htmlEntities["⇑"] ="\xE2\x87\x91"; // ⇑ (up double arrow) | ||
214 | $htmlEntities["⇒"] ="\xE2\x87\x92"; // ⇒ (right double arrow) | ||
215 | $htmlEntities["⇓"] ="\xE2\x87\x93"; // ⇓ (down double arrow) | ||
216 | $htmlEntities["⇔"] ="\xE2\x87\x94"; // ⇔ (left right double arrow) | ||
217 | $htmlEntities["∀"] ="\xE2\x88\x80"; // ∀ (for all) | ||
218 | $htmlEntities["∂"] ="\xE2\x88\x82"; // ∂ (partial differential) | ||
219 | $htmlEntities["∃"] ="\xE2\x88\x83"; // ∃ (there exists) | ||
220 | $htmlEntities["∅"] ="\xE2\x88\x85"; // ∅ (empty set) | ||
221 | $htmlEntities["∇"] ="\xE2\x88\x87"; // ∇ (backward difference) | ||
222 | $htmlEntities["∈"] ="\xE2\x88\x88"; // ∈ (element of) | ||
223 | $htmlEntities["∉"] ="\xE2\x88\x89"; // ∉ (not an element of) | ||
224 | $htmlEntities["∋"] ="\xE2\x88\x8B"; // ∋ (ni = contains as member) | ||
225 | $htmlEntities["∏"] ="\xE2\x88\x8F"; // ∏ (n-ary product) | ||
226 | $htmlEntities["∑"] ="\xE2\x88\x91"; // ∑ (n-ary sumation) | ||
227 | $htmlEntities["−"] ="\xE2\x88\x92"; // − (minus) | ||
228 | $htmlEntities["∗"] ="\xE2\x88\x97"; // ∗ (asterisk operator) | ||
229 | $htmlEntities["√"] ="\xE2\x88\x9A"; // √ (square root) | ||
230 | $htmlEntities["∝"] ="\xE2\x88\x9D"; // ∝ (proportional to) | ||
231 | $htmlEntities["∞"] ="\xE2\x88\x9E"; // ∞ (infinity) | ||
232 | $htmlEntities["∠"] ="\xE2\x88\xA0"; // ∠ (angle) | ||
233 | $htmlEntities["∧"] ="\xE2\x88\xA7"; // ∧ (logical and) | ||
234 | $htmlEntities["∨"] ="\xE2\x88\xA8"; // ∨ (logical or) | ||
235 | $htmlEntities["∩"] ="\xE2\x88\xA9"; // ∩ (intersection) | ||
236 | $htmlEntities["∪"] ="\xE2\x88\xAA"; // ∪ (union) | ||
237 | $htmlEntities["∫"] ="\xE2\x88\xAB"; // ∫ (integral) | ||
238 | $htmlEntities["∴"] ="\xE2\x88\xB4"; // ∴ (therefore) | ||
239 | $htmlEntities["∼"] ="\xE2\x88\xBC"; // ∼ (similar to) | ||
240 | $htmlEntities["≅"] ="\xE2\x89\x85"; // ≅ (congruent to) | ||
241 | $htmlEntities["≈"] ="\xE2\x89\x88"; // ≈ (approximately equal) | ||
242 | $htmlEntities["≠"] ="\xE2\x89\xA0"; // ≠ (not equal) | ||
243 | $htmlEntities["≡"] ="\xE2\x89\xA1"; // ≡ (equivalent) | ||
244 | $htmlEntities["≤"] ="\xE2\x89\xA4"; // ≤ (less or equal) | ||
245 | $htmlEntities["≥"] ="\xE2\x89\xA5"; // ≥ (greater or equal) | ||
246 | $htmlEntities["⊂"] ="\xE2\x8A\x82"; // ⊂ (subset of) | ||
247 | $htmlEntities["⊃"] ="\xE2\x8A\x83"; // ⊃ (superset of) | ||
248 | $htmlEntities["⊄"] ="\xE2\x8A\x84"; // ⊄ (not subset of) | ||
249 | $htmlEntities["⊆"] ="\xE2\x8A\x86"; // ⊆ (subset or equal) | ||
250 | $htmlEntities["⊇"] ="\xE2\x8A\x87"; // ⊇ (superset or equal) | ||
251 | $htmlEntities["⊕"] ="\xE2\x8A\x95"; // ⊕ (circled plus) | ||
// Byte sequences below are the UTF-8 encodings of U+2297, U+22A5 and U+22C5.
// The circled-times entry previously reused the bytes of "superset or equal"
// (U+2287) and the dot-operator entry pointed at U+2305 instead of U+22C5.
$htmlEntities["⊗"] ="\xE2\x8A\x97"; // ⊗ (circled times)
$htmlEntities["⊥"] ="\xE2\x8A\xA5"; // ⊥ (perpendicular)
$htmlEntities["⋅"] ="\xE2\x8B\x85"; // ⋅ (dot operator)
255 | $htmlEntities["⌈"] ="\xE2\x8C\x88"; // ⌈ (left ceiling) | ||
256 | $htmlEntities["⌉"] ="\xE2\x8C\x89"; // ⌉ (right ceiling) | ||
257 | $htmlEntities["⌊"] ="\xE2\x8C\x8A"; // ⌊ (left floor) | ||
258 | $htmlEntities["⌋"] ="\xE2\x8C\x8B"; // ⌋ (right floor) | ||
259 | $htmlEntities["⟨"] ="\xE2\x8C\xA9"; // 〈 (left angle bracket = bra) | ||
260 | $htmlEntities["⟩"] ="\xE2\x8C\xAA"; // 〉 (right angle bracket = ket) | ||
261 | $htmlEntities["◊"] ="\xE2\x97\x8A"; // ◊ (lozenge) | ||
262 | $htmlEntities["♠"] ="\xE2\x99\xA0"; // ♠ (spade) | ||
263 | $htmlEntities["♣"] ="\xE2\x99\xA3"; // ♣ (club) | ||
264 | $htmlEntities["♥"] ="\xE2\x99\xA5"; // ♥ (heart) | ||
265 | $htmlEntities["♦"] ="\xE2\x99\xA6"; // ♦ (diamond) | ||
266 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.NCX.php b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php new file mode 100644 index 00000000..e5da05cd --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php | |||
@@ -0,0 +1,782 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * ePub NCX file structure | ||
4 | * | ||
5 | * @author A. Grandt <php@grandt.com> | ||
6 | * @copyright 2009-2014 A. Grandt | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 3.20 | ||
9 | */ | ||
/**
 * Builds the ePub 2 NCX table of contents (finalize()) and the ePub 3
 * navigation document (finalizeEPub3()) from a tree of NavPoint objects
 * rooted in a NavMap.
 *
 * Text values (uid, title, author, meta entries, reference names) are written
 * into the generated XML verbatim; this class performs no escaping.
 */
class Ncx {
    const _VERSION = 3.20;

    const MIMETYPE = "application/x-dtbncx+xml";

    /** @var string Book version; isEPubVersion2() compares it with EPub::BOOK_VERSION_EPUB2. */
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /** @var NavMap Root of the navigation tree. */
    private $navMap = NULL;
    /** @var string|null Value of the "dtb:uid" meta entry. */
    private $uid = NULL;
    /** @var array List of single-pair arrays (name => content) added by addMetaEntry(). */
    private $meta = array();
    private $docTitle = NULL;
    private $docAuthor = NULL;

    /** @var NavMap|NavPoint Node that addNavPoint() appends new children to. */
    private $currentLevel = NULL;
    /** @var NavMap|NavPoint|null Node remembered by addNavPoint(), backLevel() and rootLevel(); subLevel() descends into it. */
    private $lastLevel = NULL;

    private $languageCode = "en";
    private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT;

    public $chapterList = array();
    public $referencesTitle = "Guide";
    public $referencesClass = "references";
    public $referencesId = "references";
    /** @var array Reference type => target href. */
    public $referencesList = array();
    /** @var array Reference type => display name overriding the descriptive default. */
    public $referencesName = array();
    /** @var array|null Reference type => descriptive name, in output order. */
    public $referencesOrder = NULL;

    /**
     * Class constructor.
     *
     * @param string $uid
     * @param string $docTitle
     * @param string $docAuthor
     * @param string $languageCode
     * @param string $writingDirection
     */
    public function __construct($uid = NULL, $docTitle = NULL, $docAuthor = NULL, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) {
        $this->navMap = new NavMap($writingDirection);
        $this->currentLevel = $this->navMap;
        $this->setUid($uid);
        $this->setDocTitle($docTitle);
        $this->setDocAuthor($docAuthor);
        $this->setLanguageCode($languageCode);
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->bookVersion, $this->navMap, $this->uid, $this->meta);
        unset($this->docTitle, $this->docAuthor, $this->currentLevel, $this->lastLevel);
        unset($this->languageCode, $this->writingDirection, $this->chapterList, $this->referencesTitle);
        unset($this->referencesClass, $this->referencesId, $this->referencesList, $this->referencesName);
        unset($this->referencesOrder);
    }

    /**
     * Set the book version. Non-string values fall back to EPub::BOOK_VERSION_EPUB2.
     *
     * @param string $bookVersion
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     *
     * @return bool TRUE if the book is set to type ePub 2
     */
    public function isEPubVersion2() {
        return $this->bookVersion === EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set the unique id written to the "dtb:uid" meta entry. Non-string values are stored as NULL.
     *
     * @param string $uid
     */
    public function setUid($uid) {
        $this->uid = is_string($uid) ? trim($uid) : NULL;
    }

    /**
     * Set the text of the docTitle element. Non-string values are stored as NULL.
     *
     * @param string $docTitle
     */
    public function setDocTitle($docTitle) {
        $this->docTitle = is_string($docTitle) ? trim($docTitle) : NULL;
    }

    /**
     * Set the text of the docAuthor element. Non-string values are stored as NULL.
     *
     * @param string $docAuthor
     */
    public function setDocAuthor($docAuthor) {
        $this->docAuthor = is_string($docAuthor) ? trim($docAuthor) : NULL;
    }

    /**
     * Set the language code used for the xml:lang / lang attributes. Non-string values fall back to "en".
     *
     * @param string $languageCode
     */
    public function setLanguageCode($languageCode) {
        $this->languageCode = is_string($languageCode) ? trim($languageCode) : "en";
    }

    /**
     * Set the value of the dir attribute. Non-string values fall back to EPub::DIRECTION_LEFT_TO_RIGHT.
     *
     * @param string $writingDirection
     */
    public function setWritingDirection($writingDirection) {
        $this->writingDirection = is_string($writingDirection) ? trim($writingDirection) : EPub::DIRECTION_LEFT_TO_RIGHT;
    }

    /**
     * Replace the navigation tree. Anything that is not exactly a NavMap instance is ignored.
     * The current level is not changed by this call.
     *
     * @param NavMap $navMap
     */
    public function setNavMap($navMap) {
        if ($navMap != NULL && is_object($navMap) && get_class($navMap) === "NavMap") {
            $this->navMap = $navMap;
        }
    }

    /**
     * Add one chapter level.
     *
     * Subsequent chapters will be added to this level.
     * A NavPoint is only created when both $navTitle and $navClass are given;
     * either way the current level then moves to the last remembered level, if any.
     *
     * @param string $navTitle
     * @param string $navId
     * @param string $navClass
     * @param bool   $isNavHidden
     * @param string $writingDirection
     * @return NavPoint|bool The new NavPoint, or FALSE when none was created.
     */
    public function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
        $navPoint = FALSE;
        if (isset($navTitle) && isset($navClass)) {
            $navPoint = new NavPoint($navTitle, NULL, $navId, $navClass, $isNavHidden, $writingDirection);
            $this->addNavPoint($navPoint);
        }
        if ($this->lastLevel !== NULL) {
            $this->currentLevel = $this->lastLevel;
        }
        return $navPoint;
    }

    /**
     * Step back one chapter level.
     *
     * Subsequent chapters will be added to this chapters parent level.
     */
    public function backLevel() {
        $this->lastLevel = $this->currentLevel;
        $this->currentLevel = $this->currentLevel->getParent();
    }

    /**
     * Step back to the root level.
     *
     * Subsequent chapters will be added to the root NavMap.
     */
    public function rootLevel() {
        $this->lastLevel = $this->currentLevel;
        $this->currentLevel = $this->navMap;
    }

    /**
     * Step back to the given level.
     * Useful for returning to a previous level from deep within the structure.
     * Values below 2 will have the same effect as rootLevel()
     *
     * @param int $newLevel
     */
    public function setCurrentLevel($newLevel) {
        if ($newLevel <= 1) {
            $this->rootLevel();
        } else {
            while ($this->currentLevel->getLevel() > $newLevel) {
                $this->backLevel();
            }
        }
    }

    /**
     * Get current level count.
     * The indentation of the current structure point.
     *
     * @return int current level count
     */
    public function getCurrentLevel() {
        return $this->currentLevel->getLevel();
    }

    /**
     * Add child NavPoints to current level.
     *
     * The node returned by the current level's addNavPoint() is remembered as
     * the last level, so a following subLevel() call descends into it.
     *
     * @param NavPoint $navPoint
     */
    public function addNavPoint($navPoint) {
        $this->lastLevel = $this->currentLevel->addNavPoint($navPoint);
    }

    /**
     * Get the root of the navigation tree.
     *
     * @return NavMap
     */
    public function getNavMap() {
        return $this->navMap;
    }

    /**
     * Add an extra meta element to the NCX head.
     *
     * Both values are trimmed. The entry is dropped when either value is not a
     * string or compares loosely equal to NULL (for example "" or "0").
     *
     * @param string $name
     * @param string $content
     */
    public function addMetaEntry($name, $content) {
        $name = is_string($name) ? trim($name) : NULL;
        $content = is_string($content) ? trim($content) : NULL;

        if ($name != NULL && $content != NULL) {
            $this->meta[] = array($name => $content);
        }
    }

    /**
     * Build the NCX document.
     *
     * @return string The complete NCX XML.
     */
    public function finalize() {
        // Must run before the head is written: NavMap::finalize() computes the
        // depth that getNavLevels() reports for "dtb:depth".
        $nav = $this->navMap->finalize();

        $ncx = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
        if ($this->isEPubVersion2()) {
            $ncx .= "<!DOCTYPE ncx PUBLIC \"-//NISO//DTD ncx 2005-1//EN\"\n"
                . " \"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd\">\n";
        }
        $ncx .= "<ncx xmlns=\"http://www.daisy.org/z3986/2005/ncx/\" version=\"2005-1\" xml:lang=\"" . $this->languageCode . "\" dir=\"" . $this->writingDirection . "\">\n"
            . "\t<head>\n"
            . "\t\t<meta name=\"dtb:uid\" content=\"" . $this->uid . "\" />\n"
            . "\t\t<meta name=\"dtb:depth\" content=\"" . $this->navMap->getNavLevels() . "\" />\n"
            . "\t\t<meta name=\"dtb:totalPageCount\" content=\"0\" />\n"
            . "\t\t<meta name=\"dtb:maxPageNumber\" content=\"0\" />\n";

        foreach ($this->meta as $metaEntry) {
            // Every entry is a single-pair array(name => content) stored by
            // addMetaEntry(); foreach replaces each(), which PHP 8 removed.
            foreach ($metaEntry as $name => $content) {
                $ncx .= "\t\t<meta name=\"" . $name . "\" content=\"" . $content . "\" />\n";
            }
        }

        $ncx .= "\t</head>\n\n\t<docTitle>\n\t\t<text>"
            . $this->docTitle
            . "</text>\n\t</docTitle>\n\n\t<docAuthor>\n\t\t<text>"
            . $this->docAuthor
            . "</text>\n\t</docAuthor>\n\n"
            . $nav;

        return $ncx . "</ncx>\n";
    }

    /**
     * Build the ePub 3 navigation document (XHTML table of contents plus landmarks).
     *
     * @param string $title       Heading shown above the table of contents.
     * @param string $cssFileName Optional stylesheet href; no link element is written when NULL.
     * @return string
     */
    public function finalizeEPub3($title = "Table of Contents", $cssFileName = NULL) {
        $end = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\"\n"
            . " xmlns:epub=\"http://www.idpf.org/2007/ops\"\n"
            . " xml:lang=\"" . $this->languageCode . "\" lang=\"" . $this->languageCode . "\" dir=\"" . $this->writingDirection . "\">\n"
            . "\t<head>\n"
            . "\t\t<title>" . $this->docTitle . "</title>\n"
            . "\t\t<meta http-equiv=\"default-style\" content=\"text/html; charset=utf-8\"/>\n";
        if ($cssFileName !== NULL) {
            $end .= "\t\t<link rel=\"stylesheet\" href=\"" . $cssFileName . "\" type=\"text/css\"/>\n";
        }
        $end .= "\t</head>\n"
            . "\t<body epub:type=\"frontmatter toc\">\n"
            . "\t\t<header>\n"
            . "\t\t\t<h1>" . $title . "</h1>\n"
            . "\t\t</header>\n"
            . $this->navMap->finalizeEPub3()
            . $this->finalizeEPub3Landmarks()
            . "\t</body>\n"
            . "</html>\n";

        return $end;
    }

    /**
     * Build the references for the ePub 2 toc.
     * These are merely reference pages added to the end of the navMap though.
     *
     * Does nothing when the references list is empty. Otherwise a new sub level
     * is opened at the root and one NavPoint is added for every entry of
     * $referencesOrder that also has a target in $referencesList.
     *
     * @return void
     */
    public function finalizeReferences() {
        if (isset($this->referencesList) && sizeof($this->referencesList) > 0) {
            $this->rootLevel();
            $this->subLevel($this->referencesTitle, $this->referencesId, $this->referencesClass);
            $refId = 1;
            // $referencesOrder defaults to NULL. foreach always starts at the
            // first entry, unlike the former each() loop, which shared the
            // array's internal pointer with finalizeEPub3Landmarks().
            if (is_array($this->referencesOrder)) {
                foreach ($this->referencesOrder as $item => $descriptive) {
                    if (array_key_exists($item, $this->referencesList)) {
                        $name = (empty($this->referencesName[$item]) ? $descriptive : $this->referencesName[$item]);
                        $navPoint = new NavPoint($name, $this->referencesList[$item], "ref-" . $refId++);
                        $this->addNavPoint($navPoint);
                    }
                }
            }
        }
    }

    /**
     * Build the landmarks for the ePub 3 toc.
     *
     * @return string The landmarks nav element, or "" when there is nothing to list.
     */
    public function finalizeEPub3Landmarks() {
        if (!(isset($this->referencesList) && sizeof($this->referencesList) > 0)) {
            return "";
        }

        $li = "";
        // Iterate from the first entry on every call; $referencesOrder may
        // still be NULL when no order was supplied.
        if (is_array($this->referencesOrder)) {
            foreach ($this->referencesOrder as $item => $descriptive) {
                if (array_key_exists($item, $this->referencesList)) {
                    $li .= "\t\t\t\t\t<li><a epub:type=\""
                        . $item
                        . "\" href=\"" . $this->referencesList[$item] . "\">"
                        . (empty($this->referencesName[$item]) ? $descriptive : $this->referencesName[$item])
                        . "</a></li>\n";
                }
            }
        }
        if (empty($li)) {
            return "";
        }

        return "\t\t\t<nav epub:type=\"landmarks\">\n"
            . "\t\t\t\t<h2"
            . ($this->writingDirection === EPub::DIRECTION_RIGHT_TO_LEFT ? " dir=\"rtl\"" : "")
            . ">" . $this->referencesTitle . "</h2>\n"
            . "\t\t\t\t<ol>\n"
            . $li
            . "\t\t\t\t</ol>\n"
            . "\t\t\t</nav>\n";
    }
}
378 | |||
/**
 * ePub NavMap class
 *
 * Root container of the navigation tree: holds the top level NavPoints and
 * renders them as an NCX navMap (finalize()) or as an ePub 3 toc nav element
 * (finalizeEPub3()).
 */
class NavMap {
    const _VERSION = 3.00;

    /** @var array Top level NavPoint objects, in insertion order. */
    private $navPoints = array();
    /** @var int Depth recorded by the last finalize() / finalizeEPub3() run. */
    private $navLevels = 0;
    /** @var string|null Direction handed down to children that have none of their own. */
    private $writingDirection = NULL;

    /**
     * Class constructor.
     *
     * @param string $writingDirection
     */
    public function __construct($writingDirection = NULL) {
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->navPoints, $this->navLevels, $this->writingDirection);
    }

    /**
     * Set the writing direction of this NavMap. Non-string values are stored as NULL.
     *
     * @param string $writingDirection
     */
    public function setWritingDirection($writingDirection) {
        if (isset($writingDirection) && is_string($writingDirection)) {
            $this->writingDirection = trim($writingDirection);
        } else {
            $this->writingDirection = NULL;
        }
    }

    /**
     * @return string|null The writing direction of this NavMap.
     */
    public function getWritingDirection() {
        return $this->writingDirection;
    }

    /**
     * Add a navPoint to the root of the NavMap.
     *
     * @param NavPoint $navPoint
     * @return NavPoint|NavMap The added NavPoint, or this NavMap when the
     *                         argument is not exactly a NavPoint instance.
     */
    public function addNavPoint($navPoint) {
        $isNavPoint = $navPoint != NULL && is_object($navPoint) && get_class($navPoint) === "NavPoint";
        if (!$isNavPoint) {
            return $this;
        }

        $navPoint->setParent($this);
        // A child without a direction of its own takes over the map's direction.
        if ($navPoint->getWritingDirection() == NULL) {
            $navPoint->setWritingDirection($this->writingDirection);
        }
        $this->navPoints[] = $navPoint;

        return $navPoint;
    }

    /**
     * The final max depth for the "dtb:depth" meta attribute
     * Only available after finalize have been called.
     *
     * @return number
     */
    public function getNavLevels() {
        return 1 + $this->navLevels;
    }

    /**
     * @return int Always 1: the NavMap is the document root.
     */
    public function getLevel() {
        return 1;
    }

    /**
     * @return NavMap The NavMap itself, as the root has no parent.
     */
    public function getParent() {
        return $this;
    }

    /**
     * Render the NCX navMap element. The depth for the "dtb:depth" meta
     * attribute can be read with getNavLevels() afterwards.
     *
     * @return string
     */
    public function finalize() {
        $order = 0;
        $xml = "\t<navMap>\n";
        $this->navLevels = 0;

        if (sizeof($this->navPoints) > 0) {
            $this->navLevels = 1;
            foreach ($this->navPoints as $child) {
                $this->navLevels = max($this->navLevels, $child->finalize($xml, $order, 0));
            }
        }

        return $xml . "\t</navMap>\n";
    }

    /**
     * Render the ePub 3 toc nav element. The depth can be read with
     * getNavLevels() afterwards.
     *
     * @return string
     */
    public function finalizeEPub3() {
        $order = 0;
        $xhtml = "\t\t<nav epub:type=\"toc\" id=\"toc\">\n";
        $this->navLevels = 0;

        if (sizeof($this->navPoints) > 0) {
            $this->navLevels = 1;

            $xhtml .= "\t\t\t<ol epub:type=\"list\">\n";
            foreach ($this->navPoints as $child) {
                $this->navLevels = max($this->navLevels, $child->finalizeEPub3($xhtml, $order, 0));
            }
            $xhtml .= "\t\t\t</ol>\n";
        }

        return $xhtml . "\t\t</nav>\n";
    }
}
504 | |||
/**
 * ePub NavPoint class
 *
 * One entry of the navigation tree. A NavPoint may hold child NavPoints and
 * renders itself, children included, into the NCX (finalize()) or the ePub 3
 * navigation list (finalizeEPub3()). Label, id and content src are written
 * into the output verbatim; no escaping is done here.
 */
class NavPoint {
    const _VERSION = 3.00;

    private $label = NULL;
    private $contentSrc = NULL;
    private $id = NULL;
    private $navClass = NULL;
    private $isNavHidden = FALSE;
    /** @var array Child NavPoint objects, in insertion order. */
    private $navPoints = array();
    /** @var NavPoint|NavMap|null Set by setParent(). */
    private $parent = NULL;
    /**
     * Declared explicitly: it was previously created on the fly by
     * setWritingDirection(), which PHP 8.2 reports as a deprecated dynamic property.
     *
     * @var string|null
     */
    private $writingDirection = NULL;

    /**
     * Class constructor.
     *
     * Only $label is required by the signature. When no id is given, one is
     * generated while the navigation is finalized.
     *
     * @param string $label
     * @param string $contentSrc
     * @param string $id
     * @param string $navClass
     * @param bool $isNavHidden
     * @param string $writingDirection
     */
    public function __construct($label, $contentSrc = NULL, $id = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
        $this->setLabel($label);
        $this->setContentSrc($contentSrc);
        $this->setId($id);
        $this->setNavClass($navClass);
        $this->setNavHidden($isNavHidden);
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->label, $this->contentSrc, $this->id, $this->navClass);
        unset($this->isNavHidden, $this->navPoints, $this->parent, $this->writingDirection);
    }

    /**
     * Set the Text label for the NavPoint.
     *
     * The label is mandatory. Non-string values are stored as NULL.
     *
     * @param string $label
     */
    public function setLabel($label) {
        $this->label = is_string($label) ? trim($label) : NULL;
    }

    /**
     * Get the Text label for the NavPoint.
     *
     * @return string Label
     */
    public function getLabel() {
        return $this->label;
    }

    /**
     * Set the src reference for the NavPoint.
     *
     * The src is mandatory for ePub 2. Non-string values are stored as NULL.
     *
     * @param string $contentSrc
     */
    public function setContentSrc($contentSrc) {
        $this->contentSrc = isset($contentSrc) && is_string($contentSrc) ? trim($contentSrc) : NULL;
    }

    /**
     * Get the src reference for the NavPoint.
     *
     * @return string content src url.
     */
    public function getContentSrc() {
        return $this->contentSrc;
    }

    /**
     * Set the parent for this NavPoint.
     *
     * Anything that is not exactly a NavPoint or NavMap instance is ignored.
     *
     * @param NavPoint|NavMap $parent
     */
    public function setParent($parent) {
        if ($parent != NULL && is_object($parent) &&
                (get_class($parent) === "NavPoint" || get_class($parent) === "NavMap")) {
            $this->parent = $parent;
        }
    }

    /**
     * Get the parent to this NavPoint.
     *
     * @return NavPoint|NavMap|null NavPoint, or NavMap if the parent is the root; NULL when no parent was set.
     */
    public function getParent() {
        return $this->parent;
    }

    /**
     * Get the current level. 1 = document root.
     *
     * @return int level
     */
    public function getLevel() {
        return $this->parent === NULL ? 1 : $this->parent->getLevel() + 1;
    }

    /**
     * Set the id for the NavPoint.
     *
     * The id must be unique, and is mandatory. Non-string values are stored as
     * NULL, in which case an id is generated during finalization.
     *
     * @param string $id
     */
    public function setId($id) {
        $this->id = is_string($id) ? trim($id) : NULL;
    }

    /**
     * Set the class to be used for this NavPoint.
     *
     * @param string $navClass
     */
    public function setNavClass($navClass) {
        $this->navClass = isset($navClass) && is_string($navClass) ? trim($navClass) : NULL;
    }

    /**
     * Mark this NavPoint as hidden. Only the boolean TRUE hides it; a hidden
     * NavPoint and its children are left out by finalize().
     *
     * @param bool $isNavHidden
     */
    public function setNavHidden($isNavHidden) {
        $this->isNavHidden = $isNavHidden === TRUE;
    }

    /**
     * Set the writing direction to be used for this NavPoint.
     *
     * @param string $writingDirection
     */
    public function setWritingDirection($writingDirection) {
        $this->writingDirection = isset($writingDirection) && is_string($writingDirection) ? trim($writingDirection) : NULL;
    }

    /**
     * @return string|null The writing direction of this NavPoint.
     */
    public function getWritingDirection() {
        return $this->writingDirection;
    }

    /**
     * Add child NavPoints for multi level NavMaps.
     *
     * @param NavPoint $navPoint
     * @return NavPoint The added child, or this NavPoint when the argument is
     *                  not exactly a NavPoint instance.
     */
    public function addNavPoint($navPoint) {
        if ($navPoint != NULL && is_object($navPoint) && get_class($navPoint) === "NavPoint") {
            $navPoint->setParent($this);
            // A child without a direction of its own takes over this one's.
            if ($navPoint->getWritingDirection() == NULL) {
                $navPoint->setWritingDirection($this->writingDirection);
            }
            $this->navPoints[] = $navPoint;
            return $navPoint;
        }
        return $this;
    }

    /**
     * Append this NavPoint and its children to an NCX navMap.
     *
     * A hidden NavPoint writes nothing. A NavPoint without a content src writes
     * no element of its own, but its children are still written.
     *
     * @param string $nav       Output buffer, appended to by reference.
     * @param int    $playOrder Running play order, incremented by reference for every written navPoint.
     * @param int    $level     Nesting depth, used for indentation.
     * @return int The deepest level reached below and including this NavPoint.
     */
    public function finalize(&$nav = "", &$playOrder = 0, $level = 0) {
        $maxLevel = $level;
        $levelAdjust = 0;

        if ($this->isNavHidden) {
            return $maxLevel;
        }

        $indent = str_repeat("\t", $level);

        if (isset($this->contentSrc)) {
            $playOrder++;

            if ($this->id == NULL) {
                $this->id = "navpoint-" . $playOrder;
            }
            $nav .= $indent . "\t\t<navPoint id=\"" . $this->id . "\" playOrder=\"" . $playOrder . "\">\n"
                . $indent . "\t\t\t<navLabel>\n"
                . $indent . "\t\t\t\t<text>" . $this->label . "</text>\n"
                . $indent . "\t\t\t</navLabel>\n"
                . $indent . "\t\t\t<content src=\"" . $this->contentSrc . "\" />\n";
        } else {
            $levelAdjust++;
        }

        if (sizeof($this->navPoints) > 0) {
            $maxLevel++;
            foreach ($this->navPoints as $navPoint) {
                $retLevel = $navPoint->finalize($nav, $playOrder, ($level + 1 + $levelAdjust));
                if ($retLevel > $maxLevel) {
                    $maxLevel = $retLevel;
                }
            }
        }

        if (isset($this->contentSrc)) {
            $nav .= $indent . "\t\t</navPoint>\n";
        }

        return $maxLevel;
    }

    /**
     * Append this NavPoint and its children to an ePub 3 navigation list.
     *
     * @param string $nav            Output buffer, appended to by reference.
     * @param int    $playOrder      Only used to build a missing id; not modified here.
     * @param int    $level          Nesting depth, used for indentation.
     * @param string $subLevelClass  Optional class attribute for nested ol elements.
     * @param bool   $subLevelHidden Whether nested ol elements get hidden="hidden".
     * @return int The deepest level reached below and including this NavPoint.
     */
    public function finalizeEPub3(&$nav = "", &$playOrder = 0, $level = 0, $subLevelClass = NULL, $subLevelHidden = FALSE) {
        $maxLevel = $level;

        // NOTE(review): $playOrder is never advanced in this method, so every
        // NavPoint that reaches this point without an id receives the same
        // generated value. Left unchanged because ids may already have been
        // assigned by finalize(); confirm before altering the scheme.
        if ($this->id == NULL) {
            $this->id = "navpoint-" . $playOrder;
        }
        $indent = str_repeat("\t", $level) . "\t\t\t\t";

        $nav .= $indent . "<li id=\"" . $this->id . "\"";
        if (isset($this->writingDirection)) {
            $nav .= " dir=\"" . $this->writingDirection . "\"";
        }
        $nav .= ">\n";

        if (isset($this->contentSrc)) {
            $nav .= $indent . "\t<a href=\"" . $this->contentSrc . "\">" . $this->label . "</a>\n";
        } else {
            $nav .= $indent . "\t<span>" . $this->label . "</span>\n";
        }

        if (sizeof($this->navPoints) > 0) {
            $maxLevel++;

            $nav .= $indent . "\t<ol epub:type=\"list\"";
            if (isset($subLevelClass)) {
                $nav .= " class=\"" . $subLevelClass . "\"";
            }
            if ($subLevelHidden) {
                $nav .= " hidden=\"hidden\"";
            }
            $nav .= ">\n";

            foreach ($this->navPoints as $navPoint) {
                $retLevel = $navPoint->finalizeEPub3($nav, $playOrder, ($level + 2), $subLevelClass, $subLevelHidden);
                if ($retLevel > $maxLevel) {
                    $maxLevel = $retLevel;
                }
            }
            $nav .= $indent . "\t</ol>\n";
        }

        $nav .= $indent . "</li>\n";

        return $maxLevel;
    }
}
782 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.OPF.php b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php new file mode 100644 index 00000000..803a2108 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php | |||
@@ -0,0 +1,1226 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * ePub OPF file structure | ||
4 | * | ||
5 | * @author A. Grandt <php@grandt.com> | ||
6 | * @copyright 2009-2014 A. Grandt | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 3.20 | ||
9 | */ | ||
class Opf {
    const _VERSION = 3.20;

    /*
     * Core media types.
     * These are the only MIME types every ePub reader is guaranteed to understand.
     * Any other type must declare a fallback whose fallback chain ends in one of these.
     */
    const TYPE_GIF = "image/gif";
    const TYPE_JPEG = "image/jpeg";
    const TYPE_PNG = "image/png";
    const TYPE_SVG = "image/svg+xml";
    const TYPE_XHTML = "application/xhtml+xml";
    const TYPE_DTBOOK = "application/x-dtbook+xml";
    const TYPE_CSS = "text/css";
    const TYPE_XML = "application/xml";
    const TYPE_OEB1_DOC = "text/x-oeb1-document"; // Deprecated
    const TYPE_OEB1_CSS = "text/x-oeb1-css"; // Deprecated
    const TYPE_NCX = "application/x-dtbncx+xml";

    private $bookVersion = EPub::BOOK_VERSION_EPUB2;
    private $ident = "BookId";

    public $date = NULL;
    public $metadata = NULL;
    public $manifest = NULL;
    public $spine = NULL;
    public $guide = NULL;

    /**
     * Creates the package with empty metadata, manifest, spine and guide sections.
     *
     * @param string $ident       value of the package's unique-identifier attribute
     * @param string $bookVersion one of the EPub::BOOK_VERSION_* constants
     */
    public function __construct($ident = "BookId", $bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $this->setIdent($ident);
        $this->setVersion($bookVersion);

        $this->metadata = new Metadata();
        $this->manifest = new Manifest();
        $this->spine = new Spine();
        $this->guide = new Guide();
    }

    /**
     * Class destructor; drops every property.
     *
     * @return void
     */
    public function __destruct() {
        unset($this->bookVersion, $this->ident, $this->date);
        unset($this->metadata, $this->manifest, $this->spine, $this->guide);
    }

    /**
     * Sets the book version written to the package's version attribute.
     * Non-string input falls back to EPub::BOOK_VERSION_EPUB2.
     *
     * @param string $bookVersion
     */
    public function setVersion($bookVersion) {
        if (is_string($bookVersion)) {
            $this->bookVersion = trim($bookVersion);
            return;
        }
        $this->bookVersion = EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Tells whether the configured book version is EPub::BOOK_VERSION_EPUB2.
     *
     * @return bool
     */
    public function isEPubVersion2() {
        return EPub::BOOK_VERSION_EPUB2 === $this->bookVersion;
    }

    /**
     * Sets the unique-identifier name of the package.
     * Non-string input falls back to "BookId".
     *
     * @param string $ident
     */
    public function setIdent($ident = "BookId") {
        if (is_string($ident)) {
            $this->ident = trim($ident);
            return;
        }
        $this->ident = "BookId";
    }

    /**
     * Renders the complete OPF document: XML declaration, <package> element and
     * the metadata, manifest and spine sections. The guide section is included
     * only when it holds at least one reference.
     *
     * @return string
     */
    public function finalize() {
        $parts = array(
            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n",
            "<package xmlns=\"http://www.idpf.org/2007/opf\" unique-identifier=\"" . $this->ident . "\" version=\"" . $this->bookVersion . "\">\n",
            $this->metadata->finalize($this->bookVersion, $this->date),
            $this->manifest->finalize($this->bookVersion),
            $this->spine->finalize(),
        );

        if ($this->guide->length() > 0) {
            $parts[] = $this->guide->finalize();
        }

        $parts[] = "</package>\n";

        return implode("", $parts);
    }

    // Convenience functions:

    /**
     * Adds the title, language and identifier Dublin Core entries.
     * The identifier entry gets an "id" attribute equal to the package ident
     * and an OPF "scheme" attribute.
     *
     * @param string $title
     * @param string $language
     * @param string $identifier
     * @param string $identifierScheme
     */
    public function initialize($title, $language, $identifier, $identifierScheme) {
        $this->addDCMeta("title", $title);
        $this->addDCMeta("language", $language);

        $idEntry = new DublinCore("identifier", $identifier);
        $idEntry->addAttr("id", $this->ident);
        $idEntry->addOpfAttr("scheme", $identifierScheme);
        $this->metadata->addDublinCore($idEntry);
    }

    /**
     * Adds an item to the manifest.
     *
     * @param string $id
     * @param string $href
     * @param string $mediaType
     * @param string $properties
     */
    public function addItem($id, $href, $mediaType, $properties = NULL) {
        $entry = new Item($id, $href, $mediaType, $properties);
        $this->manifest->addItem($entry);
    }

    /**
     * Adds an item reference to the spine.
     *
     * @param string $idref
     * @param bool   $linear
     */
    public function addItemRef($idref, $linear = TRUE) {
        $entry = new Itemref($idref, $linear);
        $this->spine->addItemref($entry);
    }

    /**
     * Adds a reference to the guide.
     *
     * @param string $type
     * @param string $title
     * @param string $href
     */
    public function addReference($type, $title, $href) {
        $entry = new Reference($type, $title, $href);
        $this->guide->addReference($entry);
    }

    /**
     * Adds a Dublin Core entry to the metadata.
     *
     * @param string $name
     * @param string $value
     */
    public function addDCMeta($name, $value) {
        $entry = new DublinCore($name, $value);
        $this->metadata->addDublinCore($entry);
    }

    /**
     * Adds a name/content meta entry to the metadata.
     *
     * @param string $name
     * @param string $content
     */
    public function addMeta($name, $content) {
        $this->metadata->addMeta($name, $content);
    }

    /**
     * Adds a Dublin Core "creator" entry.
     *
     * @param string $name
     * @param string $fileAs optional OPF "file-as" attribute
     * @param string $role   optional OPF "role" attribute; use the MarcCode constants
     */
    public function addCreator($name, $fileAs = NULL, $role = NULL) {
        $this->addPerson(DublinCore::CREATOR, $name, $fileAs, $role);
    }

    /**
     * Adds a Dublin Core "contributor" entry.
     *
     * @param string $name
     * @param string $fileAs optional OPF "file-as" attribute
     * @param string $role   optional OPF "role" attribute; use the MarcCode constants
     */
    public function addColaborator($name, $fileAs = NULL, $role = NULL) {
        $this->addPerson(DublinCore::CONTRIBUTOR, $name, $fileAs, $role);
    }

    /**
     * Shared implementation of addCreator() and addColaborator(): builds the
     * Dublin Core entry, attaches the OPF attributes that were supplied and
     * stores it in the metadata.
     *
     * @param string $dcName
     * @param string $name
     * @param string $fileAs
     * @param string $role
     */
    private function addPerson($dcName, $name, $fileAs, $role) {
        $entry = new DublinCore($dcName, trim($name));

        $optional = array("file-as" => $fileAs, "role" => $role);
        foreach ($optional as $attrName => $attrValue) {
            if ($attrValue !== NULL) {
                $entry->addOpfAttr($attrName, trim($attrValue));
            }
        }

        $this->metadata->addDublinCore($entry);
    }
}
228 | |||
/**
 * ePub OPF Metadata structures
 *
 * Collects Dublin Core entries and name/content meta entries and renders
 * them as the <metadata> section of the OPF document.
 */
class Metadata {
    const _VERSION = 3.00;

    private $dc = array();
    private $meta = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->dc, $this->meta);
    }

    /**
     * Adds a Dublin Core entry. Anything that is not exactly a DublinCore
     * instance is ignored.
     *
     * @param DublinCore $dc
     */
    function addDublinCore($dc) {
        if ($dc != NULL && is_object($dc) && get_class($dc) === "DublinCore") {
            $this->dc[] = $dc;
        }
    }

    /**
     * Adds a name/content meta entry. Both values are trimmed; the entry is
     * ignored unless both are strings.
     *
     * @param string $name
     * @param string $content
     */
    function addMeta($name, $content) {
        // Previously a non-string name slipped through and produced an entry
        // with an empty name and unvalidated content.
        if (!is_string($name) || !is_string($content)) {
            return;
        }
        $this->meta[] = array (trim($name) => trim($content));
    }

    /**
     * Renders the <metadata> section.
     *
     * For EPub::BOOK_VERSION_EPUB2 the opf namespace is declared on the
     * element. For any other version a dcterms:modified meta element is
     * emitted, using $date or the current time when $date is not set.
     *
     * @param string $bookVersion
     * @param int $date Unix timestamp used for dcterms:modified
     * @return string
     */
    function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2, $date = NULL) {
        $metadata = "\t<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
        if ($bookVersion === EPub::BOOK_VERSION_EPUB2) {
            $metadata .= "\t\txmlns:opf=\"http://www.idpf.org/2007/opf\"\n\t\txmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
        } else {
            $metadata .= "\t\txmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
            if (!isset($date)) {
                $date = time();
            }
            $metadata .= "\t\t<meta property=\"dcterms:modified\">" . gmdate("Y-m-d\TH:i:s\Z", $date) . "</meta>\n";
        }

        foreach ($this->dc as $dc) {
            $metadata .= $dc->finalize($bookVersion);
        }

        // Each stored entry is a single name => content pair. foreach replaces
        // the each() call, which no longer exists in PHP 8.
        foreach ($this->meta as $data) {
            foreach ($data as $name => $content) {
                $metadata .= "\t\t<meta name=\"" . $name . "\" content=\"" . $content . "\" />\n";
            }
        }

        return $metadata . "\t</metadata>\n";
    }
}
314 | |||
/**
 * ePub OPF Dublin Core (dc:) Metadata structures
 *
 * One <dc:NAME> element with its plain attributes and its opf: attributes.
 */
class DublinCore {
    const _VERSION = 3.00;

    const CONTRIBUTOR = "contributor";
    const COVERAGE = "coverage";
    const CREATOR = "creator";
    const DATE = "date";
    const DESCRIPTION = "description";
    const FORMAT = "format";
    const IDENTIFIER = "identifier";
    const LANGUAGE = "language";
    const PUBLISHER = "publisher";
    const RELATION = "relation";
    const RIGHTS = "rights";
    const SOURCE = "source";
    const SUBJECT = "subject";
    const TITLE = "title";
    const TYPE = "type";

    private $dcName = NULL;
    private $dcValue = NULL;
    private $attr = array();
    private $opfAttr = array();

    /**
     * Class constructor.
     *
     * @param string $name element name without the "dc:" prefix
     * @param string $value element content
     */
    function __construct($name, $value) {
        $this->setDc($name, $value);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->dcName, $this->dcValue, $this->attr, $this->opfAttr);
    }

    /**
     * Sets the element name and value. The name must be a string and the
     * value must be set; otherwise both are cleared.
     *
     * @param string $name
     * @param string $value stored as a string
     */
    function setDc($name, $value) {
        // Clear first so an invalid call cannot leave a value from an
        // earlier call behind.
        $this->dcName = NULL;
        $this->dcValue = NULL;

        if (is_string($name) && isset($value)) {
            $this->dcName = trim($name);
            $this->dcValue = (string)$value;
        }
    }

    /**
     * Adds (or replaces) a plain attribute. Ignored unless both name and
     * value are strings; both are trimmed.
     *
     * @param string $attrName
     * @param string $attrValue
     */
    function addAttr($attrName, $attrValue) {
        // A non-string name used to be stored under a NULL key.
        if (!is_string($attrName) || !is_string($attrValue)) {
            return;
        }
        $this->attr[trim($attrName)] = trim($attrValue);
    }

    /**
     * Adds (or replaces) an attribute that is rendered with the "opf:" prefix.
     * Ignored unless both name and value are strings; both are trimmed.
     *
     * @param string $opfAttrName
     * @param string $opfAttrValue
     */
    function addOpfAttr($opfAttrName, $opfAttrValue) {
        // A non-string name used to be stored under a NULL key.
        if (!is_string($opfAttrName) || !is_string($opfAttrValue)) {
            return;
        }
        $this->opfAttr[trim($opfAttrName)] = trim($opfAttrValue);
    }


    /**
     * Renders the element. The opf: attributes are written only when
     * $bookVersion is EPub::BOOK_VERSION_EPUB2.
     *
     * @param string $bookVersion
     * @return string
     */
    function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $dc = "\t\t<dc:" . $this->dcName;

        // foreach instead of while/each(): each() no longer exists in PHP 8
        // and it advanced the array pointer without resetting it, so a second
        // finalize() call lost every attribute.
        foreach ($this->attr as $name => $content) {
            $dc .= " " . $name . "=\"" . $content . "\"";
        }

        if ($bookVersion === EPub::BOOK_VERSION_EPUB2) {
            foreach ($this->opfAttr as $name => $content) {
                $dc .= " opf:" . $name . "=\"" . $content . "\"";
            }
        }

        return $dc . ">" . $this->dcValue . "</dc:" . $this->dcName . ">\n";
    }
}
435 | |||
/**
 * ePub OPF Manifest structure
 *
 * Ordered list of Item objects rendered as the <manifest> section.
 */
class Manifest {
    const _VERSION = 3.00;

    private $items = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->items);
    }

    /**
     * Appends an item. Anything that is not exactly an Item instance is ignored.
     *
     * @param Item $item
     */
    public function addItem($item) {
        if (!is_object($item)) {
            return;
        }
        if (get_class($item) !== "Item") {
            return;
        }
        $this->items[] = $item;
    }

    /**
     * Renders the <manifest> section, one line per item in insertion order.
     *
     * @param string $bookVersion passed on to every item
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $rendered = array("\n\t<manifest>\n");
        foreach ($this->items as $entry) {
            $rendered[] = $entry->finalize($bookVersion);
        }
        $rendered[] = "\t</manifest>\n";

        return implode("", $rendered);
    }
}
486 | |||
/**
 * ePub OPF Item structure
 *
 * One <item> element of the manifest.
 */
class Item {
    const _VERSION = 3.00;

    private $id = NULL;
    private $href = NULL;
    private $mediaType = NULL;
    private $properties = NULL;
    private $requiredNamespace = NULL;
    private $requiredModules = NULL;
    private $fallback = NULL;
    private $fallbackStyle = NULL;

    /**
     * Class constructor.
     *
     * @param string $id
     * @param string $href
     * @param string $mediaType
     * @param string $properties
     */
    public function __construct($id, $href, $mediaType, $properties = NULL) {
        $this->setId($id);
        $this->setHref($href);
        $this->setMediaType($mediaType);
        $this->setProperties($properties);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->id, $this->href, $this->mediaType, $this->properties);
        unset($this->requiredNamespace, $this->requiredModules, $this->fallback, $this->fallbackStyle);
    }

    /**
     * Normalisation shared by every setter: strings are trimmed, anything
     * else becomes NULL.
     *
     * @param mixed $value
     * @return string|null
     */
    private static function normalize($value) {
        if (!is_string($value)) {
            return NULL;
        }
        return trim($value);
    }

    /**
     * Sets the id attribute (trimmed; non-strings clear it).
     *
     * @param string $id
     */
    public function setId($id) {
        $this->id = self::normalize($id);
    }

    /**
     * Sets the href attribute (trimmed; non-strings clear it).
     *
     * @param string $href
     */
    public function setHref($href) {
        $this->href = self::normalize($href);
    }

    /**
     * Sets the media-type attribute (trimmed; non-strings clear it).
     *
     * @param string $mediaType
     */
    public function setMediaType($mediaType) {
        $this->mediaType = self::normalize($mediaType);
    }

    /**
     * Sets the properties attribute (trimmed; non-strings clear it).
     *
     * @param string $properties
     */
    public function setProperties($properties) {
        $this->properties = self::normalize($properties);
    }

    /**
     * Sets the required-namespace attribute (trimmed; non-strings clear it).
     *
     * @param string $requiredNamespace
     */
    public function setRequiredNamespace($requiredNamespace) {
        $this->requiredNamespace = self::normalize($requiredNamespace);
    }

    /**
     * Sets the required-modules attribute (trimmed; non-strings clear it).
     * It is only rendered when a required namespace is set as well.
     *
     * @param string $requiredModules
     */
    public function setRequiredModules($requiredModules) {
        $this->requiredModules = self::normalize($requiredModules);
    }

    /**
     * Sets the fallback attribute (trimmed; non-strings clear it).
     *
     * @param string $fallback
     */
    public function setfallback($fallback) {
        $this->fallback = self::normalize($fallback);
    }

    /**
     * Sets the fallback-style attribute (trimmed; non-strings clear it).
     *
     * @param string $fallbackStyle
     */
    public function setFallbackStyle($fallbackStyle) {
        $this->fallbackStyle = self::normalize($fallbackStyle);
    }

    /**
     * Renders the <item /> element. The properties attribute is written only
     * for EPub::BOOK_VERSION_EPUB3; the remaining optional attributes are
     * written whenever they are set, each group on its own continuation line.
     *
     * @param string $bookVersion
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $pieces = array(
            "\t\t<item id=\"" . $this->id . "\" href=\"" . $this->href . "\" media-type=\"" . $this->mediaType . "\" ",
        );

        if ($bookVersion === EPub::BOOK_VERSION_EPUB3 && isset($this->properties)) {
            $pieces[] = "properties=\"" . $this->properties . "\" ";
        }

        if (isset($this->requiredNamespace)) {
            $pieces[] = "\n\t\t\trequired-namespace=\"" . $this->requiredNamespace . "\" ";
            // required-modules is meaningful only together with a namespace.
            if (isset($this->requiredModules)) {
                $pieces[] = "required-modules=\"" . $this->requiredModules . "\" ";
            }
        }

        if (isset($this->fallback)) {
            $pieces[] = "\n\t\t\tfallback=\"" . $this->fallback . "\" ";
        }

        if (isset($this->fallbackStyle)) {
            $pieces[] = "\n\t\t\tfallback-style=\"" . $this->fallbackStyle . "\" ";
        }

        $pieces[] = "/>\n";

        return implode("", $pieces);
    }
}
629 | |||
/**
 * ePub OPF Spine structure
 *
 * Itemref objects keyed by their idref, rendered as the <spine> section.
 */
class Spine {
    const _VERSION = 1.00;

    private $itemrefs = array();
    private $toc = NULL;

    /**
     * Class constructor.
     *
     * @param string $toc value of the spine's toc attribute
     */
    public function __construct($toc = "ncx") {
        $this->setToc($toc);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->itemrefs, $this->toc);
    }

    /**
     * Sets the toc attribute (trimmed; non-strings clear it).
     *
     * @param string $toc
     */
    public function setToc($toc) {
        if (is_string($toc)) {
            $this->toc = trim($toc);
        } else {
            $this->toc = NULL;
        }
    }

    /**
     * Adds an item reference. Ignored when the argument is not exactly an
     * Itemref instance or when an entry with the same idref already exists.
     *
     * @param Itemref $itemref
     */
    public function addItemref($itemref) {
        if (!is_object($itemref) || get_class($itemref) !== "Itemref") {
            return;
        }

        // The first entry for a given idref wins.
        if (isset($this->itemrefs[$itemref->getIdref()])) {
            return;
        }

        $this->itemrefs[$itemref->getIdref()] = $itemref;
    }

    /**
     * Renders the <spine> section with its item references in insertion order.
     *
     * @return string
     */
    public function finalize() {
        $rendered = "";
        foreach ($this->itemrefs as $entry) {
            $rendered .= $entry->finalize();
        }

        return "\n\t<spine toc=\"" . $this->toc . "\">\n" . $rendered . "\t</spine>\n";
    }
}
696 | |||
/**
 * ePub OPF ItemRef structure
 *
 * One <itemref> element of the spine.
 */
class Itemref {
    const _VERSION = 3.00;

    private $idref = NULL;
    private $linear = TRUE;

    /**
     * Class constructor.
     *
     * @param string $idref
     * @param bool   $linear
     */
    public function __construct($idref, $linear = TRUE) {
        $this->setIdref($idref);
        $this->setLinear($linear);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->idref, $this->linear);
    }

    /**
     * Sets the idref attribute (trimmed; non-strings clear it).
     *
     * @param string $idref
     */
    public function setIdref($idref) {
        if (is_string($idref)) {
            $this->idref = trim($idref);
        } else {
            $this->idref = NULL;
        }
    }

    /**
     * Returns the idref as stored by setIdref().
     *
     * @return string $idref
     */
    public function getIdref() {
        return $this->idref;
    }

    /**
     * Sets the linear flag. Only the boolean TRUE counts as linear; every
     * other value marks the reference as non-linear.
     *
     * @param bool $linear
     */
    public function setLinear($linear = TRUE) {
        $this->linear = (TRUE === $linear);
    }

    /**
     * Renders the <itemref /> element; linear="no" is added for non-linear
     * references only.
     *
     * @return string
     */
    public function finalize() {
        $linearAttr = ($this->linear == FALSE) ? " linear=\"no\"" : "";

        return "\t\t<itemref idref=\"" . $this->idref . "\"" . $linearAttr . " />\n";
    }
}
769 | |||
/**
 * ePub OPF Guide structure
 *
 * Ordered list of Reference objects rendered as the <guide> section.
 */
class Guide {
    const _VERSION = 3.00;

    private $references = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->references);
    }

    /**
     * Number of references currently held.
     *
     * @return int
     */
    public function length() {
        return count($this->references);
    }

    /**
     * Appends a reference. Anything that is not exactly a Reference instance
     * is ignored.
     *
     * @param Reference $reference
     */
    public function addReference($reference) {
        if (!is_object($reference)) {
            return;
        }
        if (get_class($reference) !== "Reference") {
            return;
        }
        $this->references[] = $reference;
    }

    /**
     * Renders the <guide> section, or an empty string when there are no
     * references.
     *
     * @return string
     */
    public function finalize() {
        if (count($this->references) === 0) {
            return "";
        }

        $rendered = array("\n\t<guide>\n");
        foreach ($this->references as $entry) {
            $rendered[] = $entry->finalize();
        }
        $rendered[] = "\t</guide>\n";

        return implode("", $rendered);
    }
}
834 | |||
/**
 * ePub OPF guide Reference: one <reference> element (type, title, href)
 * together with the constants for the reference types.
 */
class Reference {
    const _VERSION = 1.00;

    /* REFERENCE types are derived from the "Chicago Manual of Style"
     */

    /** Acknowledgements page */
    const ACKNOWLEDGEMENTS = "acknowledgements";

    /** Bibliography page */
    const BIBLIOGRAPHY = "bibliography";

    /** Colophon page */
    const COLOPHON = "colophon";

    /** Copyright page */
    const COPYRIGHT_PAGE = "copyright-page";

    /** Dedication */
    const DEDICATION = "dedication";

    /** Epigraph */
    const EPIGRAPH = "epigraph";

    /** Foreword */
    const FOREWORD = "foreword";

    /** Glossary page */
    const GLOSSARY = "glossary";

    /** Back-of-book style index */
    const INDEX = "index";

    /** List of illustrations */
    const LIST_OF_ILLUSTRATIONS = "loi";

    /** List of tables */
    const LIST_OF_TABLES = "lot";

    /** Notes page */
    const NOTES = "notes";

    /** Preface page */
    const PREFACE = "preface";

    /** Table of contents */
    const TABLE_OF_CONTENTS = "toc";

    /** Page with possibly title, author, publisher, and other metadata */
    const TITLE_PAGE = "titlepage";

    /** First page of the book, i.e. first page of the first chapter */
    const TEXT = "text";

    // ******************
    // ePub3 constants
    // ******************

    // Document partitions
    /** The publication's cover(s), jacket information, etc. This is officially in ePub3, but works for ePub 2 as well */
    const COVER = "cover";

    /** Preliminary material to the content body, such as tables of contents, dedications, etc. */
    const FRONTMATTER = "frontmatter";

    /** The main (body) content of a document. */
    const BODYMATTER = "bodymatter";

    /** Ancillary material occurring after the document body, such as indices, appendices, etc. */
    const BACKMATTER = "backmatter";


    private $type = NULL;
    private $title = NULL;
    private $href = NULL;

    /**
     * Class constructor.
     *
     * @param string $type
     * @param string $title
     * @param string $href
     */
    public function __construct($type, $title, $href) {
        $this->setType($type);
        $this->setTitle($title);
        $this->setHref($href);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->type, $this->title, $this->href);
    }

    /**
     * Sets the type attribute (trimmed; non-strings clear it).
     *
     * @param string $type
     */
    public function setType($type) {
        if (is_string($type)) {
            $this->type = trim($type);
        } else {
            $this->type = NULL;
        }
    }

    /**
     * Sets the title attribute (trimmed; non-strings clear it).
     *
     * @param string $title
     */
    public function setTitle($title) {
        if (is_string($title)) {
            $this->title = trim($title);
        } else {
            $this->title = NULL;
        }
    }

    /**
     * Sets the href attribute (trimmed; non-strings clear it).
     *
     * @param string $href
     */
    public function setHref($href) {
        if (is_string($href)) {
            $this->href = trim($href);
        } else {
            $this->href = NULL;
        }
    }

    /**
     * Renders the <reference /> element; cleared fields render as empty
     * attribute values.
     *
     * @return string
     */
    public function finalize() {
        $line = "\t\t<reference type=\"" . $this->type . "\"";
        $line .= " title=\"" . $this->title . "\"";
        $line .= " href=\"" . $this->href . "\"";

        return $line . " />\n";
    }
}
976 | |||
977 | /** | ||
978 | * Common Marc codes. | ||
979 | * Ref: http://www.loc.gov/marc/relators/ | ||
980 | */ | ||
981 | class MarcCode { | ||
982 | const _VERSION = 3.00; | ||
983 | |||
984 | /** | ||
985 | * Adapter | ||
986 | * | ||
987 | * Use for a person who | ||
988 | * 1) reworks a musical composition, usually for a different medium, or | ||
989 | * 2) rewrites novels or stories for motion pictures or other audiovisual medium. | ||
990 | */ | ||
991 | const ADAPTER = "adp"; | ||
992 | |||
993 | /** | ||
994 | * Annotator | ||
995 | * | ||
996 | * Use for a person who writes manuscript annotations on a printed item. | ||
997 | */ | ||
998 | const ANNOTATOR = "ann"; | ||
999 | |||
1000 | /** | ||
1001 | * Arranger | ||
1002 | * | ||
1003 | * Use for a person who transcribes a musical composition, usually for a different | ||
1004 | * medium from that of the original; in an arrangement the musical substance remains | ||
1005 | * essentially unchanged. | ||
1006 | */ | ||
1007 | const ARRANGER = "arr"; | ||
1008 | |||
1009 | /** | ||
1010 | * Artist | ||
1011 | * | ||
1012 | * Use for a person (e.g., a painter) who conceives, and perhaps also implements, | ||
1013 | * an original graphic design or work of art, if specific codes (e.g., [egr], | ||
1014 | * [etr]) are not desired. For book illustrators, prefer Illustrator [ill]. | ||
1015 | */ | ||
1016 | const ARTIST = "art"; | ||
1017 | |||
1018 | /** | ||
1019 | * Associated name | ||
1020 | * | ||
1021 | * Use as a general relator for a name associated with or found in an item or | ||
1022 | * collection, or which cannot be determined to be that of a Former owner [fmo] | ||
1023 | * or other designated relator indicative of provenance. | ||
1024 | */ | ||
1025 | const ASSOCIATED_NAME = "asn"; | ||
1026 | |||
1027 | /** | ||
1028 | * Author | ||
1029 | * | ||
1030 | * Use for a person or corporate body chiefly responsible for the intellectual | ||
1031 | * or artistic content of a work. This term may also be used when more than one | ||
1032 | * person or body bears such responsibility. | ||
1033 | */ | ||
1034 | const AUTHOR = "aut"; | ||
1035 | |||
1036 | /** | ||
1037 | * Author in quotations or text extracts | ||
1038 | * | ||
1039 | * Use for a person whose work is largely quoted or extracted in a works to which | ||
1040 | * he or she did not contribute directly. Such quotations are found particularly | ||
1041 | * in exhibition catalogs, collections of photographs, etc. | ||
1042 | */ | ||
1043 | const AUTHOR_IN_QUOTES = "aqt"; | ||
1044 | |||
1045 | /** | ||
1046 | * Author of afterword, colophon, etc. | ||
1047 | * | ||
1048 | * Use for a person or corporate body responsible for an afterword, postface, | ||
1049 | * colophon, etc. but who is not the chief author of a work. | ||
1050 | */ | ||
1051 | const AUTHOR_OF_AFTERWORD = "aft"; | ||
1052 | |||
1053 | /** | ||
1054 | * Author of introduction, etc. | ||
1055 | * | ||
1056 | * Use for a person or corporate body responsible for an introduction, preface, | ||
1057 | * foreword, or other critical matter, but who is not the chief author. | ||
1058 | */ | ||
1059 | const AUTHOR_OF_INTRO = "aui"; | ||
1060 | |||
1061 | /** | ||
1062 | * Bibliographic antecedent | ||
1063 | * | ||
1064 | * Use for the author responsible for a work upon which the work represented by | ||
1065 | * the catalog record is based. This can be appropriate for adaptations, sequels, | ||
1066 | * continuations, indexes, etc. | ||
1067 | */ | ||
1068 | const BIB_ANTECEDENT = "ant"; | ||
1069 | |||
1070 | /** | ||
1071 | * Book producer | ||
1072 | * | ||
1073 | * Use for the person or firm responsible for the production of books and other | ||
1074 | * print media, if specific codes (e.g., [bkd], [egr], [tyd], [prt]) are not desired. | ||
1075 | */ | ||
1076 | const BOOK_PRODUCER = "bkp"; | ||
1077 | |||
1078 | /** | ||
1079 | * Collaborator | ||
1080 | * | ||
1081 | * Use for a person or corporate body that takes a limited part in the elaboration | ||
1082 | * of a work of another author or that brings complements (e.g., appendices, notes) | ||
1083 | * to the work of another author. | ||
1084 | */ | ||
1085 | const COLABORATOR = "clb"; | ||
1086 | |||
1087 | /** | ||
1088 | * Commentator | ||
1089 | * | ||
1090 | * Use for a person who provides interpretation, analysis, or a discussion of the | ||
1091 | * subject matter on a recording, motion picture, or other audiovisual medium. | ||
1092 | * Not to be confused with Compiler [com], which is used for a person who produces | ||
1093 | * a work or publication by selecting and putting together material from the works of various persons or bodies. | ||
1094 | */ | ||
1095 | const COMMENTATOR = "cmm"; | ||
1096 | |||
1097 | /** | ||
1098 | * Designer | ||
1099 | * | ||
1100 | * Use for a person or organization responsible for design if specific codes (e.g., | ||
1101 | * [bkd], [tyd]) are not desired. | ||
1102 | */ | ||
1103 | const DESIGNER = "dsr"; | ||
1104 | |||
1105 | /** | ||
1106 | * Editor | ||
1107 | * | ||
1108 | * Use for a person who prepares for publication a work not primarily his/her own, | ||
1109 | * such as by elucidating text, adding introductory or other critical matter, or | ||
1110 | * technically directing an editorial staff. | ||
1111 | */ | ||
1112 | const EDITORT = "edt"; | ||
1113 | |||
1114 | /** | ||
1115 | * Illustrator | ||
1116 | * | ||
1117 | * Use for the person who conceives, and perhaps also implements, a design or | ||
1118 | * illustration, usually to accompany a written text. | ||
1119 | */ | ||
1120 | const ILLUSTRATOR = "ill"; | ||
1121 | |||
1122 | /** | ||
1123 | * Lyricist | ||
1124 | * | ||
1125 | * Use for the writer of the text of a song. | ||
1126 | */ | ||
1127 | const LYRICIST = "lyr"; | ||
1128 | |||
1129 | /** | ||
1130 | * Metadata contact | ||
1131 | * | ||
1132 | * Use for the person or organization primarily responsible for compiling and | ||
1133 | * maintaining the original description of a metadata set (e.g., geospatial | ||
1134 | * metadata set). | ||
1135 | */ | ||
1136 | const METADATA_CONTACT = "mdc"; | ||
1137 | |||
1138 | /** | ||
1139 | * Musician | ||
1140 | * | ||
1141 | * Use for the person who performs music or contributes to the musical content | ||
1142 | * of a work when it is not possible or desirable to identify the function more | ||
1143 | * precisely. | ||
1144 | */ | ||
1145 | const MUSICIAN = "mus"; | ||
1146 | |||
1147 | /** | ||
1148 | * Narrator | ||
1149 | * | ||
1150 | * Use for the speaker who relates the particulars of an act, occurrence, or | ||
1151 | * course of events. | ||
1152 | */ | ||
1153 | const NARRATOR = "nrt"; | ||
1154 | |||
1155 | /** | ||
1156 | * Other | ||
1157 | * | ||
1158 | * Use for relator codes from other lists which have no equivalent in the MARC | ||
1159 | * list or for terms which have not been assigned a code. | ||
1160 | */ | ||
1161 | const OTHER = "oth"; | ||
1162 | |||
1163 | /** | ||
1164 | * Photographer | ||
1165 | * | ||
1166 | * Use for the person or organization responsible for taking photographs, whether | ||
1167 | * they are used in their original form or as reproductions. | ||
1168 | */ | ||
1169 | const PHOTOGRAPHER = "pht"; | ||
1170 | |||
1171 | /** | ||
1172 | * Printer | ||
1173 | * | ||
1174 | * Use for the person or organization who prints texts, whether from type or plates. | ||
1175 | */ | ||
1176 | const PRINTER = "prt"; | ||
1177 | |||
1178 | /** | ||
1179 | * Redactor | ||
1180 | * | ||
1181 | * Use for a person who writes or develops the framework for an item without | ||
1182 | * being intellectually responsible for its content. | ||
1183 | */ | ||
1184 | const REDACTOR = "red"; | ||
1185 | |||
1186 | /** | ||
1187 | * Reviewer | ||
1188 | * | ||
1189 | * Use for a person or corporate body responsible for the review of book, motion | ||
1190 | * picture, performance, etc. | ||
1191 | */ | ||
1192 | const REVIEWER = "rev"; | ||
1193 | |||
1194 | /** | ||
1195 | * Sponsor | ||
1196 | * | ||
1197 | * Use for the person or agency that issued a contract, or under whose auspices | ||
1198 | * a work has been written, printed, published, etc. | ||
1199 | */ | ||
1200 | const SPONSOR = "spn"; | ||
1201 | |||
1202 | /** | ||
1203 | * Thesis advisor | ||
1204 | * | ||
1205 | * Use for the person under whose supervision a degree candidate develops and | ||
1206 | * presents a thesis, memoir, or text of a dissertation. | ||
1207 | */ | ||
1208 | const THESIS_ADVISOR = "ths"; | ||
1209 | |||
1210 | /** | ||
1211 | * Transcriber | ||
1212 | * | ||
1213 | * Use for a person who prepares a handwritten or typewritten copy from original | ||
1214 | * material, including from dictated or orally recorded material. | ||
1215 | */ | ||
1216 | const TRANSCRIBER = "trc"; | ||
1217 | |||
1218 | /** | ||
1219 | * Translator | ||
1220 | * | ||
1221 | * Use for a person who renders a text from one language into another, or from | ||
1222 | * an older form of a language into the modern form. | ||
1223 | */ | ||
1224 | const TRANSLATOR = "trl"; | ||
1225 | } | ||
1226 | ?> | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php new file mode 100644 index 00000000..f1f41bd5 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.php | |||
@@ -0,0 +1,2432 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Create an ePub compatible book file. | ||
4 | * | ||
5 | * Please note, once finalized a book can no longer have chapters of data added or changed. | ||
6 | * | ||
7 | * License: GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * | ||
9 | * Thanks to: Adam Schmalhofer and Kirstyn Fox for invaluable input and for "nudging" me in the right direction :) | ||
10 | * | ||
11 | * @author A. Grandt <php@grandt.com> | ||
12 | * @copyright 2009-2014 A. Grandt | ||
13 | * @license GNU LGPL 2.1 | ||
14 | * @version 3.20 | ||
15 | * @link http://www.phpclasses.org/package/6115 | ||
16 | * @link https://github.com/Grandt/PHPePub | ||
17 | * @uses Zip.php version 1.50; http://www.phpclasses.org/browse/package/6110.html or https://github.com/Grandt/PHPZip | ||
18 | */ | ||
19 | class EPub { | ||
20 | const VERSION = 3.20; | ||
21 | const REQ_ZIP_VERSION = 1.60; | ||
22 | |||
23 | const IDENTIFIER_UUID = 'UUID'; | ||
24 | const IDENTIFIER_URI = 'URI'; | ||
25 | const IDENTIFIER_ISBN = 'ISBN'; | ||
26 | |||
27 | /** Ignore all external references, and do not process the file for these */ | ||
28 | const EXTERNAL_REF_IGNORE = 0; | ||
29 | /** Process the file for external references and add them to the book */ | ||
30 | const EXTERNAL_REF_ADD = 1; | ||
31 | /** Process the file for external references and add them to the book, but remove images, and img tags */ | ||
32 | const EXTERNAL_REF_REMOVE_IMAGES = 2; | ||
33 | /** Process the file for external references and add them to the book, but replace images, and img tags with [image] */ | ||
34 | const EXTERNAL_REF_REPLACE_IMAGES = 3; | ||
35 | |||
36 | const DIRECTION_LEFT_TO_RIGHT = "ltr"; | ||
37 | const DIRECTION_RIGHT_TO_LEFT = "rtl"; | ||
38 | |||
39 | const BOOK_VERSION_EPUB2 = "2.0"; | ||
40 | const BOOK_VERSION_EPUB3 = "3.0"; | ||
41 | |||
42 | private $bookVersion = EPub::BOOK_VERSION_EPUB2; | ||
43 | |||
44 | public $maxImageWidth = 768; | ||
45 | public $maxImageHeight = 1024; | ||
46 | |||
47 | public $splitDefaultSize = 250000; | ||
48 | /** Gifs can crash some early ADE based readers, and are disabled by default. | ||
49 | * getImage will convert these if it can, unless this is set to TRUE. | ||
50 | */ | ||
51 | public $isGifImagesEnabled = FALSE; | ||
52 | public $isReferencesAddedToToc = TRUE; | ||
53 | |||
54 | private $zip; | ||
55 | |||
56 | private $title = ""; | ||
57 | private $language = "en"; | ||
58 | private $identifier = ""; | ||
59 | private $identifierType = ""; | ||
60 | private $description = ""; | ||
61 | private $author = ""; | ||
62 | private $authorSortKey = ""; | ||
63 | private $publisherName = ""; | ||
64 | private $publisherURL = ""; | ||
65 | private $date = 0; | ||
66 | private $rights = ""; | ||
67 | private $coverage = ""; | ||
68 | private $relation = ""; | ||
69 | private $sourceURL = ""; | ||
70 | |||
71 | private $chapterCount = 0; | ||
72 | private $opf = NULL; | ||
73 | private $ncx = NULL; | ||
74 | private $isFinalized = FALSE; | ||
75 | private $isCoverImageSet = FALSE; | ||
76 | private $buildTOC = FALSE; | ||
77 | private $tocTitle = NULL; | ||
78 | private $tocFileName = NULL; | ||
79 | private $tocCSSClass = NULL; | ||
80 | private $tocAddReferences = FALSE; | ||
81 | private $tocCssFileName = NULL; | ||
82 | |||
83 | private $fileList = array(); | ||
84 | private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT; | ||
85 | private $languageCode = "en"; | ||
86 | |||
87 | /** | ||
88 | * Used for building the TOC. | ||
89 | * If this list is overwritten it MUST contain at least "text" as an element. | ||
90 | */ | ||
91 | public $referencesOrder = NULL; | ||
92 | |||
93 | private $dateformat = 'Y-m-d\TH:i:s.000000P'; // ISO 8601 long | ||
94 | private $dateformatShort = 'Y-m-d'; // short date format to placate ePubChecker. | ||
95 | private $headerDateFormat = "D, d M Y H:i:s T"; | ||
96 | |||
97 | protected $isCurlInstalled; | ||
98 | protected $isGdInstalled; | ||
99 | protected $isExifInstalled; | ||
100 | protected $isFileGetContentsInstalled; | ||
101 | protected $isFileGetContentsExtInstalled; | ||
102 | |||
103 | private $bookRoot = "OEBPS/"; | ||
104 | private $docRoot = NULL; | ||
105 | private $EPubMark = TRUE; | ||
106 | private $generator = ""; | ||
107 | |||
108 | private $log = NULL; | ||
109 | public $isLogging = TRUE; | ||
110 | |||
111 | public $encodeHTML = FALSE; | ||
112 | |||
/**
 * File extension (lower case, without the dot) => media type.
 *
 * Fixes relative to the previous table: "tsv" is text/tab-separated-values,
 * "tif" is image/tiff (same as "tiff"), and "xwd" is image/x-xwindowdump.
 */
private $mimetypes = array(
    // application
    "js" => "application/x-javascript",
    "swf" => "application/x-shockwave-flash",
    "xht" => "application/xhtml+xml",
    "xhtml" => "application/xhtml+xml",
    "zip" => "application/zip",
    // audio
    "aif" => "audio/x-aiff",
    "aifc" => "audio/x-aiff",
    "aiff" => "audio/x-aiff",
    "au" => "audio/basic",
    "kar" => "audio/midi",
    "m3u" => "audio/x-mpegurl",
    "mid" => "audio/midi",
    "midi" => "audio/midi",
    "mp2" => "audio/mpeg",
    "mp3" => "audio/mpeg",
    "mpga" => "audio/mpeg",
    "oga" => "audio/ogg",
    "ogg" => "audio/ogg",
    "ra" => "audio/x-realaudio",
    "ram" => "audio/x-pn-realaudio",
    "rm" => "audio/x-pn-realaudio",
    "rpm" => "audio/x-pn-realaudio-plugin",
    "snd" => "audio/basic",
    "wav" => "audio/x-wav",
    // image
    "bmp" => "image/bmp",
    "djv" => "image/vnd.djvu",
    "djvu" => "image/vnd.djvu",
    "gif" => "image/gif",
    "ief" => "image/ief",
    "jpe" => "image/jpeg",
    "jpeg" => "image/jpeg",
    "jpg" => "image/jpeg",
    "pbm" => "image/x-portable-bitmap",
    "pgm" => "image/x-portable-graymap",
    "png" => "image/png",
    "pnm" => "image/x-portable-anymap",
    "ppm" => "image/x-portable-pixmap",
    "ras" => "image/x-cmu-raster",
    "rgb" => "image/x-rgb",
    "tif" => "image/tiff",
    "tiff" => "image/tiff",
    "wbmp" => "image/vnd.wap.wbmp",
    "xbm" => "image/x-xbitmap",
    "xpm" => "image/x-xpixmap",
    "xwd" => "image/x-xwindowdump",
    // text
    "asc" => "text/plain",
    "css" => "text/css",
    "etx" => "text/x-setext",
    "htm" => "text/html",
    "html" => "text/html",
    "rtf" => "text/rtf",
    "rtx" => "text/richtext",
    "sgm" => "text/sgml",
    "sgml" => "text/sgml",
    "tsv" => "text/tab-separated-values",
    "txt" => "text/plain",
    "wml" => "text/vnd.wap.wml",
    "wmls" => "text/vnd.wap.wmlscript",
    "xml" => "text/xml",
    "xsl" => "text/xml",
    // video
    "avi" => "video/x-msvideo",
    "mov" => "video/quicktime",
    "movie" => "video/x-sgi-movie",
    "mp4" => "video/mp4",
    "mpe" => "video/mpeg",
    "mpeg" => "video/mpeg",
    "mpg" => "video/mpeg",
    "mxu" => "video/vnd.mpegurl",
    "ogv" => "video/ogg",
    "qt" => "video/quicktime",
    "webm" => "video/webm");
119 | |||
120 | // These are the ONLY allowed types in that these are the ones ANY reader must support, any other MUST have the fallback attribute pointing to one of these. | ||
121 | private $coreMediaTypes = array("image/gif", "image/jpeg", "image/png", "image/svg+xml", "application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/css", "text/x-oeb1-css", "text/x-oeb1-document"); | ||
122 | |||
123 | private $opsContentTypes = array("application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/x-oeb1-document"); | ||
124 | |||
125 | private $forbiddenCharacters = array("?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "%"); | ||
126 | |||
127 | private $htmlContentHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n<title></title>\n</head>\n<body>\n"; | ||
128 | private $htmlContentFooter = "</body>\n</html>\n"; | ||
129 | |||
/**
 * Class constructor.
 *
 * Loads the Zip and Logger dependencies, verifies that Zip.php is at least
 * version REQ_ZIP_VERSION, stores the book settings, loads the remaining
 * helper files and finally calls initialize() to set up the empty book.
 *
 * Script execution is terminated with an HTML message (die) when Zip.php is
 * missing its VERSION constant or is older than REQ_ZIP_VERSION.
 *
 * @param string $bookVersion      EPub::BOOK_VERSION_EPUB2 (default) or EPub::BOOK_VERSION_EPUB3.
 * @param string $languageCode     Language code handed to the NCX in initialize(). Default is "en".
 * @param string $writingDirection EPub::DIRECTION_LEFT_TO_RIGHT (default) or EPub::DIRECTION_RIGHT_TO_LEFT.
 */
function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) {
    include_once("Zip.php");
    include_once("Logger.php");

    // Check the Zip dependency BEFORE anything reads Zip::VERSION; otherwise a
    // missing or outdated Zip.php ends in a fatal "undefined constant" error
    // in the logging code instead of this explanatory message.
    if (!defined("Zip::VERSION") || Zip::VERSION < self::REQ_ZIP_VERSION) {
        die("<p>EPub version " . self::VERSION . " requires Zip.php at version " . self::REQ_ZIP_VERSION . " or higher.<br />You can obtain the latest version from <a href=\"http://www.phpclasses.org/browse/package/6110.html\">http://www.phpclasses.org/browse/package/6110.html</a>.</p>");
    }

    $this->bookVersion = $bookVersion;
    $this->writingDirection = $writingDirection;
    $this->languageCode = $languageCode;

    $this->log = new Logger("EPub", $this->isLogging);

    /* Prepare logging, just in case it is used later. */
    if ($this->isLogging) {
        $this->log->logLine("EPub class version....: " . self::VERSION);
        $this->log->logLine("EPub req. Zip version.: " . self::REQ_ZIP_VERSION);
        $this->log->logLine("Zip version...........: " . Zip::VERSION);
        $this->log->dumpInstalledModules();
    }

    include_once("EPubChapterSplitter.php");
    include_once("EPub.HtmlEntities.php");
    include_once("EPub.NCX.php");
    include_once("EPub.OPF.php");

    $this->initialize();
}
164 | |||
/**
 * Class destructor.
 *
 * Unsets every listed member of the instance, one after the other.
 *
 * @return void
 * @TODO make sure elements in the destructor match the current class elements
 */
function __destruct() {
    $members = array(
        "bookVersion", "maxImageWidth", "maxImageHeight",
        "splitDefaultSize", "isGifImagesEnabled", "isReferencesAddedToToc",
        "zip", "title", "language", "identifier", "identifierType",
        "description", "author", "authorSortKey", "publisherName",
        "publisherURL", "date", "rights", "coverage", "relation",
        "sourceURL", "chapterCount", "opf", "ncx", "isFinalized",
        "isCoverImageSet", "fileList", "writingDirection", "languageCode",
        "referencesOrder", "dateformat", "dateformatShort", "headerDateFormat",
        "isCurlInstalled", "isGdInstalled", "isExifInstalled",
        "isFileGetContentsInstalled", "isFileGetContentsExtInstalled", "bookRoot",
        "docRoot", "EPubMark", "generator", "log", "isLogging",
        "encodeHTML", "mimetypes", "coreMediaTypes", "opsContentTypes",
        "forbiddenCharacters", "htmlContentHeader", "htmlContentFooter",
        "buildTOC", "tocTitle", "tocCSSClass", "tocAddReferences",
        "tocFileName", "tocCssFileName",
    );

    // Same members, same order as before; only the spelling of the loop changed.
    foreach ($members as $member) {
        unset($this->$member);
    }
}
188 | |||
/**
 * Initialize defaults.
 *
 * Sets the default reference order used for the TOC, detects which optional
 * PHP features are available (curl, gd, exif, file_get_contents, URL fopen),
 * creates the Zip archive with its "mimetype" entry and META-INF/container.xml,
 * switches to the EPUB 3 chapter header when the book is not EPUB 2, and
 * creates the NCX and OPF objects.
 *
 * @return void
 */
private function initialize() {
    $this->referencesOrder = array(
        Reference::COVER => "Cover Page",
        Reference::TITLE_PAGE => "Title Page",
        Reference::ACKNOWLEDGEMENTS => "Acknowledgements",
        Reference::BIBLIOGRAPHY => "Bibliography",
        Reference::COLOPHON => "Colophon",
        Reference::COPYRIGHT_PAGE => "Copyright",
        Reference::DEDICATION => "Dedication",
        Reference::EPIGRAPH => "Epigraph",
        Reference::FOREWORD => "Foreword",
        Reference::TABLE_OF_CONTENTS => "Table of Contents",
        Reference::NOTES => "Notes",
        Reference::PREFACE => "Preface",
        Reference::TEXT => "First Page",
        Reference::LIST_OF_ILLUSTRATIONS => "List of Illustrations",
        Reference::LIST_OF_TABLES => "List of Tables",
        Reference::GLOSSARY => "Glossary",
        Reference::INDEX => "Index");

    $this->docRoot = filter_input(INPUT_SERVER, "DOCUMENT_ROOT") . "/";

    $this->isCurlInstalled = extension_loaded('curl') && function_exists('curl_version');
    $this->isGdInstalled = extension_loaded('gd') && function_exists('gd_info');
    $this->isExifInstalled = extension_loaded('exif') && function_exists('exif_imagetype');
    $this->isFileGetContentsInstalled = function_exists('file_get_contents');
    $this->isFileGetContentsExtInstalled = $this->isFileGetContentsInstalled && ini_get('allow_url_fopen');

    // The "mimetype" entry is written first, with the Zip extra field switched
    // off for just that entry (presumably to satisfy the EPUB container rules).
    $this->zip = new Zip();
    $this->zip->setExtraField(FALSE);
    $this->zip->addFile("application/epub+zip", "mimetype");
    $this->zip->setExtraField(TRUE);
    $this->zip->addDirectory("META-INF");

    // Held in a local variable: the class declares no "content" property, and
    // creating one dynamically is deprecated as of PHP 8.2.
    $containerXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\n\t<rootfiles>\n\t\t<rootfile full-path=\"" . $this->bookRoot . "book.opf\" media-type=\"application/oebps-package+xml\" />\n\t</rootfiles>\n</container>\n";

    if (!$this->isEPubVersion2()) {
        $this->htmlContentHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            . "<head>"
            . "<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n"
            . "<title></title>\n"
            . "</head>\n"
            . "<body>\n";
    }

    $this->zip->addFile($containerXml, "META-INF/container.xml", 0, NULL, FALSE);

    $this->ncx = new Ncx(NULL, NULL, NULL, $this->languageCode, $this->writingDirection);
    $this->opf = new Opf();
    $this->ncx->setVersion($this->bookVersion);
    $this->opf->setVersion($this->bookVersion);
    $this->opf->addItem("ncx", "book.ncx", Ncx::MIMETYPE);
    $this->chapterCount = 0;
}
247 | |||
/**
 * Add dynamically generated data as a file to the book.
 *
 * The file is written below the book root in the Zip archive, recorded in the
 * internal file list and registered as an item in the OPF.
 *
 * @param string $fileName Filename to use for the file, must be unique for the book.
 * @param string $fileId   Unique identifier for the file.
 * @param string $fileData File data
 * @param string $mimetype file mime type
 * @return bool $success FALSE if the book is finalized or the (normalized) file name is already in use.
 */
function addFile($fileName, $fileId, $fileData, $mimetype) {
    if ($this->isFinalized) {
        return FALSE;
    }

    // Normalize before the duplicate check: the file list is keyed by the
    // normalized name, so checking the raw name could miss an existing entry.
    $fileName = $this->normalizeFileName($fileName);
    if (array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    // Every mimetype except image/* is flagged for compression.
    $compress = (strpos($mimetype, "image/") !== 0);

    $this->zip->addFile($fileData, $this->bookRoot.$fileName, 0, NULL, $compress);
    $this->fileList[$fileName] = $fileName;
    $this->opf->addItem($fileId, $fileName, $mimetype);
    return TRUE;
}
271 | |||
/**
 * Add a large file directly from the filesystem to the book.
 *
 * The file list and the OPF are only updated when the Zip archive reports
 * that the file was added.
 *
 * @param string $fileName Filename to use for the file, must be unique for the book.
 * @param string $fileId   Unique identifier for the file.
 * @param string $filePath File path
 * @param string $mimetype file mime type
 * @return bool $success FALSE if the book is finalized, the (normalized) file name is already in use, or the Zip archive rejected the file.
 */
function addLargeFile($fileName, $fileId, $filePath, $mimetype) {
    if ($this->isFinalized) {
        return FALSE;
    }

    // Normalize before the duplicate check: the file list is keyed by the
    // normalized name, so checking the raw name could miss an existing entry.
    $fileName = $this->normalizeFileName($fileName);
    if (array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    if ($this->zip->addLargeFile($filePath, $this->bookRoot.$fileName)) {
        $this->fileList[$fileName] = $fileName;
        $this->opf->addItem($fileId, $fileName, $mimetype);
        return TRUE;
    }
    return FALSE;
}
294 | |||
/**
 * Add a CSS file to the book.
 *
 * The file id is registered with a "css_" prefix and the mimetype "text/css".
 *
 * @param string $fileName Filename to use for the CSS file, must be unique for the book.
 * @param string $fileId   Unique identifier for the file.
 * @param string $fileData CSS data
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for <code>processCSSExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir  Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 *
 * @return bool $success FALSE if the book is finalized, the file name is already in use, or the file could not be added.
 */
function addCSSFile($fileName, $fileId, $fileData, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);

    // Check for duplicates on the cleaned name, and before any external
    // references are processed on behalf of a file that will be rejected.
    if (array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
        // Turn the CSS file's directory into the matching chain of "../"
        // steps, e.g. "css/sub/" becomes "../../".
        $cssDir = pathinfo($fileName);
        $cssDir = preg_replace('#^[/\.]+#i', "", $cssDir["dirname"] . "/");
        if (!empty($cssDir)) {
            $cssDir = preg_replace('#[^/]+/#i', "../", $cssDir);
        }

        // NOTE(review): presumably rewrites $fileData by reference - confirm against processCSSExternalReferences.
        $this->processCSSExternalReferences($fileData, $externalReferences, $baseDir, $cssDir);
    }

    // Report what actually happened instead of an unconditional TRUE.
    return $this->addFile($fileName, "css_" . $fileId, $fileData, "text/css");
}
327 | |||
/**
 * Add a chapter to the book. As a chapter should not exceed 250kB, you can pass an array with multiple parts as $chapterData.
 * These will still only show up as a single chapter in the book TOC.
 *
 * Besides chapter content, two content-less forms are handled: a $fileName
 * containing "#" (a link into an already added file) and the file name
 * "TOC.xhtml"; both only create a navigation point.
 *
 * @param string $chapterName Name of the chapter, will be used in the TOC
 * @param string $fileName    Filename to use for the chapter, must be unique for the book.
 * @param mixed  $chapterData Chapter text in XHTML (string), or an array of valid XHTML parts for the chapter. A file should NOT exceed 250kB.
 * @param bool   $autoSplit   Should the chapter be split if it exceeds the default split size? Default=FALSE, only used if $chapterData is a string.
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for <code>processChapterExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir     Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 * @return mixed $success FALSE if the addition failed, else the new NavPoint.
 */
function addChapter($chapterName, $fileName, $chapterData = NULL, $autoSplit = FALSE, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);
    $fileName = $this->sanitizeFileName($fileName);

    // Stays FALSE when none of the cases below applies, so the documented
    // failure value is returned instead of an undefined variable.
    $navPoint = FALSE;

    $chapter = $chapterData;
    if ($autoSplit && is_string($chapterData) && mb_strlen($chapterData) > $this->splitDefaultSize) {
        $splitter = new EPubChapterSplitter();

        $chapterArray = $splitter->splitChapter($chapterData);
        if (count($chapterArray) > 1) {
            $chapter = $chapterArray;
        }
    }

    if (!empty($chapter) && is_string($chapter)) {
        // Single-file chapter.
        if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
            $htmlDirInfo = pathinfo($fileName);
            $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/");
            $this->processChapterExternalReferences($chapter, $externalReferences, $baseDir, $htmlDir);
        }

        if ($this->encodeHTML === TRUE) {
            $chapter = $this->encodeHtml($chapter);
        }

        $this->chapterCount++;
        $this->addFile($fileName, "chapter" . $this->chapterCount, $chapter, "application/xhtml+xml");
        $this->opf->addItemRef("chapter" . $this->chapterCount);

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (is_array($chapter)) {
        // Multi-part chapter: every part becomes its own file "<name>_<n>.<ext>",
        // while the navigation point refers to the first part only.
        $fileNameParts = pathinfo($fileName);
        $extension = isset($fileNameParts['extension']) ? $fileNameParts['extension'] : "";
        $name = $fileNameParts['filename'];

        $partCount = 0;
        $this->chapterCount++;

        // foreach replaces the each()/list() loop; each() was removed in PHP 8.
        foreach ($chapter as $v) {
            if ($this->encodeHTML === TRUE) {
                $v = $this->encodeHtml($v);
            }

            if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
                $this->processChapterExternalReferences($v, $externalReferences, $baseDir);
            }
            $partCount++;
            $partName = $name . "_" . $partCount;
            $this->addFile($partName . "." . $extension, $partName, $v, "application/xhtml+xml");
            $this->opf->addItemRef($partName);
        }
        $partName = $name . "_1." . $extension;
        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $partName, $partName);
        $this->ncx->addNavPoint($navPoint);

        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (!isset($chapterData) && strpos($fileName, "#") > 0) {
        // No content, file name with a fragment: navigation point only.
        $this->chapterCount++;

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (!isset($chapterData) && $fileName=="TOC.xhtml") {
        // No content, TOC file: reference the "toc" item in the spine.
        $this->chapterCount++;
        $this->opf->addItemRef("toc");

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    }
    return $navPoint;
}
423 | |||
/**
 * Open a new chapter level.
 *
 * Chapters added afterwards are placed on this new level. The title is run
 * through decodeHtmlEntities() before everything is handed to the NCX.
 *
 * @param string $navTitle
 * @param string $navId
 * @param string $navClass
 * @param bool   $isNavHidden
 * @param string $writingDirection
 * @return NavPoint The new NavPoint for that level.
 */
function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
    $decodedTitle = $this->decodeHtmlEntities($navTitle);

    return $this->ncx->subLevel($decodedTitle, $navId, $navClass, $isNavHidden, $writingDirection);
}
439 | |||
/**
 * Go up one chapter level.
 *
 * Chapters added afterwards are placed on the parent level of the current one.
 */
function backLevel() {
    $navigation = $this->ncx;
    $navigation->backLevel();
}
448 | |||
/**
 * Return to the root level.
 *
 * Chapters added afterwards are placed directly in the root NavMap.
 */
function rootLevel() {
    $navigation = $this->ncx;
    $navigation->rootLevel();
}
457 | |||
/**
 * Jump back to the given level.
 * Handy for leaving a deeply nested position in the structure in one step.
 * Values below 2 will have the same effect as rootLevel()
 *
 * @param int $newLevel
 */
function setCurrentLevel($newLevel) {
    $navigation = $this->ncx;
    $navigation->setCurrentLevel($newLevel);
}
468 | |||
/**
 * Get the current level count, as reported by the NCX.
 * This is the indentation of the current structure point.
 *
 * @return int current level count
 */
function getCurrentLevel() {
    $level = $this->ncx->getCurrentLevel();

    return $level;
}
478 | |||
/**
 * Surround chapter content with the configured HTML header and footer.
 *
 * Header, content and footer are each separated by a single newline.
 *
 * @param string $content
 * @return string $content The wrapped content.
 */
private function wrapChapter($content) {
    $wrapped = $this->htmlContentHeader . "\n";
    $wrapped .= $content . "\n";
    $wrapped .= $this->htmlContentFooter;

    return $wrapped;
}
488 | |||
/**
 * Reference pages are usually one or two pages for items such as Table of Contents, reference lists, Author notes or Acknowledgements.
 * These do not show up in the regular navigation list, as they are supposed to be short.
 *
 * The page is stored under the file id "ref_" . $reference. Unless the
 * reference is a Table of Contents that has already been registered, it is
 * also added to the OPF (item reference and guide reference) and recorded in
 * the NCX reference lists.
 *
 * @param string $pageName  Name of the chapter, will be used in the TOC
 * @param string $fileName  Filename to use for the chapter, must be unique for the book.
 * @param string $pageData  Page content in XHTML. File should NOT exceed 250kB.
 * @param string $reference Reference key
 * @param int    $externalReferences How to handle external references. See documentation for <code>processChapterExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir   Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 * @return bool $success FALSE if the book is finalized or $pageData is empty or not a string, TRUE once the page has been processed.
 */
function addReferencePage($pageName, $fileName, $pageData, $reference, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);

    // Without usable page data nothing is added, so do not report success.
    if (empty($pageData) || !is_string($pageData)) {
        return FALSE;
    }

    if ($this->encodeHTML === TRUE) {
        $pageData = $this->encodeHtml($pageData);
    }

    // The page data is stored exactly as supplied. It is deliberately NOT
    // wrapped in the default header/footer here: a wrapChapter() call used to
    // sit at this point, but its result was discarded, so pages have never
    // been wrapped (presumably callers pass complete XHTML documents).

    if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
        $htmlDirInfo = pathinfo($fileName);
        $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/");
        $this->processChapterExternalReferences($pageData, $externalReferences, $baseDir, $htmlDir);
    }

    $this->addFile($fileName, "ref_" . $reference, $pageData, "application/xhtml+xml");

    // A Table of Contents reference is only registered once; every other
    // reference type is (re-)registered on each call.
    if ($reference !== Reference::TABLE_OF_CONTENTS || !isset($this->ncx->referencesList[$reference])) {
        $this->opf->addItemRef("ref_" . $reference, FALSE);
        $this->opf->addReference($reference, $pageName, $fileName);

        $this->ncx->referencesList[$reference] = $fileName;
        $this->ncx->referencesName[$reference] = $pageName;
    }
    return TRUE;
}
537 | |||
/**
 * Attach a custom metadata entry to the book.
 *
 * Metadata are plain key/value pairs; avoiding name collisions is the
 * caller's responsibility.
 *
 * @param string $name    Metadata key.
 * @param string $content Metadata value.
 */
public function addCustomMetadata($name, $content) {
    // Delegate to the OPF object's addMeta().
    $package = $this->opf;
    $package->addMeta($name, $content);
}
549 | |||
/**
 * Attach a DublinCore metadata entry to the book.
 *
 * Use the DublinCore constants included in EPub, e.g. DublinCore::DATE.
 * Does nothing once the book has been finalized.
 *
 * @param string $dublinCoreConstant DublinCore element name.
 * @param string $value              Value; HTML entities are decoded before it is stored.
 */
public function addDublinCoreMetadata($dublinCoreConstant, $value) {
    if (!$this->isFinalized) {
        $decodedValue = $this->decodeHtmlEntities($value);
        $this->opf->addDCMeta($dublinCoreConstant, $decodedValue);
    }
}
565 | |||
/**
 * Add a cover image to the book.
 * If $imageData is not set, the function assumes $fileName is the path to the image file.
 *
 * The styling and structure of the generated XHTML is heavily inspired by the XHTML generated by Calibre.
 *
 * @param string $fileName  Filename to use for the image, must be unique for the book.
 * @param string $imageData Binary image data.
 * @param string $mimetype  Image mimetype, such as "image/jpeg" or "image/png".
 * @param string $bookTitle Markup placed above the cover image. It is inserted
 *                          verbatim (not escaped), so it must already be valid XHTML.
 *                          Defaults to "" so that it is no longer a required
 *                          parameter following optional ones.
 * @return bool FALSE if the book is finalized, a cover is already set, or the
 *              image could not be loaded; TRUE on success.
 */
function setCoverImage($fileName, $imageData = NULL, $mimetype = NULL, $bookTitle = "") {
    // The cover page is stored as "CoverPage.xhtml"; the legacy ".html" name is
    // still checked so that existing behaviour is a strict subset of the new one.
    if ($this->isFinalized || $this->isCoverImageSet
            || array_key_exists("CoverPage.xhtml", $this->fileList)
            || array_key_exists("CoverPage.html", $this->fileList)) {
        return FALSE;
    }

    if ($imageData == NULL) {
        // assume $fileName is the valid file path.
        if (!file_exists($fileName)) {
            // Attempt to locate the file using the doc root.
            $rp = realpath($this->docRoot . "/" . $fileName);

            if ($rp !== FALSE) {
                // only assign the docroot path if it actually exists there.
                $fileName = $rp;
            }
        }
        $image = $this->getImage($fileName);
        if (!is_array($image)) {
            // getImage() could not load the file; do not add an empty cover.
            return FALSE;
        }
        $imageData = $image['image'];
        $mimetype = $image['mime'];
        // getImage() may have converted the image; keep the extension in sync.
        $fileName = preg_replace("#\.[^\.]+$#", "." . $image['ext'], $fileName);
    }

    $path = pathinfo($fileName);
    $imgPath = "images/" . $path["basename"];

    if (empty($mimetype) && file_exists($fileName)) {
        $imageInfo = getimagesize($fileName);
        if ($imageInfo !== FALSE) {
            $mimetype = image_type_to_mime_type($imageInfo[2]);
        }
    }
    if (empty($mimetype)) {
        // Last resort: derive the mimetype from the file extension.
        $ext = isset($path['extension']) ? strtolower($path['extension']) : "";
        if ($ext == "jpg") {
            $ext = "jpeg";
        }
        $mimetype = "image/" . $ext;
    }

    $coverPage = "";

    if ($this->isEPubVersion2()) {
        $coverPage = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            . "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n"
            . " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" xml:lang=\"en\">\n"
            . "\t<head>\n"
            . "\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>\n"
            . "\t\t<title>Cover Image</title>\n"
            . "\t\t<link type=\"text/css\" rel=\"stylesheet\" href=\"Styles/CoverPage.css\" />\n"
            . "\t</head>\n"
            . "\t<body>\n"
            . "\t" . $bookTitle . "\n"
            . "\t\t<div>\n"
            . "\t\t\t<img src=\"" . $imgPath . "\" alt=\"Cover image\" style=\"height: 100%\"/>\n"
            . "\t\t</div>\n"
            . "\t</body>\n"
            . "</html>\n";
    } else {
        $coverPage = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            . "<head>"
            . "\t<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n"
            . "\t\t<title>Cover Image</title>\n"
            . "\t\t<link type=\"text/css\" rel=\"stylesheet\" href=\"Styles/CoverPage.css\" />\n"
            . "\t</head>\n"
            . "\t<body>\n"
            . "\t\t<section epub:type=\"cover\">\n"
            . "\t" . $bookTitle . "\n"
            . "\t\t\t<img src=\"" . $imgPath . "\" alt=\"Cover image\" style=\"height: 30%\"/>\n"
            . "\t\t</section>\n"
            . "\t</body>\n"
            . "</html>\n";
    }
    $coverPageCss = "@page, body, div, img {\n"
        . "\tpadding: 0pt;\n"
        . "\tmargin:0pt;\n"
        . "}\n\nbody {\n"
        . "\ttext-align: center;\n"
        . "}\n";

    $this->addCSSFile("Styles/CoverPage.css", "CoverPageCss", $coverPageCss);
    $this->addFile($imgPath, "CoverImage", $imageData, $mimetype);
    $this->addReferencePage("CoverPage", "CoverPage.xhtml", $coverPage, "cover");
    $this->isCoverImageSet = TRUE;
    return TRUE;
}
663 | |||
/**
 * Process external references from a HTML to the book. The chapter itself is not stored.
 * The HTML is scanned for <link..., <style..., <img and <source tags.
 * Embedded CSS styles and links will also be processed.
 * Script tags are not processed, as scripting should be avoided in e-books.
 *
 * EPub keeps track of added files, and duplicate files referenced across multiple
 * chapters are only added once.
 *
 * If $doc is a string, it is assumed to be the content of an HTML file and is
 * replaced (by reference) with the re-serialized XHTML; otherwise it is assumed
 * to be a DOMDocument and is modified in place.
 *
 * Basedir is the root dir the HTML is supposed to "live" in, used to resolve
 * relative references such as <code><img src="../images/image.png"/></code>
 *
 * $externalReferences determines how the function will handle external references.
 *
 * @param mixed  &$doc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterExternalReferences(&$doc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    // One "../" per directory component of $htmlDir.
    $backPath = preg_replace('#[^/]+/#i', "../", $htmlDir);
    $isDocAString = is_string($doc);

    if ($isDocAString) {
        $xmlDoc = new DOMDocument();
        // Real-world HTML is rarely well-formed; parser warnings are suppressed on purpose.
        @$xmlDoc->loadHTML($doc);
    } else {
        $xmlDoc = $doc;
    }

    $this->processChapterStyles($xmlDoc, $externalReferences, $baseDir, $htmlDir);
    $this->processChapterLinks($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);
    $this->processChapterImages($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);
    $this->processChapterSources($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);

    if ($isDocAString) {
        $htmlElement = $xmlDoc->getElementsByTagName("html")->item(0);
        $headElement = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyElement = $xmlDoc->getElementsByTagName("body")->item(0);

        // Carry over every attribute of the source <html> element except the
        // default namespace, which the skeleton below declares itself.
        $htmlNS = "";
        if ($htmlElement !== NULL) {
            foreach ($htmlElement->attributes as $attribute) {
                if ($attribute->nodeName != "xmlns") {
                    // Values are escaped so that quotes or ampersands cannot break the skeleton.
                    $htmlNS .= " " . $attribute->nodeName . "=\""
                        . htmlspecialchars($attribute->nodeValue, ENT_QUOTES, "UTF-8") . "\"";
                }
            }
        }

        $xml = new DOMDocument('1.0', "utf-8");
        $xml->preserveWhiteSpace = FALSE;
        $xml->formatOutput = TRUE;

        $xml2Doc = new DOMDocument('1.0', "utf-8");
        $xml2Doc->loadXML("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\"$htmlNS>\n</html>\n");
        $html = $xml2Doc->getElementsByTagName("html")->item(0);

        // HTML fragments may parse without a <head> (or, for empty input,
        // without a <body>); only import the parts that actually exist.
        if ($headElement !== NULL) {
            $html->appendChild($xml2Doc->importNode($headElement, TRUE));
        }
        if ($bodyElement !== NULL) {
            $html->appendChild($xml2Doc->importNode($bodyElement, TRUE));
        }

        // force pretty printing and correct formatting, should not be needed, but it is.
        $xml->loadXML($xml2Doc->saveXML());
        $doc = $xml->saveXML();

        if (!$this->isEPubVersion2()) {
            // Non-EPUB2 output is emitted without the XHTML 1.1 DOCTYPE.
            $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
        }
    }
    return TRUE;
}
748 | |||
/**
 * Process images referenced from a CSS file and add them to the book.
 *
 * Every url(...) occurrence in the CSS is either stripped or rewritten to point
 * at the image's location inside the archive, depending on $externalReferences.
 *
 * @param string &$cssFile (referenced) CSS text, updated in place.
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $cssDir The path of the CSS file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processCSSExternalReferences(&$cssFile, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $cssDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    $backPath = preg_replace('#[^/]+/#i', "../", $cssDir);
    $urlMatches = null;
    preg_match_all('#url\s*\([\'\"\s]*(.+?)[\'\"\s]*\)#im', $cssFile, $urlMatches, PREG_SET_ORDER);

    $stripImages = ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES
        || $externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES);

    foreach ($urlMatches as $urlMatch) {
        if ($stripImages) {
            $cssFile = str_replace($urlMatch[0], "", $cssFile);
            continue;
        }

        $source = $urlMatch[1];
        $internalSrc = pathinfo($source, PATHINFO_BASENAME);
        $internalPath = "";
        $isSourceExternal = FALSE;

        $resolved = $this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $cssDir, $backPath);
        if ($resolved) {
            $cssFile = str_replace($urlMatch[0], "url('" . $backPath . $internalPath . "')", $cssFile);
        } else if ($isSourceExternal) {
            // External image is missing
            $cssFile = str_replace($urlMatch[0], "", $cssFile);
        }
        // A local image that is missing is left alone; assume it's been generated.
    }
    return TRUE;
}
792 | |||
/**
 * Process style tags in a DOMDocument. The content of each style element is
 * handled like a CSS file and written back into the element.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterStyles(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    // process inlined CSS styles in style tags.
    foreach ($xmlDoc->getElementsByTagName("style") as $styleElement) {
        // Strip CDATA wrappers (and the comment markers around them) first.
        $css = preg_replace('#[/\*\s]*\<\!\[CDATA\[[\s\*/]*#im', "", $styleElement->nodeValue);
        $css = preg_replace('#[/\*\s]*\]\]\>[\s\*/]*#im', "", $css);

        $this->processCSSExternalReferences($css, $externalReferences, $baseDir, $htmlDir);
        $styleElement->nodeValue = "\n" . trim($css) . "\n";
    }
    return TRUE;
}
821 | |||
/**
 * Process link tags in a DOMDocument. Linked files will be loaded into the archive, and the link href will be rewritten to point to that location.
 * Link types text/css will be passed as CSS files.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterLinks(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }
    // process link tags.
    $links = $xmlDoc->getElementsByTagName("link");
    $linkCount = $links->length;
    for ($linkIdx = 0; $linkIdx < $linkCount; $linkIdx++) {
        $link = $links->item($linkIdx);

        $hrefNode = $link->attributes->getNamedItem("href");
        if ($hrefNode === NULL) {
            // A link element without href has nothing to load.
            continue;
        }
        $source = $hrefNode->nodeValue;
        $sourceData = NULL;

        $internalSrc = pathinfo($source, PATHINFO_BASENAME);

        if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
            $urlinfo = parse_url($source);
            $urlPath = isset($urlinfo['path']) ? $urlinfo['path'] : "";

            // Keep the part of the URL path below $baseDir as the internal name.
            $basePos = strpos($urlPath, $baseDir . "/");
            if ($basePos !== FALSE) {
                $internalSrc = substr($urlPath, $basePos + strlen($baseDir) + 1);
            }

            // getFileContents() is a method of this class, not a global function.
            $sourceData = @$this->getFileContents($source);
        } else if (strpos($source, "/") === 0) {
            $sourceData = @file_get_contents($this->docRoot . $source);
        } else {
            $sourceData = @file_get_contents($this->docRoot . $baseDir . "/" . $source);
        }

        if (!empty($sourceData)) {
            if (!array_key_exists($internalSrc, $this->fileList)) {
                $typeNode = $link->attributes->getNamedItem("type");
                $mime = ($typeNode !== NULL) ? $typeNode->nodeValue : "";
                if (empty($mime)) {
                    $mime = "text/plain";
                }
                if ($mime == "text/css") {
                    $this->processCSSExternalReferences($sourceData, $externalReferences, $baseDir, $htmlDir);
                    // References were resolved just above, so addCSSFile must not process them again.
                    $this->addCSSFile($internalSrc, $internalSrc, $sourceData, EPub::EXTERNAL_REF_IGNORE, $baseDir);
                    $link->setAttribute("href", $backPath . $internalSrc);
                } else {
                    $this->addFile($internalSrc, $internalSrc, $sourceData, $mime);
                }
                $this->fileList[$internalSrc] = $source;
            } else {
                // Already in the archive: only the reference needs rewriting.
                $link->setAttribute("href", $backPath . $internalSrc);
            }
        } // else do nothing, if the link is local, and missing, assume it's been generated.
    }
    return TRUE;
}
884 | |||
/**
 * Process img tags in a DOMDocument.
 * $externalReferences will determine what will happen to these images, and the img src will be rewritten accordingly.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterImages(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }
    // process img tags.
    // Removals/replacements are collected first and applied after the loop,
    // so the node list is not modified while it is being indexed.
    $postProcDomElements = array();
    $images = $xmlDoc->getElementsByTagName("img");
    $itemCount = $images->length;

    for ($idx = 0; $idx < $itemCount; $idx++) {
        $img = $images->item($idx);

        if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) {
            $postProcDomElements[] = $img;
        } else if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) {
            $altNode = $img->attributes->getNamedItem("alt");
            $alt = "image";
            if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) {
                $alt = $altNode->nodeValue;
            }
            $postProcDomElements[] = array($img, $this->createDomFragment($xmlDoc, "<em>[" . $alt . "]</em>"));
        } else {
            $srcNode = $img->attributes->getNamedItem("src");
            if ($srcNode === NULL) {
                // An img without src references nothing; leave it untouched.
                continue;
            }
            $source = $srcNode->nodeValue;

            $parsedSource = parse_url($source);
            $sourcePath = isset($parsedSource['path']) ? $parsedSource['path'] : "";
            $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($sourcePath, PATHINFO_BASENAME)));
            $internalPath = "";
            $isSourceExternal = FALSE;

            if ($this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) {
                $img->setAttribute("src", $backPath . $internalPath);
            } else if ($isSourceExternal) {
                $postProcDomElements[] = $img; // External image is missing
            } // else do nothing, if the image is local, and missing, assume it's been generated.
        }
    }

    foreach ($postProcDomElements as $target) {
        if (is_array($target)) {
            // array(original element, replacement fragment)
            $target[0]->parentNode->replaceChild($target[1], $target[0]);
        } else {
            $target->parentNode->removeChild($target);
        }
    }
    return TRUE;
}
943 | |||
/**
 * Process source tags in a DOMDocument.
 * $externalReferences will determine what will happen to the referenced media, and the source src will be rewritten accordingly.
 * For books that are not EPUB3, all source elements are removed.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterSources(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    if ($this->bookVersion !== EPub::BOOK_VERSION_EPUB3) {
        // ePub 2 does not support multimedia formats, and they must be removed.
        $externalReferences = EPub::EXTERNAL_REF_REMOVE_IMAGES;
    }

    // Removals/replacements are collected first and applied after the loop,
    // so the node list is not modified while it is being indexed.
    $postProcDomElements = array();
    $sources = $xmlDoc->getElementsByTagName("source");
    $itemCount = $sources->length;
    for ($idx = 0; $idx < $itemCount; $idx++) {
        $sourceElement = $sources->item($idx);
        if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) {
            $postProcDomElements[] = $sourceElement;
        } else if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) {
            $altNode = $sourceElement->attributes->getNamedItem("alt");
            $alt = "image";
            if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) {
                $alt = $altNode->nodeValue;
            }
            $postProcDomElements[] = array($sourceElement, $this->createDomFragment($xmlDoc, "[" . $alt . "]"));
        } else {
            $srcNode = $sourceElement->attributes->getNamedItem("src");
            if ($srcNode === NULL) {
                // A source element without src references nothing; leave it untouched.
                continue;
            }
            $source = $srcNode->nodeValue;

            $parsedSource = parse_url($source);
            $sourcePath = isset($parsedSource['path']) ? $parsedSource['path'] : "";
            $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($sourcePath, PATHINFO_BASENAME)));
            $internalPath = "";
            $isSourceExternal = FALSE;

            if ($this->resolveMedia($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) {
                $sourceElement->setAttribute("src", $backPath . $internalPath);
            } else if ($isSourceExternal) {
                $postProcDomElements[] = $sourceElement; // External media is missing
            } // else do nothing, if the media is local, and missing, assume it's been generated.
        }
    }

    // Apply the collected changes; without this step nothing is ever removed
    // or replaced, contrary to the ePub 2 requirement noted above.
    foreach ($postProcDomElements as $target) {
        if (is_array($target)) {
            // array(original element, replacement fragment)
            $target[0]->parentNode->replaceChild($target[1], $target[0]);
        } else {
            $target->parentNode->removeChild($target);
        }
    }
    return TRUE;
}
996 | |||
/**
 * Resolve an image src, determine its target location and add it to the book.
 *
 * Three kinds of source are recognised: full http(s)/ftp(s) URLs, paths
 * starting with "/", and paths relative to $baseDir.
 *
 * @param string $source Image Source link.
 * @param string &$internalPath (referenced) Return value, will be set to the target path and name in the book.
 * @param string &$internalSrc (referenced) Return value, will be set to the target name in the book.
 * @param string &$isSourceExternal (referenced) Return value, will be set to TRUE if the image originated from a full URL.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool TRUE if the image was loaded (and added, unless already present); FALSE if the book is finalized or the image could not be loaded.
 */
protected function resolveImage($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $imageData = NULL;

    if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
        $urlinfo = parse_url($source);

        // Keep the part of the URL path below $baseDir as the internal name.
        if (strpos($urlinfo['path'], $baseDir."/") !== FALSE) {
            $internalSrc = $this->sanitizeFileName(urldecode(substr($urlinfo['path'], strpos($urlinfo['path'], $baseDir."/") + strlen($baseDir) + 1)));
        }
        // External images are filed under scheme/host/dir to avoid name clashes between sites.
        $internalPath = $urlinfo["scheme"] . "/" . $urlinfo["host"] . "/" . pathinfo($urlinfo["path"], PATHINFO_DIRNAME);
        $isSourceExternal = TRUE;
        $imageData = $this->getImage($source);
    } else if (strpos($source, "/") === 0) {
        $internalPath = pathinfo($source, PATHINFO_DIRNAME);

        // Try the path as given first, then relative to the document root.
        $path = $source;
        if (!file_exists($path)) {
            $path = $this->docRoot . $path;
        }

        $imageData = $this->getImage($path);
    } else {
        $internalPath = $htmlDir . "/" . preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME));

        $path = $baseDir . "/" . $source;
        if (!file_exists($path)) {
            $path = $this->docRoot . $path;
        }

        $imageData = $this->getImage($path);
    }
    if ($imageData !== FALSE) {
        $iSrcInfo = pathinfo($internalSrc);
        // pathinfo() omits 'extension' for names without one.
        $currentExt = isset($iSrcInfo['extension']) ? $iSrcInfo['extension'] : NULL;
        if (!empty($imageData['ext']) && $imageData['ext'] != $currentExt) {
            // The loaded image's type differs from the name; use the real extension.
            $internalSrc = $iSrcInfo['filename'] . "." . $imageData['ext'];
        }
        $internalPath = Zip::getRelativePath("images/" . $internalPath . "/" . $internalSrc);
        if (!array_key_exists($internalPath, $this->fileList)) {
            $this->addFile($internalPath, "i_" . $internalSrc, $imageData['image'], $imageData['mime']);
            $this->fileList[$internalPath] = $source;
        }
        return TRUE;
    }
    return FALSE;
}
1057 | |||
/**
 * Resolve a media src, determine its target location and add it to the book.
 *
 * Three kinds of source are recognised: full http(s)/ftp(s) URLs, paths
 * starting with "/", and paths relative to $baseDir.
 *
 * @param string $source Source link.
 * @param string &$internalPath (referenced) Return value, will be set to the target path and name in the book.
 * @param string &$internalSrc (referenced) Return value, will be set to the target name in the book.
 * @param string &$isSourceExternal (referenced) Return value, will be set to TRUE if the media originated from a full URL.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if the book is finalized or a remote source could not be fetched; TRUE otherwise.
 */
protected function resolveMedia($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $mediaPath = NULL;
    // Set only when a remote source was fetched, so it can be deleted afterwards.
    $tmpFile = NULL;

    if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
        $urlinfo = parse_url($source);

        // Keep the part of the URL path below $baseDir as the internal name,
        // decoded and sanitized the same way resolveImage() does it.
        if (strpos($urlinfo['path'], $baseDir."/") !== FALSE) {
            $internalSrc = $this->sanitizeFileName(urldecode(substr($urlinfo['path'], strpos($urlinfo['path'], $baseDir."/") + strlen($baseDir) + 1)));
        }
        // External media is filed under scheme/host/dir to avoid name clashes between sites.
        $internalPath = $urlinfo["scheme"] . "/" . $urlinfo["host"] . "/" . pathinfo($urlinfo["path"], PATHINFO_DIRNAME);
        $isSourceExternal = TRUE;
        // NOTE(review): the result is later passed to addLargeFile() and unlink(),
        // so with the second argument set it is presumably a temp file path — confirm.
        $mediaPath = $this->getFileContents($source, true);
        $tmpFile = $mediaPath;
    } else if (strpos($source, "/") === 0) {
        $internalPath = pathinfo($source, PATHINFO_DIRNAME);

        // Try the path as given first, then relative to the document root.
        $mediaPath = $source;
        if (!file_exists($mediaPath)) {
            $mediaPath = $this->docRoot . $mediaPath;
        }
    } else {
        $internalPath = $htmlDir . "/" . preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME));

        $mediaPath = $baseDir . "/" . $source;
        if (!file_exists($mediaPath)) {
            $mediaPath = $this->docRoot . $mediaPath;
        }
    }

    if ($mediaPath !== FALSE) {
        $mime = $this->getMime($source);
        $internalPath = Zip::getRelativePath("media/" . $internalPath . "/" . $internalSrc);

        if (!array_key_exists($internalPath, $this->fileList) &&
                $this->addLargeFile($internalPath, "m_" . $internalSrc, $mediaPath, $mime)) {
            $this->fileList[$internalPath] = $source;
        }
        if ($tmpFile !== NULL) {
            unlink($tmpFile);
        }
        return TRUE;
    }
    return FALSE;
}
/**
 * Report how many chapters the book currently holds.
 *
 * @access public
 * @return int Number of chapters.
 */
public function getChapterCount() {
    $count = $this->chapterCount;
    return $count;
}
1127 | |||
/**
 * Set the book title (mandatory).
 *
 * Used for the dc:title metadata parameter in the OPF file as well as the
 * DocTitle attribute in the NCX file.
 *
 * @param string $title
 * @access public
 * @return bool TRUE if the title was stored, FALSE if the book is already finalized.
 */
public function setTitle($title) {
    if (!$this->isFinalized) {
        $this->title = $title;
        return TRUE;
    }
    return FALSE;
}
1144 | |||
/**
 * Fetch the book title.
 *
 * @access public
 * @return string The stored title.
 */
public function getTitle() {
    $bookTitle = $this->title;
    return $bookTitle;
}
1154 | |||
/**
 * Set the book language (mandatory: finalize() refuses to run without one).
 *
 * Only codes that are exactly two characters long (as measured by
 * mb_strlen), such as "en", "da" or "fr", are accepted; anything else is
 * rejected and the stored language is left untouched.
 *
 * finalize() passes the language to the OPF initialisation.
 *
 * @param string $language two-character language code
 * @access public
 * @return bool TRUE when stored, FALSE when finalized or the code is not two characters
 */
function setLanguage($language) {
    $accepted = !$this->isFinalized && mb_strlen($language) === 2;
    if ($accepted) {
        $this->language = $language;
    }
    return $accepted;
}
1174 | |||
/**
 * Read back the book language code.
 *
 * @access public
 * @return mixed the value last stored by setLanguage()
 */
function getLanguage() {
    $language = $this->language;
    return $language;
}
1184 | |||
/**
 * Set the unique book identifier (mandatory; finalize() generates a UUID
 * when none has been supplied).
 *
 * Use the URI, or the ISBN if available. Best practice is a string that
 * conforms to a formal identification system.
 *
 * finalize() hands the identifier and its type to the OPF initialisation
 * and uses the identifier as the NCX uid.
 *
 * The type must be identical (===) to one of:
 *   EPub::IDENTIFIER_URI
 *   EPub::IDENTIFIER_ISBN
 *   EPub::IDENTIFIER_UUID
 *
 * @param string $identifier
 * @param string $identifierType one of the EPub::IDENTIFIER_* constants
 * @access public
 * @return bool TRUE when stored, FALSE when finalized or the type is not recognised
 */
function setIdentifier($identifier, $identifierType) {
    $allowedTypes = array(EPub::IDENTIFIER_URI, EPub::IDENTIFIER_ISBN, EPub::IDENTIFIER_UUID);

    // Strict membership test: equivalent to comparing with !== against each constant.
    if ($this->isFinalized || !in_array($identifierType, $allowedTypes, TRUE)) {
        return FALSE;
    }

    $this->identifierType = $identifierType;
    $this->identifier = $identifier;
    return TRUE;
}
1215 | |||
/**
 * Read back the book identifier.
 *
 * @access public
 * @return mixed the identifier currently stored on the book
 */
function getIdentifier() {
    $identifier = $this->identifier;
    return $identifier;
}
1225 | |||
/**
 * Read back the type of the book identifier.
 *
 * @access public
 * @return mixed the identifier type currently stored on the book
 */
function getIdentifierType() {
    $identifierType = $this->identifierType;
    return $identifierType;
}
1235 | |||
/**
 * Set the book description (optional).
 *
 * An account of the resource: for example an abstract, a table of
 * contents, or a free-text summary.
 *
 * When non-empty, finalize() emits it as the dc:description
 * (DublinCore::DESCRIPTION) metadata entry of the OPF file.
 *
 * @param string $description
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setDescription($description) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->description = $description;
    }
    return $isOpen;
}
1258 | |||
/**
 * Read back the book description.
 *
 * @access public
 * @return mixed the value last stored by setDescription()
 */
function getDescription() {
    $description = $this->description;
    return $description;
}
1268 | |||
/**
 * Set the book author or creator (optional).
 *
 * $author is the display form, usually "Firstnames Lastname";
 * $authorSortKey is how the name should be sorted, usually
 * "Lastname, First names".
 *
 * A creator may be a person, an organization, or a service.
 *
 * When an author is set, finalize() registers it (with the sort key) as an
 * OPF creator and as the NCX document author.
 *
 * @param string $author
 * @param string $authorSortKey
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setAuthor($author, $authorSortKey) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->author = $author;
        $this->authorSortKey = $authorSortKey;
    }
    return $isOpen;
}
1297 | |||
/**
 * Read back the book author.
 *
 * @access public
 * @return mixed the author name last stored by setAuthor()
 */
function getAuthor() {
    $author = $this->author;
    return $author;
}
1307 | |||
/**
 * Set the publisher information (optional).
 *
 * The publisher is the entity responsible for making the resource
 * available: a person, an organization, or a service.
 *
 * finalize() emits a non-empty name as dc:publisher and a non-empty URL as
 * dc:relation in the OPF file.
 *
 * @param string $publisherName
 * @param string $publisherURL
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setPublisher($publisherName, $publisherURL) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->publisherName = $publisherName;
        $this->publisherURL = $publisherURL;
    }
    return $isOpen;
}
1331 | |||
/**
 * Read back the publisher name.
 *
 * @access public
 * @return mixed the name last stored by setPublisher()
 */
function getPublisherName() {
    $publisherName = $this->publisherName;
    return $publisherName;
}
1341 | |||
/**
 * Read back the publisher URL.
 *
 * @access public
 * @return mixed the URL currently stored in the publisherURL property
 */
function getPublisherURL() {
    $publisherURL = $this->publisherURL;
    return $publisherURL;
}
1351 | |||
/**
 * Set the release date (optional).
 *
 * If the date is left at 0, finalize() substitutes the time of
 * finalization. finalize() formats the timestamp with gmdate() and emits
 * it as the dc:date metadata entry of the OPF file.
 *
 * The timestamp is stored both on the book and on the OPF object.
 *
 * @param long $timestamp Unix timestamp
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setDate($timestamp) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->date = $timestamp;
        $this->opf->date = $timestamp;
    }
    return $isOpen;
}
1377 | |||
/**
 * Read back the book date.
 *
 * @access public
 * @return mixed the timestamp currently stored in the date property
 */
function getDate() {
    $date = $this->date;
    return $date;
}
1387 | |||
/**
 * Set the book (copy)rights statement (optional).
 *
 * Typically a statement about the property rights, including intellectual
 * property rights, held in and over the resource.
 *
 * When non-empty, finalize() emits it as the dc:rights metadata entry of
 * the OPF file.
 *
 * @param string $rightsText
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setRights($rightsText) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->rights = $rightsText;
    }
    return $isOpen;
}
1410 | |||
/**
 * Read back the book rights statement.
 *
 * @access public
 * @return mixed the value last stored by setRights()
 */
function getRights() {
    $rights = $this->rights;
    return $rights;
}
1420 | |||
/**
 * Add a book subject.
 *
 * The subject is the topic of the resource, typically expressed as
 * keywords, key phrases, or classification codes. Use the Coverage element
 * for the spatial or temporal topic.
 *
 * Unlike most setters, the value is not stored on the book: it is
 * entity-decoded and added straight to the OPF as a DublinCore::SUBJECT
 * entry, so every call adds another entry. Nothing happens once the book
 * is finalized.
 *
 * @param string $subject
 */
function setSubject($subject) {
    if (!$this->isFinalized) {
        $decodedSubject = $this->decodeHtmlEntities($subject);
        $this->opf->addDCMeta(DublinCore::SUBJECT, $decodedSubject);
    }
}
1439 | |||
/**
 * Set the book source URL (optional).
 *
 * The source is a related resource from which this book is derived, in
 * whole or in part. Best practice is a string that conforms to a formal
 * identification system.
 *
 * When non-empty, finalize() emits it as the dc:source metadata entry of
 * the OPF file.
 *
 * @param string $sourceURL
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setSourceURL($sourceURL) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->sourceURL = $sourceURL;
    }
    return $isOpen;
}
1462 | |||
/**
 * Read back the book source URL.
 *
 * @access public
 * @return mixed the URL currently stored in the sourceURL property
 */
function getSourceURL() {
    $sourceURL = $this->sourceURL;
    return $sourceURL;
}
1472 | |||
/**
 * Set the book coverage (optional).
 *
 * Coverage is the spatial or temporal topic of the resource, its spatial
 * applicability, or the jurisdiction under which it is relevant. A spatial
 * topic may be a named place or geographic coordinates; a temporal topic
 * may be a named period, a date, or a date range. Best practice is a
 * controlled vocabulary such as the Thesaurus of Geographic Names [TGN],
 * preferring named places or periods over numeric identifiers.
 *
 * When non-empty, finalize() emits it as the dc:coverage metadata entry of
 * the OPF file.
 *
 * @param string $coverage
 * @access public
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setCoverage($coverage) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->coverage = $coverage;
    }
    return $isOpen;
}
1503 | |||
/**
 * Read back the book coverage.
 *
 * @access public
 * @return mixed the value last stored by setCoverage()
 */
function getCoverage() {
    $coverage = $this->coverage;
    return $coverage;
}
1513 | |||
/**
 * Set the book relation: a related resource.
 *
 * Best practice is to identify the related resource with a string that
 * conforms to a formal identification system.
 *
 * When non-empty, finalize() emits it as a dc:relation metadata entry of
 * the OPF file. The call is silently ignored once the book is finalized.
 *
 * @param string $relation
 */
function setRelation($relation) {
    if (!$this->isFinalized) {
        $this->relation = $relation;
    }
}
1530 | |||
/**
 * Read back the book relation.
 *
 * @return string the value last stored by setRelation()
 */
function getRelation() {
    $relation = $this->relation;
    return $relation;
}
1539 | |||
/**
 * Set the book generator string.
 *
 * The generator is not visible from within the book; it acts as a kind of
 * electronic watermark. When non-empty, finalize() writes it as a
 * "generator" meta entry in the OPF and as "dtb:generator" in the NCX.
 * The call is silently ignored once the book is finalized.
 *
 * @param string $generator
 */
function setGenerator($generator) {
    if (!$this->isFinalized) {
        $this->generator = $generator;
    }
}
1554 | |||
/**
 * Read back the book generator.
 *
 * @return string The generator identity string last stored by setGenerator().
 */
function getGenerator() {
    $generator = $this->generator;
    return $generator;
}
1563 | |||
/**
 * Switch the ePub date format to the short yyyy-mm-dd form, to work around
 * a bug in EpubCheck prior to its version 1.1.
 *
 * The latest version of ePubCheck can be obtained here:
 * http://code.google.com/p/epubcheck/
 *
 * The active format is replaced with the class's dateformatShort value;
 * finalize() uses the active format when it writes the publication date.
 *
 * @access public
 * @return bool TRUE when switched, FALSE when the book is already finalized
 */
function setShortDateFormat() {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->dateformat = $this->dateformatShort;
    }
    return $isOpen;
}
1581 | |||
/**
 * Retired API: calling it terminates the script with a deprecation message.
 *
 * @Deprecated
 * @param bool $ignoreEmptyBuffer unused
 */
function setIgnoreEmptyBuffer($ignoreEmptyBuffer = TRUE) {
    // exit() with a string prints it and ends the script, exactly like die().
    exit("Function was deprecated, functionality is no longer needed.");
}
1588 | |||
/**
 * Set the title, id and class used for the references (landmarks) section.
 *
 * String arguments are trimmed and written to the NCX object. A non-string
 * argument is replaced by a fallback: "Guide" for the title and
 * "references" for both the id and the class.
 *
 * @param string $referencesTitle
 * @param string $referencesId
 * @param string $referencesClass
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setReferencesTitle($referencesTitle = "Guide", $referencesId = "", $referencesClass = "references") {
    if ($this->isFinalized) {
        return FALSE;
    }

    $title = "Guide";
    if (is_string($referencesTitle)) {
        $title = trim($referencesTitle);
    }

    $id = "references";
    if (is_string($referencesId)) {
        $id = trim($referencesId);
    }

    $class = "references";
    if (is_string($referencesClass)) {
        $class = trim($referencesClass);
    }

    $this->ncx->referencesTitle = $title;
    $this->ncx->referencesId = $id;
    $this->ncx->referencesClass = $class;
    return TRUE;
}
1606 | |||
/**
 * Choose whether finalize() should call finalizeReferences() on the NCX.
 *
 * Only the boolean TRUE enables the flag; every other value (including
 * truthy non-booleans such as 1 or "yes") disables it.
 *
 * @param bool $isReferencesAddedToToc
 * @return bool TRUE when stored, FALSE when the book is already finalized
 */
function setisReferencesAddedToToc($isReferencesAddedToToc = TRUE) {
    $isOpen = !$this->isFinalized;
    if ($isOpen) {
        $this->isReferencesAddedToToc = ($isReferencesAddedToToc === TRUE);
    }
    return $isOpen;
}
1619 | |||
/**
 * Report whether the book has been finalized (and is therefore locked).
 *
 * @access public
 * @return bool the current value of the isFinalized property
 */
function isFinalized() {
    $finalized = $this->isFinalized;
    return $finalized;
}
1629 | |||
/**
 * Request a Table of Contents page. This is not strictly necessary, as most
 * eReaders will build it from the navigation structure in the .ncx file.
 *
 * This method only records the settings and registers the TOC with the OPF
 * (and NCX); the page itself is generated during finalize().
 *
 * @param string $cssFileName Include a link to this css file in the TOC html.
 * @param string $tocCSSClass The TOC is a <div>, if you need special formatting, you can add a css class for that div. Default is "toc".
 * @param string $title       Title of the Table of contents. Default is "Table of Contents". Use this for e.g. languages other than English.
 * @param bool   $addReferences Include reference pages in the TOC, using the $referencesOrder array to determine the order of the pages in the TOC. Default is TRUE.
 * @param bool   $addToIndex  Add the TOC to the NCX index at the current level/position. Default is FALSE.
 * @param string $tocFileName Change the default name of the TOC file. The default is "TOC.xhtml".
 * @return bool TRUE when the TOC was registered, FALSE when the book is already finalized
 */
function buildTOC($cssFileName = NULL, $tocCSSClass = "toc", $title = "Table of Contents", $addReferences = TRUE, $addToIndex = FALSE, $tocFileName = "TOC.xhtml") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $this->buildTOC = TRUE;
    $this->tocTitle = $title;
    $this->tocFileName = $this->normalizeFileName($tocFileName);
    if (!empty($cssFileName)) {
        // Property names are case-sensitive: finalizeTOC() reads
        // "tocCssFileName", so that exact spelling must be written here or
        // the stylesheet link is never emitted.
        $this->tocCssFileName = $this->normalizeFileName($cssFileName);
    }
    $this->tocCSSClass = $tocCSSClass;
    $this->tocAddReferences = $addReferences;

    $this->opf->addItemRef("ref_" . Reference::TABLE_OF_CONTENTS, FALSE);
    $this->opf->addReference(Reference::TABLE_OF_CONTENTS, $title, $this->tocFileName);

    if ($addToIndex) {
        $navPoint = new NavPoint($this->decodeHtmlEntities($title), $this->tocFileName, "ref_" . Reference::TABLE_OF_CONTENTS);
        $this->ncx->addNavPoint($navPoint);
    } else {
        $this->ncx->referencesList[Reference::TABLE_OF_CONTENTS] = $this->tocFileName;
        $this->ncx->referencesName[Reference::TABLE_OF_CONTENTS] = $title;
    }

    // Report success explicitly so callers can tell it apart from the
    // finalized (FALSE) case.
    return TRUE;
}
1664 | |||
/**
 * Generate the Table of Contents page requested through buildTOC() and add
 * it to the book as a reference page.
 *
 * The page lists, in $referencesOrder order, the chapters (for the "text"
 * entry) and, when references were requested, the known reference pages.
 *
 * @return bool|null FALSE when no TOC was requested; nothing otherwise
 */
private function finalizeTOC() {
    if (!$this->buildTOC) {
        return FALSE;
    }

    if (empty($this->tocTitle)) {
        $this->tocTitle = "Table of Contents";
    }

    $tocData = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";

    if ($this->isEPubVersion2()) {
        $tocData .= "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n"
            . " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"
            . "<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n";
    } else {
        $tocData .= "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            . "<head>\n<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n";
    }

    if (!empty($this->tocCssFileName)) {
        $tocData .= "<link rel=\"stylesheet\" type=\"text/css\" href=\"" . $this->tocCssFileName . "\" />\n";
    }

    $tocData .= "<title>" . $this->tocTitle . "</title>\n"
        . "</head>\n"
        . "<body>\n"
        . "<h3>" . $this->tocTitle . "</h3>\n<div";

    if (!empty($this->tocCSSClass)) {
        $tocData .= " class=\"" . $this->tocCSSClass . "\"";
    }
    $tocData .= ">\n";

    // One indent step per nesting level, written as numeric character
    // references so the whitespace neither collapses nor needs a DTD.
    // NOTE(review): confirm this matches the original indent literal.
    $indentUnit = "&#160;&#160;&#160;&#160;&#160;&#160;";

    // foreach always walks the whole array from its first element. The
    // previous each() loops depended on the arrays' internal pointers
    // (finalize() has already advanced the chapter list past its first
    // entry), and each() no longer exists in PHP 8.
    foreach ($this->referencesOrder as $item => $descriptive) {
        if ($item === "text") {
            foreach ($this->ncx->chapterList as $chapterName => $navPoint) {
                $fileName = $navPoint->getContentSrc();
                // Clamp at zero: str_repeat() rejects a negative count.
                $level = max(0, $navPoint->getLevel() - 2);
                $tocData .= "\t<p>" . str_repeat($indentUnit, $level) . "<a href=\"" . $this->sanitizeFileName($fileName) . "\">" . $chapterName . "</a></p>\n";
            }
        } else if ($this->tocAddReferences === TRUE) {
            if (array_key_exists($item, $this->ncx->referencesList)) {
                $tocData .= "\t<p><a href=\"" . $this->ncx->referencesList[$item] . "\">" . $descriptive . "</a></p>\n";
            } else if ($item === "toc") {
                // Link to the configured TOC file instead of a fixed
                // "TOC.xhtml", which is wrong when buildTOC() received a
                // custom $tocFileName.
                $tocData .= "\t<p><a href=\"" . $this->tocFileName . "\">" . $this->tocTitle . "</a></p>\n";
            } else if ($item === "cover" && $this->isCoverImageSet) {
                $tocData .= "\t<p><a href=\"CoverPage.xhtml\">" . $descriptive . "</a></p>\n";
            }
        }
    }
    $tocData .= "</div>\n</body>\n</html>\n";

    $this->addReferencePage($this->tocTitle, $this->tocFileName, $tocData, Reference::TABLE_OF_CONTENTS);
}
1722 | |||
/**
 * Tell whether this book targets ePub version 2.
 *
 * @return bool TRUE when bookVersion is identical to EPub::BOOK_VERSION_EPUB2
 */
function isEPubVersion2() {
    $isVersion2 = (EPub::BOOK_VERSION_EPUB2 === $this->bookVersion);
    return $isVersion2;
}
1729 | |||
/**
 * Produce the ePub 3 table of contents through the NCX object.
 *
 * Copies the references order to the NCX, sets the NCX document title from
 * the entity-decoded book title, then returns whatever
 * Ncx::finalizeEPub3() produces.
 *
 * @param string $cssFileName
 * @param string $title
 * @return string
 */
function buildEPub3TOC($cssFileName = NULL, $title = "Table of Contents") {
    $ncx = $this->ncx;

    $ncx->referencesOrder = $this->referencesOrder;

    $docTitle = $this->decodeHtmlEntities($this->title);
    $ncx->setDocTitle($docTitle);

    $tocMarkup = $ncx->finalizeEPub3($title, $cssFileName);
    return $tocMarkup;
}
1740 | |||
/**
 * Add an ePub 3 navigation document to the book.
 *
 * The file is written to the archive below the book root and registered in
 * the OPF manifest with id "toc" and the "nav" property. Nothing is added
 * for ePub 2 books, finalized books, or when the file name is already
 * present in the file list.
 *
 * @param string $fileName
 * @param string $tocData
 * @return bool TRUE when the file was added, FALSE otherwise
 */
function addEPub3TOC($fileName, $tocData) {
    if ($this->isEPubVersion2() || $this->isFinalized || array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);

    // The file list is keyed by the normalized name, so the duplicate check
    // must also be made after normalization; otherwise "./toc.xhtml" would
    // slip past a registered "toc.xhtml".
    if (array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    $this->zip->addFile($tocData, $this->bookRoot . $fileName);

    $this->fileList[$fileName] = $fileName;
    $this->opf->addItem("toc", $fileName, "application/xhtml+xml", "nav");
    return TRUE;
}
1759 | |||
/**
 * Check for mandatory parameters and finalize the e-book.
 * Once finalized, the book is locked for further additions.
 *
 * Finalization fails when the book is already finalized, has no chapters,
 * or lacks a title or language. Missing optional values get defaults: a
 * generated UUID identifier, the current time as date, the current page
 * URL as source URL and the current server URL as publisher URL.
 *
 * The OPF and NCX documents are then completed, written to the archive as
 * "book.opf" and "book.ncx" below the book root, and released.
 *
 * @return bool $success
 */
function finalize() {
    if ($this->isFinalized || $this->chapterCount == 0 || empty($this->title) || empty($this->language)) {
        return FALSE;
    }

    if (empty($this->identifier) || empty($this->identifierType)) {
        $this->setIdentifier($this->createUUID(4), EPub::IDENTIFIER_UUID);
    }

    if ($this->date == 0) {
        $this->date = time();
    }

    if (empty($this->sourceURL)) {
        $this->sourceURL = $this->getCurrentPageURL();
    }

    // Default the publisher URL itself. Assigning to sourceURL here would
    // overwrite the source URL (even one supplied by the caller) and leave
    // the publisher URL empty.
    if (empty($this->publisherURL)) {
        $this->publisherURL = $this->getCurrentServerURL();
    }

    // Generate OPF data:
    $this->opf->setIdent("BookId");
    $this->opf->initialize($this->title, $this->language, $this->identifier, $this->identifierType);

    $DCdate = new DublinCore(DublinCore::DATE, gmdate($this->dateformat, $this->date));
    $DCdate->addOpfAttr("event", "publication");
    $this->opf->metadata->addDublinCore($DCdate);

    if (!empty($this->description)) {
        $this->opf->addDCMeta(DublinCore::DESCRIPTION, $this->decodeHtmlEntities($this->description));
    }

    if (!empty($this->publisherName)) {
        $this->opf->addDCMeta(DublinCore::PUBLISHER, $this->decodeHtmlEntities($this->publisherName));
    }

    if (!empty($this->publisherURL)) {
        $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->publisherURL));
    }

    if (!empty($this->author)) {
        $author = $this->decodeHtmlEntities($this->author);
        $this->opf->addCreator($author, $this->decodeHtmlEntities($this->authorSortKey), MarcCode::AUTHOR);
        $this->ncx->setDocAuthor($author);
    }

    if (!empty($this->rights)) {
        $this->opf->addDCMeta(DublinCore::RIGHTS, $this->decodeHtmlEntities($this->rights));
    }

    if (!empty($this->coverage)) {
        $this->opf->addDCMeta(DublinCore::COVERAGE, $this->decodeHtmlEntities($this->coverage));
    }

    if (!empty($this->sourceURL)) {
        $this->opf->addDCMeta(DublinCore::SOURCE, $this->sourceURL);
    }

    if (!empty($this->relation)) {
        $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->relation));
    }

    if ($this->isCoverImageSet) {
        $this->opf->addMeta("cover", "coverImage");
    }

    if (!empty($this->generator)) {
        $gen = $this->decodeHtmlEntities($this->generator);
        $this->opf->addMeta("generator", $gen);
        $this->ncx->addMetaEntry("dtb:generator", $gen);
    }

    if ($this->EPubMark) {
        $this->opf->addMeta("generator", "EPub (Version " . self::VERSION . ") by A. Grandt, http://www.phpclasses.org/package/6115");
    }

    // Look at the first chapter without each(): each() was removed in
    // PHP 8, and it also moved the internal pointer past the first chapter,
    // hiding that chapter from later pointer-based loops.
    $firstChapterNavPoint = reset($this->ncx->chapterList);
    if ($firstChapterNavPoint !== FALSE) {
        $firstChapterName = key($this->ncx->chapterList);
        $firstChapterFileName = $firstChapterNavPoint->getContentSrc();
        $this->opf->addReference(Reference::TEXT, $this->decodeHtmlEntities($firstChapterName), $firstChapterFileName);
    }

    $this->ncx->setUid($this->identifier);

    $this->ncx->setDocTitle($this->decodeHtmlEntities($this->title));

    $this->ncx->referencesOrder = $this->referencesOrder;
    if ($this->isReferencesAddedToToc) {
        $this->ncx->finalizeReferences();
    }

    $this->finalizeTOC();

    if (!$this->isEPubVersion2()) {
        $this->addEPub3TOC("epub3toc.xhtml", $this->buildEPub3TOC());
    }

    $documents = array(
        "book.opf" => $this->fixEncoding($this->opf->finalize()),
        "book.ncx" => $this->fixEncoding($this->ncx->finalize()),
    );

    // Same handling for both documents: convert only when strict detection
    // does not confirm UTF-8, then add to the archive (OPF first, then NCX).
    foreach ($documents as $documentName => $documentData) {
        if (mb_detect_encoding($documentData, 'UTF-8', true) !== "UTF-8") {
            $documentData = mb_convert_encoding($documentData, "UTF-8");
        }
        $this->zip->addFile($documentData, $this->bookRoot . $documentName);
    }

    $this->opf = NULL;
    $this->ncx = NULL;

    $this->isFinalized = TRUE;
    return TRUE;
}
1884 | |||
/**
 * Ensure the given string is a valid UTF-8 string.
 *
 * A string that already is valid UTF-8 (which includes pure ASCII) is
 * returned unchanged. Anything else is treated as ISO-8859-1 and converted
 * to UTF-8, which is what the deprecated utf8_encode() did.
 *
 * Note, that a mb_detect_encoding on the returned string will still return
 * ASCII if the entire string is comprised of characters in the 1-127 range.
 *
 * @link: http://snippetdb.com/php/convert-string-to-utf-8-for-mysql
 * @param string $in_str
 * @return string converted string.
 */
function fixEncoding($in_str) {
    // mb_check_encoding() is a strict validity test and, unlike
    // mb_detect_encoding(), does not depend on the configured detect order.
    if (mb_check_encoding($in_str, "UTF-8")) {
        return $in_str;
    }
    return mb_convert_encoding($in_str, "UTF-8", "ISO-8859-1");
}
1901 | |||
/**
 * Return the book's zip data, finalizing the book first if that has not
 * happened yet.
 *
 * The result of that finalize() call is not checked.
 *
 * @return string with the book in binary form.
 */
function getBook() {
    // Short-circuit: finalize() only runs while the book is still open.
    $this->isFinalized || $this->finalize();

    $zipData = $this->zip->getZipData();
    return $zipData;
}
1914 | |||
/**
 * Strip disallowed characters from a string to get a nearly safe filename.
 *
 * Three steps: remove every entry of the forbiddenCharacters property,
 * collapse each run of whitespace and/or hyphens into a single "-", then
 * trim ".", "-" and "_" from both ends.
 *
 * @param string $fileName
 * @return mixed|string
 */
function sanitizeFileName($fileName) {
    $withoutForbidden = str_replace($this->forbiddenCharacters, '', $fileName);
    return trim(preg_replace('/[\s-]+/', '-', $withoutForbidden), '.-_');
}
1927 | |||
/**
 * Clean up a file path and remove leading "." and "/" characters.
 *
 * Paths assembled from several fragments can look like
 * "../data/html/../images/image.jpeg", which ePub files do not handle
 * well. The path is first passed through Zip::getRelativePath() and then
 * stripped of any leading dots and slashes; per the original documentation
 * the example above becomes "data/images/image.jpeg".
 *
 * @param string $fileName
 * @return string normalized filename
 */
function normalizeFileName($fileName) {
    $relativePath = Zip::getRelativePath($fileName);
    $normalized = preg_replace('#^[/\.]+#i', "", $relativePath);
    return $normalized;
}
1942 | |||
/**
 * Save the ePub file to local disk.
 *
 * The file name is sanitized, and ".epub" is appended unless the name
 * already ends with it (case-insensitively). The book is finalized first
 * if that has not happened yet.
 *
 * @param string $fileName
 * @param string $baseDir If empty baseDir is absolute to server path, if omitted it's relative to script path
 * @return The saved file name if successful, FALSE if the file could not be opened or fully written.
 */
function saveBook($fileName, $baseDir = '.') {

    // Make fileName safe
    $fileName = $this->sanitizeFileName($fileName);

    // Finalize book, if it's not done already
    if (!$this->isFinalized) {
        $this->finalize();
    }

    if (stripos(strrev($fileName), "bupe.") !== 0) {
        $fileName .= ".epub";
    }

    // Try to open file access; binary mode, since the payload is a zip archive
    $fh = fopen($baseDir . '/' . $fileName, "wb");
    if (!$fh) {
        return FALSE;
    }

    $bookData = $this->getBook();
    $written = fwrite($fh, $bookData);
    $closed = fclose($fh);

    // Report success only when every byte reached the file and it closed
    // cleanly; a short or failed write used to be reported as success.
    if ($written === FALSE || $written !== strlen($bookData) || !$closed) {
        return FALSE;
    }

    return $fileName;
}
1978 | |||
/**
 * Return the archive size of the book, finalizing the book first if that
 * has not happened yet.
 *
 * @return string the value reported by the zip object's getArchiveSize()
 */
function getBookSize() {
    // Short-circuit: finalize() only runs while the book is still open.
    $this->isFinalized || $this->finalize();

    $archiveSize = $this->zip->getArchiveSize();
    return $archiveSize;
}
1991 | |||
/**
 * Send the book as a zip download.
 *
 * The book is finalized first if that has not happened yet. ".epub" is
 * appended to the name unless it already ends with it
 * (case-insensitively). The transfer is delegated to the zip object's
 * sendZip() with the "application/epub+zip" content type.
 *
 * @param string $fileName The name of the book without the .epub at the end.
 * @return The sent file name if successful, FALSE if sendZip() did not return TRUE.
 */
function sendBook($fileName) {
    $this->isFinalized || $this->finalize();

    // Reversing the name turns the "ends with .epub" test into a prefix test.
    $hasEpubExtension = (stripos(strrev($fileName), "bupe.") === 0);
    if (!$hasEpubExtension) {
        $fileName .= ".epub";
    }

    $sent = $this->zip->sendZip($fileName, "application/epub+zip");
    return ($sent === TRUE) ? $fileName : FALSE;
}
2016 | |||
/**
 * Generate a UUID.
 *
 * The default version (4) generates a random UUID; version 3 generates a URL based UUID.
 *
 * Added for convenience.
 *
 * @param int    $bookVersion UUID version to retrieve, see lib.uuid.manual.html for details.
 * @param string $url         Passed to UUID::mint() together with the UUID::nsURL namespace.
 * @return mixed The value returned by UUID::mint() (documented upstream as the formatted uuid).
 */
function createUUID($bookVersion = 4, $url = NULL) {
    // The UUID library is loaded lazily, only when a UUID is actually requested.
    include_once("lib.uuid.php");

    $uuid = UUID::mint($bookVersion, $url, UUID::nsURL);

    return $uuid;
}
2032 | |||
/**
 * Get the URL of the current page.
 * Example use: Default Source URL
 *
 * The result is getCurrentServerURL() joined with the REQUEST_URI server
 * variable using exactly one '/' between them.
 *
 * @return string Page URL.
 */
function getCurrentPageURL() {
    $requestUri = (string)filter_input(INPUT_SERVER, "REQUEST_URI");

    // getCurrentServerURL() ends with '/' and REQUEST_URI normally starts with one;
    // drop one of them so the result is "host/path" rather than "host//path".
    if (strpos($requestUri, '/') === 0) {
        $requestUri = (string)substr($requestUri, 1);
    }

    return rtrim($this->getCurrentServerURL(), '/') . '/' . $requestUri;
}
2043 | |||
/**
 * Get the URL of the server.
 * Example use: Default Publisher URL
 *
 * Built from the HTTPS, SERVER_NAME and SERVER_PORT server variables. The port
 * is only included when it is known and differs from the scheme's default
 * (80 for http, 443 for https).
 *
 * @return string Server URL, always ending with '/'.
 */
function getCurrentServerURL() {
    $https = filter_input(INPUT_SERVER, "HTTPS");
    $port = filter_input(INPUT_SERVER, "SERVER_PORT");

    // PHP documents HTTPS as "non-empty when queried through HTTPS"; IIS reports "off" otherwise.
    $isSecure = is_string($https) && $https !== "" && strtolower($https) !== "off";

    $serverURL = ($isSecure ? "https" : "http") . "://" . filter_input(INPUT_SERVER, "SERVER_NAME");

    $defaultPort = $isSecure ? "443" : "80";
    $hasPort = ($port !== NULL && $port !== FALSE && (string)$port !== "");
    if ($hasPort && (string)$port !== $defaultPort) {
        $serverURL .= ":" . $port;
    }

    return $serverURL . '/';
}
2064 | |||
/**
 * Try to determine the mimetype of the file path from its extension.
 *
 * The extension is looked up in $this->mimetypes, first as written and then
 * lower-cased.
 *
 * @param string $source Path
 * @return string|bool mimetype, or FALSE when the extension is not in the table.
 */
function getMime($source) {
    $ext = pathinfo($source, PATHINFO_EXTENSION);

    if (isset($this->mimetypes[$ext])) {
        return $this->mimetypes[$ext];
    }

    $lower = strtolower($ext);
    if (isset($this->mimetypes[$lower])) {
        return $this->mimetypes[$lower];
    }

    // Unknown extension: documented result is FALSE (previously an undefined-index notice and NULL).
    return FALSE;
}
2074 | |||
/**
 * Get an image from a file or url, return it resized if the image exceeds the $maxImageWidth or $maxImageHeight directives.
 *
 * The return value is an array.
 * ['width'] is the width of the image (scaled by the resize ratio).
 * ['height'] is the height of the image (scaled by the resize ratio).
 * ['mime'] is the mime type of the image. Re-encoded images keep image/png when they were PNG, everything else becomes image/jpeg.
 * ['image'] is the image data.
 * ['ext'] is the extension of the image file ("" when the mime type is not jpeg, gif or png).
 *
 * @param string $source path or url to file.
 * @return array|bool The array described above, or FALSE when the data could not be loaded or decoded.
 */
function getImage($source) {
    $mime = "application/octet-stream";
    $ext = "";

    $image = $this->getFileContents($source);
    if ($image === FALSE || strlen($image) == 0) {
        return FALSE;
    }

    // The dimensions come from GD; without its decoder nothing usable can be returned.
    if (!function_exists('imagecreatefromstring')) {
        return FALSE;
    }

    // Decode once and reuse the handle for measuring and for resampling.
    $imageFile = imagecreatefromstring($image);
    if ($imageFile === FALSE) {
        return FALSE;
    }
    $width = imagesx($imageFile);
    $height = imagesy($imageFile);

    // Mime detection, from most to least reliable: exif, magic bytes, file extension.
    if ($this->isExifInstalled) {
        $type = @exif_imagetype($source);
        if ($type !== FALSE) {
            $mime = image_type_to_mime_type($type);
        }
    }
    if ($mime === "application/octet-stream") {
        $mime = $this->image_file_type_from_binary($image);
    }
    if ($mime === "application/octet-stream") {
        $mime = $this->getMimeTypeFromUrl($source);
    }

    if ($width <= 0 || $height <= 0) {
        imagedestroy($imageFile);
        return FALSE;
    }

    $ratio = 1;

    if ($this->isGdInstalled) {
        if ($width > $this->maxImageWidth) {
            $ratio = $this->maxImageWidth/$width;
        }
        if ($height*$ratio > $this->maxImageHeight) {
            $ratio = $this->maxImageHeight/$height;
        }

        if ($ratio < 1 || empty($mime) || ($this->isGifImagesEnabled !== FALSE && $mime == "image/gif")) {
            // GD wants integer pixel sizes; never let a dimension collapse to 0.
            $newWidth = max(1, (int)($width*$ratio));
            $newHeight = max(1, (int)($height*$ratio));

            $image_p = imagecreatetruecolor($newWidth, $newHeight);

            if ($mime == "image/png") {
                // Keep the alpha channel of PNG images.
                imagealphablending($image_p, false);
                imagesavealpha($image_p, true);
                imagealphablending($imageFile, true);

                imagecopyresampled($image_p, $imageFile, 0, 0, 0, 0, $newWidth, $newHeight, $width, $height);
                ob_start();
                imagepng($image_p, NULL, 9);
                $image = ob_get_contents();
                ob_end_clean();

                $ext = "png";
            } else {
                imagecopyresampled($image_p, $imageFile, 0, 0, 0, 0, $newWidth, $newHeight, $width, $height);
                ob_start();
                imagejpeg($image_p, NULL, 80);
                $image = ob_get_contents();
                ob_end_clean();

                $mime = "image/jpeg";
                $ext = "jpg";
            }
            imagedestroy($image_p);
        }
    }

    imagedestroy($imageFile);

    if ($ext === "") {
        static $mimeToExt = array (
            'image/jpeg' => 'jpg',
            'image/gif' => 'gif',
            'image/png' => 'png'
        );

        if (isset($mimeToExt[$mime])) {
            $ext = $mimeToExt[$mime];
        }
    }

    $rv = array();
    $rv['width'] = $width*$ratio;
    $rv['height'] = $height*$ratio;
    $rv['mime'] = $mime;
    $rv['image'] = $image;
    $rv['ext'] = $ext;

    return $rv;
}
2184 | |||
/**
 * Get file contents, using curl if available, else file_get_contents.
 *
 * http(s)/ftp(s) sources are fetched with cURL when it is installed; a fetch
 * only counts as successful when the reported response code is 200 and the
 * result is non-empty. Everything else goes through file_get_contents(),
 * where $toTempFile is not honoured and the data itself is returned.
 *
 * @param string $source     Path or URL to read.
 * @param bool   $toTempFile cURL path only: download into a temporary file and return its name.
 * @return string|bool The contents (or the temp file name when $toTempFile is used with cURL), FALSE on failure.
 */
function getFileContents($source, $toTempFile = FALSE) {
    $isExternal = preg_match('#^(http|ftp)s?://#i', $source) == 1;

    if ($isExternal && $this->isCurlInstalled) {
        $outFile = NULL;
        $fp = NULL;

        if ($toTempFile) {
            $outFile = tempnam(sys_get_temp_dir(), "EPub_v" . EPub::VERSION . "_");
            $fp = ($outFile !== FALSE) ? fopen($outFile, "w+b") : FALSE;
            if ($fp === FALSE) {
                // Could not prepare the target file; remove whatever tempnam() created.
                if ($outFile !== FALSE && is_file($outFile)) {
                    unlink($outFile);
                }
                return FALSE;
            }
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_URL, str_replace(" ","%20",$source));
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_BUFFERSIZE, 4096);
        if ($toTempFile) {
            // Set after CURLOPT_RETURNTRANSFER so the body is written to the file.
            curl_setopt($ch, CURLOPT_FILE, $fp);
        }

        $res = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);

        if ($toTempFile) {
            fclose($fp);
        }

        $succeeded = (is_array($info) && $info['http_code'] == 200 && $res != false);

        if ($toTempFile) {
            if ($succeeded) {
                return $outFile;
            }
            // Failed download: do not leave the temporary file behind.
            if (is_file($outFile)) {
                unlink($outFile);
            }
            return FALSE;
        }

        return $succeeded ? $res : FALSE;
    }

    if ($this->isFileGetContentsInstalled && (!$isExternal || $this->isFileGetContentsExtInstalled)) {
        // Best effort by design: warnings are suppressed and FALSE is returned on failure.
        @$data = file_get_contents($source);
        return $data;
    }
    return FALSE;
}
2239 | |||
/**
 * Determine an image mime type from the leading bytes of the image data.
 *
 * By fireweasel found on http://stackoverflow.com/questions/2207095/get-image-mimetype-from-resource-in-php-gd
 *
 * @staticvar array $mimeByGroup Maps the capture group number of the signature pattern to a mime type.
 * @param string $binary Raw image data.
 * @return string The detected mime type, or 'application/octet-stream' when no signature matches.
 */
function image_file_type_from_binary($binary) {
    static $mimeByGroup = array (
        1 => 'image/jpeg',
        2 => 'image/gif',
        3 => 'image/png',
        4 => 'image/x-windows-bmp',
        5 => 'image/tiff',
        6 => 'image/x-ilbm',
    );

    $signatures = '/\A(?:(\xff\xd8\xff)|(GIF8[79]a)|(\x89PNG\x0d\x0a)|(BM)|(\x49\x49(?:\x2a\x00|\x00\x4a))|(FORM.{4}ILBM))/';
    $groups = array();

    if (preg_match($signatures, $binary, $groups) !== 1) {
        return 'application/octet-stream';
    }

    // preg_match drops trailing unmatched groups, so the last key is the group that matched.
    end($groups);
    return $mimeByGroup[key($groups)];
}
2265 | |||
/**
 * Guess a mime type from the file extension found in a URL.
 *
 * Everything from the last "?" onwards is ignored, then the text after the
 * last "." is lower-cased and handed to getMimeTypeFromExtension().
 *
 * @param string $source URL Source
 * @return string MimeType, "application/octet-stream" when there is no "." to take an extension from.
 */
function getMimeTypeFromUrl($source) {
    $path = $source;

    // Cut off the query part (measured from the last "?").
    $queryStart = strrpos($path, "?");
    if ($queryStart !== FALSE) {
        $path = substr($path, 0, $queryStart);
    }

    $dot = strrpos($path, ".");
    if ($dot === FALSE) {
        return "application/octet-stream";
    }

    $ext = strtolower(substr($path, $dot + 1));
    return $this->getMimeTypeFromExtension($ext);
}
2289 | |||
/**
 * Map an image file extension to its mime type.
 *
 * The lookup is case-insensitive and uses exact string keys.
 *
 * @param string $ext Extension, without the leading dot.
 * @return string MimeType, "application/octet-stream" for unknown extensions.
 */
function getMimeTypeFromExtension($ext) {
    static $mimeByExtension = array (
        'jpg' => 'image/jpeg',
        'jpe' => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'gif' => 'image/gif',
        'png' => 'image/png',
        'bmp' => 'image/x-windows-bmp',
        'tif' => 'image/tiff',
        'tiff' => 'image/tiff',
        'cpt' => 'image/tiff',
        'lbm' => 'image/x-ilbm',
        'ilbm' => 'image/x-ilbm',
    );

    if (!is_scalar($ext)) {
        return "application/octet-stream";
    }

    $key = strtolower((string)$ext);

    return isset($mimeByExtension[$key]) ? $mimeByExtension[$key] : "application/octet-stream";
}
2317 | |||
/**
 * Encode html code to use html entities, safeguarding it from potential character encoding problems.
 *
 * The translation is a plain strtr() over the $html_encoding_characters table.
 * An additional regexp based ampersand escaping step (from the PHP manual
 * discussion on htmlentities, by user "busbyjon",
 * http://www.php.net/manual/en/function.htmlentities.php#90111) is disabled.
 *
 * @param string $string string to encode.
 * @return string The translated string.
 */
public function encodeHtml($string) {
    $encoded = strtr($string, $this->html_encoding_characters);

    return $encoded;
}
2334 | |||
/**
 * Helper that wraps the given markup in a DOM document fragment.
 *
 * @author Adam Schmalhofer
 *
 * @param DOMDocument $dom    Document the fragment is created from.
 * @param string      $markup XML markup appended to the fragment.
 * @return DOMNode The fragment; the result of appendXML() is not checked.
 */
protected function createDomFragment($dom, $markup) {
    $fragment = $dom->createDocumentFragment();
    $fragment->appendXML($markup);

    return $fragment;
}
2349 | |||
/**
 * Retrieve the list of files currently added to the book.
 *
 * Each key is the file name used in the book, each value is the original
 * file name (the same as the key for most entries).
 *
 * @return array file list
 */
function getFileList() {
    $files = $this->fileList;

    return $files;
}
2360 | |||
/**
 * Removed helper: calling it terminates the script with a message.
 *
 * @deprecated Use Zip::getRelativePath($relPath) instead.
 * @param string $relPath Ignored.
 */
function relPath($relPath) {
    exit("Function was deprecated, use Zip::getRelativePath(\$relPath); instead");
}
2367 | |||
/**
 * Set default chapter target size.
 * Default is 250000 bytes, and minimum is 10240 bytes.
 *
 * The value is cast to int before the minimum is applied, so non-numeric
 * input ends up at the minimum instead of bypassing it.
 *
 * @param int $size segment size in bytes
 * @return void
 */
function setSplitSize($size) {
    // Making the file smaller than 10k is not a good idea.
    $this->splitDefaultSize = max(10240, (int)$size);
}
2381 | |||
/**
 * Get the chapter target size.
 *
 * @return int The value stored by setSplitSize(), in bytes.
 */
function getSplitSize() {
    $size = $this->splitDefaultSize;

    return $size;
}
2390 | |||
/**
 * Remove all non essential html tags and entities.
 *
 * Line breaks and closing p/div tags become newlines, all remaining tags are
 * removed, named entities are translated through the global $htmlEntities
 * table, and finally the XML special characters &, < and > are escaped while
 * numeric character references (&#...;) are left intact.
 *
 * @global array $htmlEntities Translation table applied with strtr().
 * @param string $string
 * @return string The stripped text with &, < and > escaped.
 */
function decodeHtmlEntities($string) {
    global $htmlEntities;

    $string = preg_replace('~\s*<br\s*/*\s*>\s*~i', "\n", $string);
    $string = preg_replace('~\s*</(p|div)\s*>\s*~i', "\n\n", $string);
    $string = preg_replace('~<[^>]*>~', '', $string);

    $string = strtr($string, $htmlEntities);

    // Escape every ampersand, then undo the double escaping of ampersands that were already escaped.
    $string = str_replace('&', '&amp;', $string);
    $string = str_replace('&amp;amp;', '&amp;', $string);
    // Numeric character references must keep their bare ampersand.
    $string = preg_replace('~&amp;(#x*[a-fA-F0-9]+;)~', '&\1', $string);
    $string = str_replace('<', '&lt;', $string);
    $string = str_replace('>', '&gt;', $string);

    return $string;
}
2415 | |||
/**
 * Strip everything that looks like an HTML tag, brute force and no finesse.
 *
 * Any "<", followed by anything up to the next ">", is removed.
 *
 * @param string $string html
 * @return string
 */
function html2text($string) {
    $tagPattern = '~<[^>]*>~';
    $text = preg_replace($tagPattern, '', $string);

    return $text;
}
2425 | |||
/**
 * Fetch the text collected by the book's log object.
 *
 * @return string
 */
function getLog() {
    $logger = $this->log;

    return $logger->getLog();
}
2432 | } | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php new file mode 100644 index 00000000..1d44f238 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php | |||
@@ -0,0 +1,201 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts. | ||
4 | * What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts. | ||
5 | * Split size is considered max target size. The actual size is the result of an even split across the resulting files. | ||
6 | * | ||
7 | * @author A. Grandt <php@grandt.com> | ||
8 | * @copyright 2009-2014 A. Grandt | ||
9 | * @license GNU LGPL 2.1 | ||
10 | * @link http://www.phpclasses.org/package/6115 | ||
11 | * @link https://github.com/Grandt/PHPePub | ||
12 | * @version 3.20 | ||
13 | */ | ||
class EPubChapterSplitter {
    const VERSION = 3.20;

    private $splitDefaultSize = 250000;
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Set the ePub version the split parts are produced for.
     *
     * @param string $bookVersion Version identifier, compared against EPub::BOOK_VERSION_EPUB3 when
     *                            the parts are written. The value is trimmed; anything that is not
     *                            a string falls back to EPub::BOOK_VERSION_EPUB2.
     */
    function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * The value is cast to int before the minimum is applied, so non-numeric
     * input ends up at the minimum instead of bypassing it.
     *
     * @param int $size segment size in bytes
     * @return void
     */
    function setSplitSize($size) {
        // Making the file smaller than 10k is not a good idea.
        $this->splitDefaultSize = max(10240, (int)$size);
    }

    /**
     * Get the chapter target size.
     *
     * @return int size in bytes
     */
    function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PCRE pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, the matching will be for lines in the HTML starting with the string given.
     *
     * Without a search string a chapter that is not larger than the split size is returned unchanged.
     *
     * @param String $chapter XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
     * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts. Keys are part indexes; when splitting on a search string a part
     *               that starts with a matching node is keyed by that node's trimmed markup instead.
     */
    function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $chapterData = array();
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        @$xmlDoc->loadHTML($chapter);

        $head = $xmlDoc->getElementsByTagName("head");
        $body = $xmlDoc->getElementsByTagName("body");

        // Skeleton for every part: the original <html ...> start tag, closed immediately.
        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = stripos($chapter, ">", $htmlPos);
        $newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
        if (strpos(trim($newXML), "<?xml ") === FALSE) {
            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
        }
        $headerLength = strlen($newXML);

        $files = array();
        $chapterNames = array(); // file index => markup of the node that started that file
        $domDepth = 0;
        $domPath = array();
        $domClonedPath = array();

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $curParent = $curFile;
        $curSize = 0;

        $bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
        $headLen = strlen($xmlDoc->saveXML($head->item(0))) + $headerLength;

        $partSize = $this->splitDefaultSize - $headLen;

        if ($bodyLen > $partSize) {
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $node = $body->item(0)->firstChild;

        // A body without children leaves $node null; the loop is then skipped entirely.
        while ($node != null) {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            // Node too large for one part: remember it as an ancestor and continue with its first child.
            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $domDepth++;

                $node = $node->firstChild;
            }

            $node2 = $node->nextSibling;

            if ($node != null && $node->nodeName != "#text") {
                $doSplit = false;
                if ($splitOnSearchString) {
                    $doSplit = preg_match($searchString, $nodeData) == 1;
                }

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;
                    if ($domDepth > 0) {
                        // Rebuild the chain of open ancestor elements inside the new part.
                        foreach ($domClonedPath as $ancestor) {
                            $newParent = $ancestor->cloneNode(false);
                            $curParent->appendChild($newParent);
                            $curParent = $newParent;
                        }
                    }
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }

                if ($doSplit) {
                    // Record the name against the file this node goes into, so names and
                    // files stay aligned even when content precedes the first match.
                    $chapterNames[count($files) - 1] = trim($nodeData);
                }

                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $node2;
            // End of a sibling list: climb back up until an ancestor has a next sibling.
            while ($node == null && $domDepth > 0) {
                $domDepth--;
                $node = end($domPath)->nextSibling;
                array_pop($domPath);
                array_pop($domClonedPath);
                $curParent = $curParent->parentNode;
            }
        }

        $xml = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
        $xml->lookupPrefix("http://www.w3.org/1999/xhtml");
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        for ($idx = 0; $idx < count($files); $idx++) {
            $xml2Doc = new DOMDocument('1.0', $xmlDoc->xmlEncoding);
            $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);
            $html->appendChild($xml2Doc->importNode($head->item(0), true));
            $body = $xml2Doc->createElement("body");
            $html->appendChild($body);
            $body->appendChild($xml2Doc->importNode($files[$idx], true));

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());

            $doc = $xml->saveXML();

            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
            }

            // Parts without a recorded name fall back to their numeric index.
            $key = ($splitOnSearchString && isset($chapterNames[$idx])) ? $chapterNames[$idx] : $idx;
            $chapterData[$key] = $doc;
        }

        return $chapterData;
    }
}
201 | ?> | ||
diff --git a/inc/3rdparty/libraries/PHPePub/Logger.php b/inc/3rdparty/libraries/PHPePub/Logger.php new file mode 100644 index 00000000..314019cb --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/Logger.php | |||
@@ -0,0 +1,92 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Simple log line aggregator. | ||
4 | * | ||
5 | * @author A. Grandt <php@grandt.com> | ||
6 | * @copyright 2012-2013 A. Grandt | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 1.00 | ||
9 | */ | ||
class Logger {
    const VERSION = 1.00;

    private $log = "";
    private $tStart;
    private $tLast;
    private $name = NULL;
    private $isLogging = FALSE;
    private $isDebugging = FALSE;

    /**
     * Create the logger and, when logging is enabled, start the log immediately.
     *
     * @param string $name      Optional label; when given it prefixes every line as "name : ".
     * @param bool   $isLogging Nothing is recorded unless this is truthy.
     */
    function __construct($name = NULL, $isLogging = FALSE) {
        $this->name = ($name === NULL) ? "" : $name . " : ";
        $this->isLogging = $isLogging;
        $this->start();
    }

    /**
     * Class destructor, releases the collected log text.
     *
     * @return void
     */
    function __destruct() {
        unset($this->log);
    }

    /**
     * Reset the timers, write the log header and record a first "Start" line.
     * Does nothing when logging is disabled.
     */
    function start() {
        if (!$this->isLogging) {
            return;
        }

        $now = gettimeofday();
        $this->tStart = $now;
        $this->tLast = $now;
        $this->log = "<h1>Log: " . $this->name . "</h1>\n<pre>Started: " . gmdate("D, d M Y H:i:s T", $now['sec']) . "\n Δ Start ; Δ Last ;";
        $this->logLine("Start");
    }

    /**
     * Log which of the optional PHP facilities (curl, gd, exif, file_get_contents,
     * allow_url_fopen) are available. Does nothing when logging is disabled.
     */
    function dumpInstalledModules() {
        if (!$this->isLogging) {
            return;
        }

        $hasFileGetContents = function_exists('file_get_contents');

        // Label => availability, in the order the lines are written.
        $report = array(
            "isCurlInstalled...............: " => extension_loaded('curl') && function_exists('curl_version'),
            "isGdInstalled.................: " => extension_loaded('gd') && function_exists('gd_info'),
            "isExifInstalled...............: " => extension_loaded('exif') && function_exists('exif_imagetype'),
            "isFileGetContentsInstalled....: " => $hasFileGetContents,
            "isFileGetContentsExtInstalled.: " => $hasFileGetContents && ini_get('allow_url_fopen'),
        );

        foreach ($report as $label => $available) {
            $this->logLine($label . ($available ? "Yes" : "No"));
        }
    }

    /**
     * Append one line, prefixed with the seconds elapsed since start() and
     * since the previous line. Does nothing when logging is disabled.
     *
     * @param string $line
     */
    function logLine($line) {
        if (!$this->isLogging) {
            return;
        }

        $now = gettimeofday();
        $current = self::toSeconds($now);
        $sinceStart = $current - self::toSeconds($this->tStart);
        $sinceLast = $current - self::toSeconds($this->tLast);

        $logline = sprintf("\n+%08.04f; +%08.04f; ", $sinceStart, $sinceLast) . $this->name . $line;
        $this->log .= $logline;
        $this->tLast = $now;

        if ($this->isDebugging) {
            echo "<pre>" . $logline . "\n</pre>\n";
        }
    }

    /**
     * @return string Everything logged so far.
     */
    function getLog() {
        return $this->log;
    }

    /**
     * Convert a gettimeofday() array to seconds, with the microseconds cut to four decimals.
     *
     * @param array $time Array with 'sec' and 'usec' entries.
     * @return float
     */
    private static function toSeconds($time) {
        return $time['sec'] + (((int)($time['usec']/100))/10000);
    }
}
92 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/PHPePub/Zip.php b/inc/3rdparty/libraries/PHPePub/Zip.php new file mode 100644 index 00000000..01e03566 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/Zip.php | |||
@@ -0,0 +1,818 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Class to create and manage a Zip file. | ||
4 | * | ||
5 | * Initially inspired by CreateZipFile by Rochak Chauhan www.rochakchauhan.com (http://www.phpclasses.org/browse/package/2322.html) | ||
6 | * and | ||
7 | * http://www.pkware.com/documents/casestudies/APPNOTE.TXT Zip file specification. | ||
8 | * | ||
9 | * License: GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
10 | * | ||
11 | * @author A. Grandt <php@grandt.com> | ||
12 | * @copyright 2009-2014 A. Grandt | ||
13 | * @license GNU LGPL 2.1 | ||
14 | * @link http://www.phpclasses.org/package/6110 | ||
15 | * @link https://github.com/Grandt/PHPZip | ||
16 | * @version 1.60 | ||
17 | */ | ||
18 | class Zip { | ||
19 | const VERSION = 1.60; | ||
20 | |||
21 | const ZIP_LOCAL_FILE_HEADER = "\x50\x4b\x03\x04"; // Local file header signature | ||
22 | const ZIP_CENTRAL_FILE_HEADER = "\x50\x4b\x01\x02"; // Central file header signature | ||
23 | const ZIP_END_OF_CENTRAL_DIRECTORY = "\x50\x4b\x05\x06\x00\x00\x00\x00"; //end of Central directory record | ||
24 | |||
25 | const EXT_FILE_ATTR_DIR = 010173200020; // Permission 755 drwxr-xr-x = (((S_IFDIR | 0755) << 16) | S_DOS_D); | ||
26 | const EXT_FILE_ATTR_FILE = 020151000040; // Permission 644 -rw-r--r-- = (((S_IFREG | 0644) << 16) | S_DOS_A); | ||
27 | |||
28 | const ATTR_VERSION_TO_EXTRACT = "\x14\x00"; // Version needed to extract | ||
29 | const ATTR_MADE_BY_VERSION = "\x1E\x03"; // Made By Version | ||
30 | |||
31 | // Unix file types | ||
32 | const S_IFIFO = 0010000; // named pipe (fifo) | ||
33 | const S_IFCHR = 0020000; // character special | ||
34 | const S_IFDIR = 0040000; // directory | ||
35 | const S_IFBLK = 0060000; // block special | ||
36 | const S_IFREG = 0100000; // regular | ||
37 | const S_IFLNK = 0120000; // symbolic link | ||
38 | const S_IFSOCK = 0140000; // socket | ||
39 | |||
40 | // setuid/setgid/sticky bits, the same as for chmod: | ||
41 | |||
42 | const S_ISUID = 0004000; // set user id on execution | ||
43 | const S_ISGID = 0002000; // set group id on execution | ||
44 | const S_ISTXT = 0001000; // sticky bit | ||
45 | |||
46 | // And of course, the other 12 bits are for the permissions, the same as for chmod: | ||
47 | // When addding these up, you can also just write the permissions as a simgle octal number | ||
48 | // ie. 0755. The leading 0 specifies octal notation. | ||
49 | const S_IRWXU = 0000700; // RWX mask for owner | ||
50 | const S_IRUSR = 0000400; // R for owner | ||
51 | const S_IWUSR = 0000200; // W for owner | ||
52 | const S_IXUSR = 0000100; // X for owner | ||
53 | const S_IRWXG = 0000070; // RWX mask for group | ||
54 | const S_IRGRP = 0000040; // R for group | ||
55 | const S_IWGRP = 0000020; // W for group | ||
56 | const S_IXGRP = 0000010; // X for group | ||
57 | const S_IRWXO = 0000007; // RWX mask for other | ||
58 | const S_IROTH = 0000004; // R for other | ||
59 | const S_IWOTH = 0000002; // W for other | ||
60 | const S_IXOTH = 0000001; // X for other | ||
61 | const S_ISVTX = 0001000; // save swapped text even after use | ||
62 | |||
63 | // Filetype, sticky and permissions are added up, and shifted 16 bits left BEFORE adding the DOS flags. | ||
64 | |||
65 | // DOS file type flags, we really only use the S_DOS_D flag. | ||
66 | |||
67 | const S_DOS_A = 0000040; // DOS flag for Archive | ||
68 | const S_DOS_D = 0000020; // DOS flag for Directory | ||
69 | const S_DOS_V = 0000010; // DOS flag for Volume | ||
70 | const S_DOS_S = 0000004; // DOS flag for System | ||
71 | const S_DOS_H = 0000002; // DOS flag for Hidden | ||
72 | const S_DOS_R = 0000001; // DOS flag for Read Only | ||
73 | |||
74 | private $zipMemoryThreshold = 1048576; // Autocreate tempfile if the zip data exceeds 1048576 bytes (1 MB) | ||
75 | |||
76 | private $zipData = NULL; | ||
77 | private $zipFile = NULL; | ||
78 | private $zipComment = NULL; | ||
79 | private $cdRec = array(); // central directory | ||
80 | private $offset = 0; | ||
81 | private $isFinalized = FALSE; | ||
82 | private $addExtraField = TRUE; | ||
83 | |||
84 | private $streamChunkSize = 65536; | ||
85 | private $streamFilePath = NULL; | ||
86 | private $streamTimestamp = NULL; | ||
87 | private $streamFileComment = NULL; | ||
88 | private $streamFile = NULL; | ||
89 | private $streamData = NULL; | ||
90 | private $streamFileLength = 0; | ||
91 | private $streamExtFileAttr = null; | ||
92 | |||
93 | /** | ||
94 | * Constructor. | ||
95 | * | ||
96 | * @param boolean $useZipFile Write temp zip data to tempFile? Default FALSE | ||
97 | */ | ||
98 | function __construct($useZipFile = FALSE) { | ||
99 | if ($useZipFile) { | ||
100 | $this->zipFile = tmpfile(); | ||
101 | } else { | ||
102 | $this->zipData = ""; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | function __destruct() { | ||
107 | if (is_resource($this->zipFile)) { | ||
108 | fclose($this->zipFile); | ||
109 | } | ||
110 | $this->zipData = NULL; | ||
111 | } | ||
112 | |||
113 | /** | ||
114 | * Extra fields on the Zip directory records are Unix time codes needed for compatibility on the default Mac zip archive tool. | ||
115 | * These are enabled as default, as they do no harm elsewhere and only add 26 bytes per file added. | ||
116 | * | ||
117 | * @param bool $setExtraField TRUE (default) will enable adding of extra fields, anything else will disable it. | ||
118 | */ | ||
function setExtraField($setExtraField = TRUE) {
    // Strict comparison on purpose: only the boolean TRUE enables the extra
    // fields; any other value (including truthy ones such as 1) disables them.
    $enabled = FALSE;
    if ($setExtraField === TRUE) {
        $enabled = TRUE;
    }
    $this->addExtraField = $enabled;
}
122 | |||
123 | /** | ||
124 | * Set Zip archive comment. | ||
125 | * | ||
126 | * @param string $newComment New comment. NULL to clear. | ||
127 | * @return bool $success | ||
128 | */ | ||
public function setComment($newComment = NULL) {
    // The archive comment may only be changed while the archive is still open.
    if (!$this->isFinalized) {
        $this->zipComment = $newComment;
        return TRUE;
    }

    return FALSE;
}
137 | |||
138 | /** | ||
139 | * Set zip file to write zip data to. | ||
140 | * This will cause all present and future data written to this class to be written to this file. | ||
141 | * This can be used at any time, even after the Zip Archive has been finalized. Any previous file will be closed. | ||
142 | * Warning: If the given file already exists, it will be overwritten. | ||
143 | * | ||
144 | * @param string $fileName | ||
145 | * @return bool $success | ||
146 | */ | ||
public function setZipFile($fileName) {
    // Mode "x" refuses to open an existing file, so an existing target is
    // removed first to honour the documented "will be overwritten" contract.
    if (is_file($fileName)) {
        unlink($fileName);
    }

    $fd = fopen($fileName, "x+b");
    if ($fd === FALSE) {
        // The target could not be created: keep the current backing store
        // untouched and report the failure instead of writing to a non-handle.
        return FALSE;
    }

    if (is_resource($this->zipFile)) {
        // Copy everything written so far from the old file into the new one.
        rewind($this->zipFile);
        while (!feof($this->zipFile)) {
            fwrite($fd, fread($this->zipFile, $this->streamChunkSize));
        }

        fclose($this->zipFile);
    } else {
        // Archive was held in memory: dump the buffer and release it.
        fwrite($fd, $this->zipData);
        $this->zipData = NULL;
    }
    $this->zipFile = $fd;

    return TRUE;
}
167 | |||
168 | /** | ||
169 | * Add an empty directory entry to the zip archive. | ||
170 | * Basically this is only used if an empty directory is added. | ||
171 | * | ||
172 | * @param string $directoryPath Directory Path and name to be added to the archive. | ||
173 | * @param int $timestamp (Optional) Timestamp for the added directory, if omitted or set to 0, the current time will be used. | ||
174 | * @param string $fileComment (Optional) Comment to be added to the archive for this directory. To use fileComment, timestamp must be given. | ||
175 | * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. | ||
176 | * @return bool $success | ||
177 | */ | ||
public function addDirectory($directoryPath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_DIR) {
    if ($this->isFinalized) {
        return FALSE;
    }

    // Normalise to forward slashes and strip trailing slashes; exactly one
    // trailing slash is re-appended when the entry is written.
    $normalized = rtrim(str_replace("\\", "/", $directoryPath), "/");
    if (strlen($normalized) === 0) {
        return FALSE;
    }

    // Directory entries carry no data: no flags, "stored" method, zero CRC and zero sizes.
    $this->buildZipEntry($normalized . '/', $fileComment, "\x00\x00", "\x00\x00", $timestamp, "\x00\x00\x00\x00", 0, 0, $extFileAttr);

    return TRUE;
}
191 | |||
192 | /** | ||
193 | * Add a file to the archive at the specified location and file name. | ||
194 | * | ||
195 | * @param string $data File data. | ||
196 | * @param string $filePath Filepath and name to be used in the archive. | ||
197 | * @param int $timestamp (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used. | ||
198 | * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given. | ||
199 | * @param bool $compress (Optional) Compress file, if set to FALSE the file will only be stored. Default TRUE. | ||
200 | * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. | ||
201 | * @return bool $success | ||
202 | */ | ||
public function addFile($data, $filePath, $timestamp = 0, $fileComment = NULL, $compress = TRUE, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    // Stream resources are handed to addLargeFile(); report that call's
    // outcome rather than an unconditional FALSE.
    if (is_resource($data) && get_resource_type($data) == "stream") {
        return $this->addLargeFile($data, $filePath, $timestamp, $fileComment, $extFileAttr);
    }

    $gzData = "";
    $gzType = "\x08\x00"; // Compression type 8 = deflate
    $gpFlags = "\x00\x00"; // General Purpose bit flags for compression type 8 it is: 0=Normal, 1=Maximum, 2=Fast, 3=super fast compression.
    $dataLength = strlen($data);
    $fileCRC32 = pack("V", crc32($data));

    // Default to "no gain" so that a disabled or failed compression falls
    // through to the stored branch below.
    $gzLength = $dataLength;
    if ($compress) {
        // gzdeflate() yields the raw deflate stream a zip entry needs, i.e.
        // gzcompress() output without its 2 byte header and 4 byte checksum.
        $deflated = gzdeflate($data);
        if ($deflated !== FALSE) {
            $gzData = $deflated;
            $gzLength = strlen($deflated);
        }
    }

    // Store the data verbatim whenever compression did not make it smaller.
    if ($gzLength >= $dataLength) {
        $gzLength = $dataLength;
        $gzData = $data;
        $gzType = "\x00\x00"; // Compression type 0 = stored
        $gpFlags = "\x00\x00"; // Compression type 0 = stored
    }

    // Switch from the in-memory buffer to a temp file once the archive would exceed the memory threshold.
    if (!is_resource($this->zipFile) && ($this->offset + $gzLength) > $this->zipMemoryThreshold) {
        $this->zipflush();
    }

    $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr);

    $this->zipwrite($gzData);

    return TRUE;
}
245 | |||
246 | /** | ||
247 | * Add the content of a directory to the archive. | ||
248 | * | ||
249 | * @author Adam Schmalhofer <Adam.Schmalhofer@gmx.de> | ||
250 | * @author A. Grandt | ||
251 | * | ||
252 | * @param string $realPath Path on the file system. | ||
253 | * @param string $zipPath Filepath and name to be used in the archive. | ||
254 | * @param bool $recursive Add content recursively, default is TRUE. | ||
255 | * @param bool $followSymlinks Follow and add symbolic links, if they are accessible, default is TRUE. | ||
256 | * @param array &$addedFiles Reference to the added files, this is used to prevent duplicates, default is an empty array. | ||
257 | * If you start the function by passing an array, the array will be populated with the realPath | ||
258 | * and zipPath key/value pairs added to the archive by the function. | ||
259 | * @param bool $overrideFilePermissions Force the use of the file/dir permissions set in the $extDirAttr | ||
260 | * and $extFileAttr parameters. | ||
261 | * @param int $extDirAttr Permissions for directories. | ||
262 | * @param int $extFileAttr Permissions for files. | ||
263 | */ | ||
public function addDirectoryContent($realPath, $zipPath, $recursive = TRUE, $followSymlinks = TRUE, &$addedFiles = array(),
        $overrideFilePermissions = FALSE, $extDirAttr = self::EXT_FILE_ATTR_DIR, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    // $addedFiles is keyed by resolved real path, so a path already visited
    // (for example through a symlink) is skipped.
    if (file_exists($realPath) && !isset($addedFiles[realpath($realPath)])) {
        if (is_dir($realPath)) {
            $this->addDirectory($zipPath, 0, null, $overrideFilePermissions ? $extDirAttr : self::getFileExtAttr($realPath));
        }

        $addedFiles[realpath($realPath)] = $zipPath;

        $iter = new DirectoryIterator($realPath);
        foreach ($iter as $file) {
            if ($file->isDot()) {
                continue;
            }
            $newRealPath = $file->getPathname();
            $newZipPath = self::pathJoin($zipPath, $file->getFilename());

            // Skip vanished entries, and symlinks unless following them was requested.
            if (!file_exists($newRealPath) || ($followSymlinks !== TRUE && is_link($newRealPath))) {
                continue;
            }

            if ($file->isFile()) {
                $addedFiles[realpath($newRealPath)] = $newZipPath;
                $this->addLargeFile($newRealPath, $newZipPath, 0, null, $overrideFilePermissions ? $extFileAttr : self::getFileExtAttr($newRealPath));
            } else if ($recursive === TRUE) {
                $this->addDirectoryContent($newRealPath, $newZipPath, $recursive, $followSymlinks, $addedFiles, $overrideFilePermissions, $extDirAttr, $extFileAttr);
            } else {
                // Non-recursive mode: record the subdirectory itself as an
                // empty entry under its own archive path (not the parent's).
                $this->addDirectory($newZipPath, 0, null, $overrideFilePermissions ? $extDirAttr : self::getFileExtAttr($newRealPath));
            }
        }
    }
}
306 | |||
307 | /** | ||
308 | * Add a file to the archive at the specified location and file name. | ||
309 | * | ||
310 | * @param string $dataFile File name/path. | ||
311 | * @param string $filePath Filepath and name to be used in the archive. | ||
312 | * @param int $timestamp (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used. | ||
313 | * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given. | ||
314 | * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. | ||
315 | * @return bool $success | ||
316 | */ | ||
public function addLargeFile($dataFile, $filePath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    // Case 1: a path to an existing file on disk.
    if (is_string($dataFile) && is_file($dataFile)) {
        // Only an explicit FALSE from processFile() counts as failure.
        return $this->processFile($dataFile, $filePath, $timestamp, $fileComment, $extFileAttr) !== FALSE;
    }

    // Case 2: an already opened stream, spooled through the stream API in chunks.
    if (is_resource($dataFile) && get_resource_type($dataFile) == "stream") {
        $this->openStream($filePath, $timestamp, $fileComment, $extFileAttr);

        while (!feof($dataFile)) {
            $chunk = fread($dataFile, $this->streamChunkSize);
            if ($chunk === FALSE) {
                // A failing read would never reach EOF; stop instead of spinning.
                break;
            }
            $this->addStreamData($chunk);
        }
        $this->closeStream();

        return TRUE;
    }

    // Neither an existing file nor a stream: nothing was added.
    return FALSE;
}
335 | |||
336 | /** | ||
337 | * Create a stream to be used for large entries. | ||
338 | * | ||
339 | * @param string $filePath Filepath and name to be used in the archive. | ||
340 | * @param int $timestamp (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used. | ||
341 | * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given. | ||
342 | * @param int $extFileAttr (Optional) The external file reference, use generateExtAttr to generate this. | ||
343 | * @return bool $success | ||
344 | */ | ||
public function openStream($filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if (!function_exists('sys_get_temp_dir')) {
        die ("ERROR: Zip " . self::VERSION . " requires PHP version 5.2.1 or above if large files are used.");
    }

    if ($this->isFinalized) {
        return FALSE;
    }

    // Streamed entries are always written to a file-backed archive.
    $this->zipflush();

    // Only one stream can be open at a time; finish a pending one first.
    // The cast keeps strlen() from receiving NULL when no stream is open.
    if (strlen((string)$this->streamFilePath) > 0) {
        $this->closeStream();
    }

    // Spool the incoming data into a temp file until closeStream() is called.
    $tempFile = tempnam(sys_get_temp_dir(), 'Zip');
    $handle = ($tempFile === FALSE) ? FALSE : fopen($tempFile, "wb");
    if ($handle === FALSE) {
        // No usable spool file: clean up and report failure without
        // leaving a half-initialised stream behind.
        if ($tempFile !== FALSE && is_file($tempFile)) {
            unlink($tempFile);
        }
        return FALSE;
    }

    $this->streamFile = $tempFile;
    $this->streamData = $handle;
    $this->streamFilePath = $filePath;
    $this->streamTimestamp = $timestamp;
    $this->streamFileComment = $fileComment;
    $this->streamFileLength = 0;
    $this->streamExtFileAttr = $extFileAttr;

    return TRUE;
}
370 | |||
371 | /** | ||
372 | * Add data to the open stream. | ||
373 | * | ||
374 | * @param string $data | ||
375 | * @return mixed length in bytes added or FALSE if the archive is finalized or there is no open stream. | ||
376 | */ | ||
public function addStreamData($data) {
    // A stream is open exactly when streamFilePath is a non-empty string;
    // the cast keeps strlen() from receiving NULL when none is open.
    if ($this->isFinalized || strlen((string)$this->streamFilePath) == 0) {
        return FALSE;
    }

    $expected = strlen($data);
    $length = fwrite($this->streamData, $data, $expected);
    // Strict check: a FALSE result from fwrite() is never accepted as a
    // successful zero-length write.
    if ($length === FALSE || $length !== $expected) {
        die ("<p>Length mismatch</p>\n");
    }
    $this->streamFileLength += $length;

    return $length;
}
390 | |||
391 | /** | ||
392 | * Close the current stream. | ||
393 | * | ||
394 | * @return bool $success | ||
395 | */ | ||
public function closeStream() {
    // Nothing to do when the archive is finalized or no stream is open.
    // The cast keeps strlen() from receiving NULL when no stream is open.
    if ($this->isFinalized || strlen((string)$this->streamFilePath) == 0) {
        return FALSE;
    }

    // Make sure all spooled data is on disk before it is compressed.
    if (is_resource($this->streamData)) {
        fflush($this->streamData);
        fclose($this->streamData);
    }

    $this->processFile($this->streamFile, $this->streamFilePath, $this->streamTimestamp, $this->streamFileComment, $this->streamExtFileAttr);

    // Reset the per-stream state so a new stream can be opened.
    $this->streamData = null;
    $this->streamFilePath = null;
    $this->streamTimestamp = null;
    $this->streamFileComment = null;
    $this->streamFileLength = 0;
    $this->streamExtFileAttr = null;

    // Remove the spool file; guarded so a missing file does not raise a warning.
    $spoolFile = $this->streamFile;
    if (is_string($spoolFile) && is_file($spoolFile)) {
        unlink($spoolFile);
    }

    $this->streamFile = null;

    return TRUE;
}
420 | |||
private function processFile($dataFile, $filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    // Compresses $dataFile by letting ZipArchive build a throw-away one-entry
    // archive, then copies that entry's header values and compressed bytes
    // into this archive. Returns TRUE on success, FALSE on any failure.
    if ($this->isFinalized) {
        return FALSE;
    }

    $tempzip = tempnam(sys_get_temp_dir(), 'ZipStream');
    if ($tempzip === FALSE) {
        return FALSE;
    }

    // tempnam() leaves an empty file behind. OVERWRITE tells ZipArchive to
    // ignore the existing contents instead of parsing them as an archive.
    $zip = new ZipArchive;
    $built = FALSE;
    if ($zip->open($tempzip, ZipArchive::OVERWRITE) === TRUE) {
        $added = $zip->addFile($dataFile, 'file');
        $built = $zip->close() && $added;
    }

    $file_handle = $built ? fopen($tempzip, "rb") : FALSE;
    if ($file_handle === FALSE) {
        if (is_file($tempzip)) {
            unlink($tempzip);
        }
        return FALSE;
    }

    // Local file header layout: signature (4), version needed (2), then the
    // fields read below starting at byte offset 6.
    fseek($file_handle, 6);

    $gpFlags = fread($file_handle, 2);
    $gzType = fread($file_handle, 2);
    fread($file_handle, 4); // Skip the DOS time/date; buildZipEntry() writes its own.
    $fileCRC32 = fread($file_handle, 4);
    $rawSizes = fread($file_handle, 12);
    if (strlen($rawSizes) !== 12) {
        // Truncated header: the temporary archive is unusable.
        fclose($file_handle);
        unlink($tempzip);
        return FALSE;
    }
    $sizes = unpack("Vcompressed/Vuncompressed/vname/vextra", $rawSizes);
    $gzLength = $sizes['compressed'];
    $dataLength = $sizes['uncompressed'];

    $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr);

    // The compressed data follows the fixed 30 byte header, the entry name
    // and any extra field. Exactly $gzLength bytes are copied, so the central
    // directory that follows the data is never read.
    fseek($file_handle, 30 + $sizes['name'] + $sizes['extra']);
    $remaining = $gzLength;

    while ($remaining > 0 && !feof($file_handle)) {
        $chunk = fread($file_handle, min($this->streamChunkSize, $remaining));
        if ($chunk === FALSE || $chunk === '') {
            break;
        }
        $remaining -= strlen($chunk);

        $this->zipwrite($chunk);
    }

    fclose($file_handle);

    unlink($tempzip);

    return TRUE;
}
469 | |||
470 | /** | ||
471 | * Close the archive. | ||
472 | * A closed archive can no longer have new files added to it. | ||
473 | * | ||
474 | * @return bool $success | ||
475 | */ | ||
public function finalize() {
    if ($this->isFinalized) {
        return FALSE;
    }

    // A still-open stream entry must be written before the directory.
    // The cast keeps strlen() from receiving NULL when no stream is open.
    if (strlen((string)$this->streamFilePath) > 0) {
        $this->closeStream();
    }

    $cd = implode("", $this->cdRec);

    // End-of-central-directory record: the entry count is written twice,
    // followed by the directory size and the offset at which it starts.
    $cdRecSize = pack("v", count($this->cdRec));
    $cdRec = $cd . self::ZIP_END_OF_CENTRAL_DIRECTORY
        . $cdRecSize . $cdRecSize
        . pack("VV", strlen($cd), $this->offset);

    // Compare against the empty string rather than using empty(), so that a
    // comment of "0" is kept. The length field is 16 bits wide, so longer
    // comments are truncated to keep length and payload in agreement.
    $comment = (string)$this->zipComment;
    if (strlen($comment) > 0xFFFF) {
        $comment = substr($comment, 0, 0xFFFF);
    }
    if ($comment !== '') {
        $cdRec .= pack("v", strlen($comment)) . $comment;
    } else {
        $cdRec .= "\x00\x00";
    }

    $this->zipwrite($cdRec);

    $this->isFinalized = TRUE;
    $this->cdRec = NULL;

    return TRUE;
}
502 | |||
503 | /** | ||
504 | * Get the handle resource for the archive zip file. | ||
505 | * If the zip hasn't been finalized yet, this will cause it to become finalized | ||
506 | * | ||
507 | * @return zip file handle | ||
508 | */ | ||
public function getZipFile() {
    // Reading the archive implies it is complete, so finalize it on demand.
    if (!$this->isFinalized) {
        $this->finalize();
    }

    // Ensure the archive is file-backed, then hand out the handle positioned at the start.
    $this->zipflush();
    $handle = $this->zipFile;
    rewind($handle);

    return $handle;
}
520 | |||
521 | /** | ||
522 | * Get the zip file contents | ||
523 | * If the zip hasn't been finalized yet, this will cause it to become finalized | ||
524 | * | ||
525 | * @return zip data | ||
526 | */ | ||
public function getZipData() {
    // Reading the archive implies it is complete, so finalize it on demand.
    if (!$this->isFinalized) {
        $this->finalize();
    }

    if (is_resource($this->zipFile)) {
        // File-backed archive: read the whole file from the beginning.
        rewind($this->zipFile);
        $info = fstat($this->zipFile);
        return fread($this->zipFile, $info['size']);
    }

    // In-memory archive: the buffer already is the archive.
    return $this->zipData;
}
539 | |||
540 | /** | ||
541 | * Send the archive as a zip download | ||
542 | * | ||
543 | * @param String $fileName The name of the Zip archive, in ISO-8859-1 (or ASCII) encoding, ie. "archive.zip". Optional, defaults to NULL, which means that no ISO-8859-1 encoded file name will be specified. | ||
544 | * @param String $contentType Content mime type. Optional, defaults to "application/zip". | ||
545 | * @param String $utf8FileName The name of the Zip archive, in UTF-8 encoding. Optional, defaults to NULL, which means that no UTF-8 encoded file name will be specified. | ||
546 | * @param bool $inline Use Content-Disposition with "inline" instead of "attached". Optional, defaults to FALSE. | ||
547 | * @return bool $success | ||
548 | */ | ||
function sendZip($fileName = null, $contentType = "application/zip", $utf8FileName = null, $inline = false) {
    if (!$this->isFinalized) {
        $this->finalize();
    }

    // The download can only be sent while no output has gone out: neither
    // headers nor buffered content. Both situations abort with a message.
    $headerFile = null;
    $headerLine = null;
    if (headers_sent($headerFile, $headerLine)) {
        die("<p><strong>Error:</strong> Unable to send file $fileName. HTML Headers have already been sent from <strong>$headerFile</strong> in line <strong>$headerLine</strong></p>");
    }

    $buffered = ob_get_contents();
    if ($buffered !== FALSE && $buffered != '') {
        die("\n<p><strong>Error:</strong> Unable to send file <strong>$fileName</strong>. Output buffer contains the following text (typically warnings or errors):<br>" . htmlentities($buffered) . "</p>");
    }

    // Output compression is switched off before the Content-Length header is sent.
    if (ini_get('zlib.output_compression')) {
        ini_set('zlib.output_compression', 'Off');
    }

    header("Pragma: public");
    header("Last-Modified: " . gmdate("D, d M Y H:i:s T"));
    header("Expires: 0");
    header("Accept-Ranges: bytes");
    header("Connection: close");
    header("Content-Type: " . $contentType);

    // "attachment" is the disposition type that requests a download;
    // "attached" is not a defined disposition type.
    $cd = "Content-Disposition: " . ($inline ? "inline" : "attachment");
    if ($fileName) {
        $cd .= '; filename="' . $fileName . '"';
    }
    if ($utf8FileName) {
        $cd .= "; filename*=UTF-8''" . rawurlencode($utf8FileName);
    }
    header($cd);
    header("Content-Length: ". $this->getArchiveSize());

    if (!is_resource($this->zipFile)) {
        echo $this->zipData;
    } else {
        // Stream the file in chunks to keep memory usage flat.
        rewind($this->zipFile);

        while (!feof($this->zipFile)) {
            echo fread($this->zipFile, $this->streamChunkSize);
        }
    }

    return TRUE;
}
597 | |||
598 | /** | ||
599 | * Return the current size of the archive | ||
600 | * | ||
601 | * @return $size Size of the archive | ||
602 | */ | ||
public function getArchiveSize() {
    // File-backed archive: ask the file system. Otherwise measure the in-memory buffer.
    if (is_resource($this->zipFile)) {
        $info = fstat($this->zipFile);
        return $info['size'];
    }

    return strlen($this->zipData);
}
611 | |||
612 | /** | ||
613 | * Calculate the 4 byte DOS date and time value used in the zip entries. | ||
614 | * | ||
615 | * @param int $timestamp | ||
616 | * @return 4-byte encoded DOS date and time | ||
617 | */ | ||
private function getDosTime($timestamp = 0) {
    // A timestamp of 0 means "now".
    $timestamp = (int)$timestamp;
    if ($timestamp == 0) {
        $timestamp = time();
    }

    // gmdate() yields the UTC calendar fields directly, so the process-wide
    // default timezone no longer has to be switched and restored.
    list($year, $month, $day, $hours, $minutes, $seconds) =
        array_map('intval', explode(' ', gmdate('Y n j G i s', $timestamp)));

    // The DOS format counts years from 1980; earlier dates are encoded as all zero bytes.
    if ($year >= 1980) {
        // High 16 bits: date (year offset, month, day). Low 16 bits: time
        // (hours, minutes, seconds halved).
        return pack("V", (($day + ($month << 5) + (($year - 1980) << 9)) << 16) |
            (($seconds >> 1) + ($minutes << 5) + ($hours << 11)));
    }
    return "\x00\x00\x00\x00";
}
630 | |||
631 | /** | ||
632 | * Build the Zip file structures | ||
633 | * | ||
634 | * @param string $filePath | ||
635 | * @param string $fileComment | ||
636 | * @param string $gpFlags | ||
637 | * @param string $gzType | ||
638 | * @param int $timestamp | ||
639 | * @param string $fileCRC32 | ||
640 | * @param int $gzLength | ||
641 | * @param int $dataLength | ||
642 | * @param int $extFileAttr Use self::EXT_FILE_ATTR_FILE for files, self::EXT_FILE_ATTR_DIR for Directories. | ||
643 | */ | ||
private function buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr) {
    // Writes the local file header for one entry to the archive and queues the
    // matching central directory record. The entry's data is written by the caller.
    $filePath = str_replace("\\", "/", $filePath);
    $hasComment = !empty($fileComment);
    $fileCommentLength = $hasComment ? strlen($fileComment) : 0;

    // A timestamp of 0 means "now".
    $timestamp = (int)$timestamp;
    if ($timestamp == 0) {
        $timestamp = time();
    }

    $dosTime = $this->getDosTime($timestamp);
    $tsPack = pack("V", $timestamp);

    $ux = "\x75\x78\x0B\x00\x01\x04\xE8\x03\x00\x00\x04\x00\x00\x00\x00";

    // Fall back to "no flags" when the caller did not pass a 2 byte flag field.
    if (!isset($gpFlags) || strlen($gpFlags) != 2) {
        $gpFlags = "\x00\x00";
    }

    // Set general purpose bit 11 when the name or comment is valid UTF-8 but not plain ASCII.
    $isFileUTF8 = mb_check_encoding($filePath, "UTF-8") && !mb_check_encoding($filePath, "ASCII");
    $isCommentUTF8 = $hasComment && mb_check_encoding($fileComment, "UTF-8") && !mb_check_encoding($fileComment, "ASCII");
    if ($isFileUTF8 || $isCommentUTF8) {
        $gpFlagsV = unpack("vflags", $gpFlags);
        $flag = isset($gpFlagsV['flags']) ? $gpFlagsV['flags'] : 0;
        $gpFlags = pack("v", $flag | (1 << 11));
    }

    // Fields shared by the local header and the central directory record.
    $header = $gpFlags . $gzType . $dosTime. $fileCRC32
        . pack("VVv", $gzLength, $dataLength, strlen($filePath)); // File name length

    // Optional extra fields: 28 bytes in the local header, 24 bytes in the central directory.
    $localExtra = "";
    $centralExtra = "";
    if ($this->addExtraField) {
        $localExtra = "\x55\x54\x09\x00\x03" . $tsPack . $tsPack . $ux;
        $centralExtra = "\x55\x54\x05\x00\x03" . $tsPack . $ux;
    }

    $zipEntry = self::ZIP_LOCAL_FILE_HEADER
        . self::ATTR_VERSION_TO_EXTRACT
        . $header
        . pack("v", strlen($localExtra)) // Extra field length
        . $filePath // FileName
        . $localExtra;
    $this->zipwrite($zipEntry);

    $cdEntry = self::ZIP_CENTRAL_FILE_HEADER
        . self::ATTR_MADE_BY_VERSION
        . ($dataLength === 0 ? "\x0A\x00" : self::ATTR_VERSION_TO_EXTRACT)
        . $header
        . pack("v", strlen($centralExtra)) // Extra field length
        . pack("v", $fileCommentLength) // File comment length
        . "\x00\x00" // Disk number start
        . "\x00\x00" // internal file attributes
        . pack("V", $extFileAttr) // External file attributes
        . pack("V", $this->offset) // Relative offset of local header
        . $filePath // FileName
        . $centralExtra;
    if ($hasComment) {
        $cdEntry .= $fileComment; // Comment
    }

    $this->cdRec[] = $cdEntry;
    // Advance past this entry's local header and the data the caller writes next.
    $this->offset += strlen($zipEntry) + $gzLength;
}
706 | |||
private function zipwrite($data) {
    // File-backed archive: append to the file and flush immediately.
    if (is_resource($this->zipFile)) {
        fwrite($this->zipFile, $data);
        fflush($this->zipFile);
        return;
    }

    // In-memory archive: append to the string buffer.
    $this->zipData .= $data;
}
715 | |||
private function zipflush() {
    // Already file-backed: nothing to move.
    if (is_resource($this->zipFile)) {
        return;
    }

    // Move the in-memory buffer into a fresh temp file and release the buffer.
    $this->zipFile = tmpfile();
    fwrite($this->zipFile, $this->zipData);
    $this->zipData = NULL;
}
723 | |||
724 | /** | ||
725 | * Join $file to $dir path, and clean up any excess slashes. | ||
726 | * | ||
727 | * @param string $dir | ||
728 | * @param string $file | ||
729 | */ | ||
public static function pathJoin($dir, $file) {
    // A separator is only inserted when both parts are non-empty; the
    // joined path is then normalised.
    $separator = (empty($dir) || empty($file)) ? '' : '/';

    return self::getRelativePath($dir . $separator . $file);
}
736 | |||
737 | /** | ||
738 | * Clean up a path, removing any unnecessary elements such as /./, // or redundant ../ segments. | ||
739 | * If the path starts with a "/", it is deemed an absolute path and any /../ in the beginning is stripped off. | ||
740 | * The returned path will not end in a "/". | ||
741 | * | ||
742 | * Sometimes, when a path is generated from multiple fragments, | ||
743 | * you can get something like "../data/html/../images/image.jpeg" | ||
744 | * This will normalize that example path to "../data/images/image.jpeg" | ||
745 | * | ||
746 | * @param string $path The path to clean up | ||
747 | * @return string the clean path | ||
748 | */ | ||
public static function getRelativePath($path) {
    // Normalise separators, collapse "//" and "/./", and drop leading "./" and trailing "/".
    $path = preg_replace("#/+\.?/+#", "/", str_replace("\\", "/", $path));
    $dirs = explode("/", rtrim(preg_replace('#^(?:\./)+#', '', $path), '/'));

    $offset = 0;    // number of segments currently kept
    $sub = 0;       // number of leading "../" that could not be resolved
    $subOffset = 0;
    $root = "";

    if ($dirs[0] === "") {
        // Leading slash: absolute path. Compared against "" rather than via
        // empty(), so a first segment named "0" is not mistaken for the root.
        $root = "/";
        $dirs = array_slice($dirs, 1);
    } else if (preg_match('#^[A-Za-z]:$#', $dirs[0])) {
        // Drive letter root such as "C:". Anchored so that a segment which
        // merely contains "x:" somewhere is left alone.
        $root = strtoupper($dirs[0]) . "/";
        $dirs = array_slice($dirs, 1);
    }

    $newDirs = array();
    foreach ($dirs as $dir) {
        if ($dir !== "..") {
            $subOffset--;
            $newDirs[++$offset] = $dir;
        } else {
            // ".." cancels the previously kept segment; when there is none
            // left it is counted as an unresolved parent reference.
            $subOffset++;
            if (--$offset < 0) {
                $offset = 0;
                if ($subOffset > $sub) {
                    $sub++;
                }
            }
        }
    }

    // Unresolved "../" are only kept for relative paths; for rooted paths they are dropped.
    if (empty($root)) {
        $root = str_repeat("../", $sub);
    }
    return $root . implode("/", array_slice($newDirs, 0, $offset));
}
787 | |||
788 | /** | ||
789 | * Create the file permissions for a file or directory, for use in the extFileAttr parameters. | ||
790 | * | ||
791 | * @param int $owner Unix permissions for owner (octal from 00 to 07) | ||
792 | * @param int $group Unix permissions for group (octal from 00 to 07) | ||
793 | * @param int $other Unix permissions for others (octal from 00 to 07) | ||
794 | * @param bool $isFile | ||
795 | * @return EXTERNAL_REF field. | ||
796 | */ | ||
public static function generateExtAttr($owner = 07, $group = 05, $other = 05, $isFile = true) {
    // Pick the file type bits and the DOS flag that belong together.
    $typeBits = self::S_IFDIR;
    $dosFlag = self::S_DOS_D;
    if ($isFile) {
        $typeBits = self::S_IFREG;
        $dosFlag = self::S_DOS_A;
    }

    // Each permission triplet is masked to three bits and placed owner/group/other.
    $mode = $typeBits | (($owner & 07) << 6) | (($group & 07) << 3) | ($other & 07);

    // The mode goes in the upper 16 bits, the DOS flag in the lower 16 bits.
    return ($mode << 16) | $dosFlag;
}
803 | |||
804 | /** | ||
805 | * Get the file permissions for a file or directory, for use in the extFileAttr parameters. | ||
806 | * | ||
807 | * @param string $filename | ||
808 | * @return external ref field, or FALSE if the file is not found. | ||
809 | */ | ||
public static function getFileExtAttr($filename) {
    if (!file_exists($filename)) {
        return FALSE;
    }

    // fileperms() result goes in the upper 16 bits; the DOS directory or
    // archive flag goes in the lower 16 bits.
    $mode = fileperms($filename) << 16;
    $dosFlag = is_dir($filename) ? self::S_DOS_D : self::S_DOS_A;

    return $mode | $dosFlag;
}
817 | } | ||
818 | ?> | ||
diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt new file mode 100644 index 00000000..9424a83e --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt | |||
@@ -0,0 +1,31 @@ | |||
1 | DrUUID RFC4122 library for PHP5 | ||
2 | by J. King (http://jkingweb.ca/) | ||
3 | Licensed under MIT license | ||
4 | |||
5 | See http://jkingweb.ca/code/php/lib.uuid/ | ||
6 | for documentation | ||
7 | |||
8 | Last revised 2010-02-15 | ||
9 | |||
10 | Copyright (c) 2009 J. King | ||
11 | |||
12 | Permission is hereby granted, free of charge, to any person | ||
13 | obtaining a copy of this software and associated documentation | ||
14 | files (the "Software"), to deal in the Software without | ||
15 | restriction, including without limitation the rights to use, | ||
16 | copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
17 | copies of the Software, and to permit persons to whom the | ||
18 | Software is furnished to do so, subject to the following | ||
19 | conditions: | ||
20 | |||
21 | The above copyright notice and this permission notice shall be | ||
22 | included in all copies or substantial portions of the Software. | ||
23 | |||
24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
26 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
27 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | ||
28 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||
29 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
30 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
31 | OTHER DEALINGS IN THE SOFTWARE. | ||
diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.php b/inc/3rdparty/libraries/PHPePub/lib.uuid.php new file mode 100644 index 00000000..c6a8de52 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.php | |||
@@ -0,0 +1,314 @@ | |||
1 | <?php | ||
2 | /* | ||
3 | DrUUID RFC4122 library for PHP5 | ||
4 | by J. King (http://jkingweb.ca/) | ||
5 | Licensed under MIT license | ||
6 | |||
7 | See http://jkingweb.ca/code/php/lib.uuid/ | ||
8 | for documentation | ||
9 | |||
10 | Last revised 2010-02-15 | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | Copyright (c) 2009 J. King | ||
15 | |||
16 | Permission is hereby granted, free of charge, to any person | ||
17 | obtaining a copy of this software and associated documentation | ||
18 | files (the "Software"), to deal in the Software without | ||
19 | restriction, including without limitation the rights to use, | ||
20 | copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
21 | copies of the Software, and to permit persons to whom the | ||
22 | Software is furnished to do so, subject to the following | ||
23 | conditions: | ||
24 | |||
25 | The above copyright notice and this permission notice shall be | ||
26 | included in all copies or substantial portions of the Software. | ||
27 | |||
28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
29 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
30 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
31 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | ||
32 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||
33 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
34 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
35 | OTHER DEALINGS IN THE SOFTWARE. | ||
36 | */ | ||
37 | |||
38 | |||
/**
 * RFC 4122 UUID value object.
 *
 * Instances are obtained through the static factories mint() (new UUIDs of
 * version 1, 3, 4 or 5) and import() (existing UUIDs); the constructor is
 * protected. The virtual read-only properties bytes, hex, string, urn,
 * version, variant, node and time are served by __get().
 */
class UUID {
    const MD5      = 3;
    const SHA1     = 5;
    const clearVer = 15;  // 00001111  Clears all bits of version byte with AND
    const clearVar = 63;  // 00111111  Clears all relevant bits of variant byte with AND
    const varRes   = 224; // 11100000  Variant reserved for future use
    const varMS    = 192; // 11000000  Microsoft GUID variant
    const varRFC   = 128; // 10000000  The RFC 4122 variant (this variant)
    const varNCS   = 0;   // 00000000  The NCS compatibility variant
    const version1 = 16;  // 00010000
    const version3 = 48;  // 00110000
    const version4 = 64;  // 01000000
    const version5 = 80;  // 01010000
    const interval = 0x01b21dd213814000; // Time (in 100ns steps) between the start of the UTC and Unix epochs
    const nsDNS    = '6ba7b810-9dad-11d1-80b4-00c04fd430c8';
    const nsURL    = '6ba7b811-9dad-11d1-80b4-00c04fd430c8';
    const nsOID    = '6ba7b812-9dad-11d1-80b4-00c04fd430c8';
    const nsX500   = '6ba7b814-9dad-11d1-80b4-00c04fd430c8';

    // Name of the static method used by randomBytes(); changed by initRandom().
    protected static $randomFunc = 'randomTwister';
    // File handle or COM object backing the selected random function, if any.
    protected static $randomSource = NULL;

    // instance properties (only $bytes and $string are stored; the rest are
    // computed on demand in __get())
    protected $bytes;
    protected $hex;
    protected $string;
    protected $urn;
    protected $version;
    protected $variant;
    protected $node;
    protected $time;

    /**
     * Create a new UUID based on provided data.
     *
     * @param int   $ver  UUID version: 1 (time), 3 (MD5 name), 4 (random) or 5 (SHA-1 name).
     * @param mixed $node For version 1 an optional node (MAC address) as hex or
     *                    6 raw bytes; for versions 3 and 5 the name to hash.
     * @param mixed $ns   Namespace UUID (UUID instance, hex/URN string or 16 raw
     *                    bytes); only used by versions 3 and 5.
     * @return UUID
     * @throws UUIDException for version 2 and for any other unsupported version.
     */
    public static function mint($ver = 1, $node = NULL, $ns = NULL) {
        switch ((int) $ver) {
            case 1:
                return new self(self::mintTime($node));
            case 2:
                // Version 2 is not supported
                throw new UUIDException("Version 2 is unsupported.");
            case 3:
                return new self(self::mintName(self::MD5, $node, $ns));
            case 4:
                return new self(self::mintRand());
            case 5:
                return new self(self::mintName(self::SHA1, $node, $ns));
            default:
                throw new UUIDException("Selected version is invalid or unsupported.");
        }
    }

    /**
     * Import an existing UUID.
     *
     * @param mixed $uuid UUID instance, hex/URN string or 16 raw bytes.
     * @return UUID
     * @throws UUIDException when the input cannot be reduced to 16 bytes.
     */
    public static function import($uuid) {
        return new self(self::makeBin($uuid, 16));
    }

    /**
     * Compares the binary representations of two UUIDs.
     *
     * The comparison will return true if they are bit-exact,
     * or if neither is valid.
     *
     * @param mixed $a
     * @param mixed $b
     * @return bool
     */
    public static function compare($a, $b) {
        // Strict comparison is required: with "==" PHP compares two
        // numeric-looking strings (e.g. "00000000000001e3" and
        // "0000000000001000") by numeric value instead of byte by byte.
        return self::makeBin($a, 16) === self::makeBin($b, 16);
    }

    /**
     * @return string Canonical hyphenated hexadecimal form.
     */
    public function __toString() {
        return $this->string;
    }

    /**
     * Serve the virtual read-only properties.
     *
     * @param string $var One of bytes, hex, string, urn, version, variant, node, time.
     * @return mixed The requested value; NULL for unknown names, and for
     *               node/time when the UUID is not version 1.
     */
    public function __get($var) {
        switch ($var) {
            case "bytes":
                return $this->bytes;
            case "hex":
                return bin2hex($this->bytes);
            case "string":
                return $this->__toString();
            case "urn":
                return "urn:uuid:".$this->__toString();
            case "version":
                // The version lives in the high nibble of byte 6
                return ord($this->bytes[6]) >> 4;
            case "variant":
                $byte = ord($this->bytes[8]);
                if ($byte >= self::varRes) {
                    return 3;
                }
                if ($byte >= self::varMS) {
                    return 2;
                }
                if ($byte >= self::varRFC) {
                    return 1;
                }
                return 0;
            case "node":
                if (ord($this->bytes[6]) >> 4 == 1) {
                    return bin2hex(substr($this->bytes, 10));
                }
                return NULL;
            case "time":
                if (ord($this->bytes[6]) >> 4 == 1) {
                    // Restore contiguous big-endian byte order
                    $time = bin2hex($this->bytes[6].$this->bytes[7].$this->bytes[4].$this->bytes[5].$this->bytes[0].$this->bytes[1].$this->bytes[2].$this->bytes[3]);
                    // Clear version flag
                    $time[0] = "0";
                    // Do some reverse arithmetic to get a Unix timestamp
                    $time = (hexdec($time) - self::interval) / 10000000;
                    return $time;
                }
                return NULL;
            default:
                return NULL;
        }
    }

    /**
     * @param string $uuid Exactly 16 raw bytes.
     * @throws UUIDException when the input is not 16 bytes long.
     */
    protected function __construct($uuid) {
        if (strlen($uuid) != 16) {
            throw new UUIDException("Input must be a 128-bit integer.");
        }
        $this->bytes = $uuid;
        // Optimize the most common use
        $this->string =
            bin2hex(substr($uuid, 0, 4))."-".
            bin2hex(substr($uuid, 4, 2))."-".
            bin2hex(substr($uuid, 6, 2))."-".
            bin2hex(substr($uuid, 8, 2))."-".
            bin2hex(substr($uuid, 10, 6));
    }

    /**
     * Generates a Version 1 UUID.
     * These are derived from the time at which they were generated.
     *
     * @param mixed $node Optional node (MAC address) as hex or 6 raw bytes; a
     *                    random node is used when it is missing or invalid.
     * @return string 16 raw bytes.
     */
    protected static function mintTime($node = NULL) {
        // Get time since Gregorian calendar reform in 100ns intervals
        // This is exceedingly difficult because of PHP's (and pack()'s)
        //  integer size limits.
        // Note that this will never be more accurate than to the microsecond.
        $time = microtime(1) * 10000000 + self::interval;
        // Convert to a string representation
        $time = sprintf("%F", $time);
        preg_match("/^\d+/", $time, $time); //strip decimal point
        // And now to a 64-bit binary representation
        $time = base_convert($time[0], 10, 16);
        $time = pack("H*", str_pad($time, 16, "0", STR_PAD_LEFT));
        // Reorder bytes to their proper locations in the UUID
        $uuid = $time[4].$time[5].$time[6].$time[7].$time[2].$time[3].$time[0].$time[1];
        // Generate a random clock sequence
        $uuid .= self::randomBytes(2);
        // set variant
        $uuid[8] = chr((ord($uuid[8]) & self::clearVar) | self::varRFC);
        // set version
        $uuid[6] = chr((ord($uuid[6]) & self::clearVer) | self::version1);
        // Set the final 'node' parameter, a MAC address
        if ($node) {
            $node = self::makeBin($node, 6);
        }
        if (!$node) {
            // If no node was provided or if the node was invalid,
            //  generate a random MAC address and set the multicast bit
            $node = self::randomBytes(6);
            $node[0] = pack("C", ord($node[0]) | 1);
        }
        $uuid .= $node;
        return $uuid;
    }

    /**
     * Generate a Version 4 UUID.
     * These are derived solely from random numbers.
     *
     * @return string 16 raw bytes.
     */
    protected static function mintRand() {
        // generate random fields
        $uuid = self::randomBytes(16);
        // set variant
        $uuid[8] = chr((ord($uuid[8]) & self::clearVar) | self::varRFC);
        // set version
        $uuid[6] = chr((ord($uuid[6]) & self::clearVer) | self::version4);
        return $uuid;
    }

    /**
     * Generates a Version 3 or Version 5 UUID.
     * These are derived from a hash of a name and its namespace, in binary form.
     *
     * @param int    $ver  self::MD5 (version 3) or self::SHA1 (version 5).
     * @param string $node The name to hash.
     * @param mixed  $ns   Namespace UUID (UUID instance, hex/URN string or 16 raw bytes).
     * @return string 16 raw bytes.
     * @throws UUIDException when the name is empty, the namespace is invalid
     *                       or the hash selector is unknown.
     */
    protected static function mintName($ver, $node, $ns) {
        if (!$node) {
            throw new UUIDException("A name-string is required for Version 3 or 5 UUIDs.");
        }
        // if the namespace UUID isn't binary, make it so
        $ns = self::makeBin($ns, 16);
        if (!$ns) {
            throw new UUIDException("A binary namespace is required for Version 3 or 5 UUIDs.");
        }
        switch ($ver) {
            case self::MD5:
                $version = self::version3;
                $uuid = md5($ns.$node, 1);
                break;
            case self::SHA1:
                $version = self::version5;
                // SHA-1 yields 20 bytes; only the first 16 are used
                $uuid = substr(sha1($ns.$node, 1), 0, 16);
                break;
            default:
                // Without this branch the code below would index into NULL
                throw new UUIDException("Selected version is invalid or unsupported.");
        }
        // set variant
        $uuid[8] = chr((ord($uuid[8]) & self::clearVar) | self::varRFC);
        // set version
        $uuid[6] = chr((ord($uuid[6]) & self::clearVer) | $version);
        return $uuid;
    }

    /**
     * Ensure that an input string is either binary or hexadecimal.
     *
     * @param mixed $str UUID instance, raw binary string of $len bytes, or a
     *                   hexadecimal string (optionally with "urn:uuid:" prefix
     *                   and separators).
     * @param int   $len Expected length in bytes.
     * @return string|false Binary representation, or FALSE on failure.
     */
    protected static function makeBin($str, $len) {
        if ($str instanceof self) {
            return $str->bytes;
        }
        if ($str === NULL) {
            // Same result as before, without feeding NULL to string functions
            return FALSE;
        }
        if (strlen($str) == $len) {
            return $str;
        }
        $str = preg_replace("/^urn:uuid:/is", "", $str); // strip URN scheme and namespace
        $str = preg_replace("/[^a-f0-9]/is", "", $str);  // strip non-hex characters
        if (strlen($str) != ($len * 2)) {
            return FALSE;
        }
        return pack("H*", $str);
    }

    /**
     * Look for a system-provided source of randomness, which is usually
     * cryptographically secure.
     * /dev/urandom is tried first simply out of bias for Linux systems.
     *
     * @return string Name of the random function now in use.
     */
    public static function initRandom() {
        if (is_readable('/dev/urandom')) {
            $handle = fopen('/dev/urandom', 'rb');
            // Only switch over when the handle really opened; otherwise
            // randomFRead() would be left reading from FALSE.
            if ($handle !== FALSE) {
                self::$randomSource = $handle;
                self::$randomFunc = 'randomFRead';
            }
        }
        else if (class_exists('COM', false)) {
            try {
                self::$randomSource = new COM('CAPICOM.Utilities.1');  // See http://msdn.microsoft.com/en-us/library/aa388182(VS.85).aspx
                self::$randomFunc = 'randomCOM';
            }
            catch (Exception $e) {
                // Best effort: keep the current random function when the
                // COM object cannot be created.
            }
        }
        return self::$randomFunc;
    }

    /**
     * Get random bytes from the currently selected source.
     *
     * @param int $bytes Number of bytes wanted.
     * @return string
     */
    public static function randomBytes($bytes) {
        // Dispatch through a variable method name; the array('self', ...)
        // callable form is deprecated as of PHP 8.2.
        $func = self::$randomFunc;
        return self::$func($bytes);
    }

    /**
     * Get the specified number of random bytes, using mt_rand().
     * Randomness is returned as a string of bytes.
     *
     * @param int $bytes
     * @return string
     */
    protected static function randomTwister($bytes) {
        $rand = "";
        for ($a = 0; $a < $bytes; $a++) {
            $rand .= chr(mt_rand(0, 255));
        }
        return $rand;
    }

    /**
     * Get the specified number of random bytes using a file handle
     * previously opened with UUID::initRandom().
     * Randomness is returned as a string of bytes.
     *
     * @param int $bytes
     * @return string
     * @throws UUIDException when the source cannot supply the requested bytes.
     */
    protected static function randomFRead($bytes) {
        $data = "";
        // fread() may return fewer bytes than requested; keep reading so a
        // short read never produces a truncated UUID.
        while (strlen($data) < $bytes) {
            $chunk = fread(self::$randomSource, $bytes - strlen($data));
            if ($chunk === FALSE || $chunk === "") {
                throw new UUIDException("Unable to read from the random source.");
            }
            $data .= $chunk;
        }
        return $data;
    }

    /**
     * Get the specified number of random bytes using Windows'
     * randomness source via a COM object previously created by UUID::initRandom().
     * Randomness is returned as a string of bytes.
     *
     * @param int $bytes
     * @return string
     */
    protected static function randomCOM($bytes) {
        return base64_decode(self::$randomSource->GetRandom($bytes, 0)); // straight binary mysteriously doesn't work, hence the base64
    }
}
312 | |||
/**
 * Exception type thrown by the UUID class for invalid or unsupported input.
 */
class UUIDException extends Exception
{
}
diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php index ddd33bb5..21e693e7 100644 --- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php +++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php | |||
@@ -1,728 +1,727 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Content Extractor | 3 | * Content Extractor |
4 | * | 4 | * |
5 | * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) | 5 | * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) |
6 | * to extract content from HTML files. | 6 | * to extract content from HTML files. |
7 | * | 7 | * |
8 | * @version 1.0 | 8 | * @version 1.0 |
9 | * @date 2013-02-05 | 9 | * @date 2013-02-05 |
10 | * @author Keyvan Minoukadeh | 10 | * @author Keyvan Minoukadeh |
11 | * @copyright 2013 Keyvan Minoukadeh | 11 | * @copyright 2013 Keyvan Minoukadeh |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
13 | */ | 13 | */ |
14 | 14 | ||
15 | class ContentExtractor | 15 | class ContentExtractor |
16 | { | 16 | { |
17 | protected static $tidy_config = array( | 17 | protected static $tidy_config = array( |
18 | 'clean' => true, | 18 | 'clean' => true, |
19 | 'output-xhtml' => true, | 19 | 'output-xhtml' => true, |
20 | 'logical-emphasis' => true, | 20 | 'logical-emphasis' => true, |
21 | 'show-body-only' => false, | 21 | 'show-body-only' => false, |
22 | 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', | 22 | 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', |
23 | 'new-inline-tags' => 'mark, time, meter, progress, data', | 23 | 'new-inline-tags' => 'mark, time, meter, progress, data', |
24 | 'wrap' => 0, | 24 | 'wrap' => 0, |
25 | 'drop-empty-paras' => true, | 25 | 'drop-empty-paras' => true, |
26 | 'drop-proprietary-attributes' => false, | 26 | 'drop-proprietary-attributes' => false, |
27 | 'enclose-text' => true, | 27 | 'enclose-text' => true, |
28 | 'enclose-block-text' => true, | 28 | 'enclose-block-text' => true, |
29 | 'merge-divs' => true, | 29 | 'merge-divs' => true, |
30 | 'merge-spans' => true, | 30 | 'merge-spans' => true, |
31 | 'char-encoding' => 'utf8', | 31 | 'char-encoding' => 'utf8', |
32 | 'hide-comments' => true | 32 | 'hide-comments' => true |
33 | ); | 33 | ); |
34 | protected $html; | 34 | protected $html; |
35 | protected $config; | 35 | protected $config; |
36 | protected $title; | 36 | protected $title; |
37 | protected $author = array(); | 37 | protected $author = array(); |
38 | protected $language; | 38 | protected $language; |
39 | protected $date; | 39 | protected $date; |
40 | protected $body; | 40 | protected $body; |
41 | protected $success = false; | 41 | protected $success = false; |
42 | protected $nextPageUrl; | 42 | protected $nextPageUrl; |
43 | public $allowedParsers = array('libxml', 'html5lib'); | 43 | public $allowedParsers = array('libxml', 'html5lib'); |
44 | public $fingerprints = array(); | 44 | public $fingerprints = array(); |
45 | public $readability; | 45 | public $readability; |
46 | public $debug = false; | 46 | public $debug = false; |
47 | public $debugVerbose = false; | 47 | public $debugVerbose = false; |
48 | 48 | ||
/**
 * Hand the site-config locations over to SiteConfig.
 *
 * @param string      $path     Passed as the first argument to SiteConfig::set_config_path().
 * @param string|null $fallback Passed as the second argument to SiteConfig::set_config_path().
 */
public function __construct($path, $fallback = null)
{
	SiteConfig::set_config_path($path, $fallback);
}
52 | 52 | ||
/**
 * Print a debug line when debugging is enabled.
 *
 * The line is "* " followed by the message; with $this->debugVerbose set,
 * current and peak memory usage (in KiB) are appended. Output is flushed
 * immediately.
 *
 * @param string $msg Message to print.
 */
protected function debug($msg) {
	if (!$this->debug) {
		return;
	}
	echo '* ',$msg;
	if ($this->debugVerbose) {
		// Only gather memory statistics when they are actually printed
		$mem = round(memory_get_usage()/1024, 2);
		$memPeak = round(memory_get_peak_usage()/1024, 2);
		echo ' - mem used: ',$mem," (peak: $memPeak)";
	}
	echo "\n";
	// ob_flush() raises a notice when no output buffer is active
	if (ob_get_level() > 0) {
		ob_flush();
	}
	flush();
}
64 | 64 | ||
/**
 * Clear all per-document state so the instance can process another page.
 */
public function reset() {
	// parsed input and the configuration built for it
	$this->html = $this->readability = $this->config = null;
	// extracted fields
	$this->title = $this->body = $this->language = $this->date = $this->nextPageUrl = null;
	$this->author = array();
	// outcome flag
	$this->success = false;
}
77 | 77 | ||
/**
 * Look for a known fingerprint string in the HTML.
 *
 * Each entry of $this->fingerprints maps a fingerprint string either to a
 * hostname, or to an array with a 'hostname' key and an optional truthy
 * 'head' key that restricts the search to the first 8000 bytes.
 *
 * @param string $html Raw HTML of the page.
 * @return string|false Hostname of the first matching fingerprint, false if none matches.
 */
public function findHostUsingFingerprints($html) {
	$this->debug('Checking fingerprints...');
	$head = substr($html, 0, 8000);
	foreach ($this->fingerprints as $needle => $target) {
		// search the whole document unless the entry asks for the head only
		$haystack = $html;
		if (is_array($target)) {
			if (!empty($target['head'])) {
				$haystack = $head;
			}
			$target = $target['hostname'];
		}
		if (strpos($haystack, $needle) === false) {
			continue;
		}
		$this->debug("Found match: $target");
		return $target;
	}
	$this->debug('No fingerprint matches');
	return false;
}
97 | 97 | ||
/**
 * Build the site config for a URL.
 *
 * Returns a SiteConfig instance (joined in order: exact match, wildcard,
 * fingerprint, global, default).
 *
 * @param string $url          URL whose host selects the site config.
 * @param string $html         Page HTML, used only for fingerprint detection.
 * @param bool   $add_to_cache Whether configs built here are added to the SiteConfig cache.
 * @return SiteConfig
 */
public function buildSiteConfig($url, $html='', $add_to_cache=true) {
	// extract host name, lower-cased and without a leading "www."
	$host = strtolower(@parse_url($url, PHP_URL_HOST));
	if (substr($host, 0, 4) == 'www.') {
		$host = substr($host, 4);
	}

	// is merged version already cached?
	$merged_key = "$host.merged";
	if (SiteConfig::is_cached($merged_key)) {
		$this->debug("Returning cached and merged site config for $host");
		return SiteConfig::build($merged_key);
	}

	// let's build from site_config/custom/ and standard/
	$config = SiteConfig::build($host);
	if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
		SiteConfig::add_to_cache($host, $config);
	}
	// if no match, use defaults
	if (!$config) {
		$config = new SiteConfig();
	}

	// load fingerprint config? (check HTML for fingerprints)
	if ($config->autodetect_on_failure() && !empty($this->fingerprints)) {
		$fp_host = $this->findHostUsingFingerprints($html);
		$fp_config = $fp_host ? SiteConfig::build($fp_host) : false;
		if ($fp_config) {
			$this->debug("Appending site config settings from $fp_host (fingerprint match)");
			$config->append($fp_config);
			if ($add_to_cache && !SiteConfig::is_cached($fp_host)) {
				SiteConfig::add_to_cache($fp_host, $fp_config);
			}
		}
	}

	// load global config? (asked again: the append above may have changed the answer)
	if ($config->autodetect_on_failure()) {
		$global_config = SiteConfig::build('global', true);
		if ($global_config) {
			$this->debug('Appending site config settings from global.txt');
			$config->append($global_config);
			if ($add_to_cache && !SiteConfig::is_cached('global')) {
				SiteConfig::add_to_cache('global', $global_config);
			}
		}
	}

	// store copy of merged config
	if ($add_to_cache) {
		// do not store in APC if wildcard match
		$use_apc = ($host == $config->cache_key);
		$config->cache_key = null;
		SiteConfig::add_to_cache($merged_key, $config, $use_apc);
	}
	return $config;
}
150 | 150 | ||
151 | // returns true on success, false on failure | 151 | // returns true on success, false on failure |
152 | // $smart_tidy indicates that if tidy is used and no results are produced, we will | 152 | // $smart_tidy indicates that if tidy is used and no results are produced, we will |
153 | // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time | 153 | // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time |
154 | // but it has problems of its own which we try to avoid with this option. | 154 | // but it has problems of its own which we try to avoid with this option. |
155 | public function process($html, $url, $smart_tidy=true) { | 155 | public function process($html, $url, $smart_tidy=true) { |
156 | $this->reset(); | 156 | $this->reset(); |
157 | $this->config = $this->buildSiteConfig($url, $html); | 157 | $this->config = $this->buildSiteConfig($url, $html); |
158 | 158 | ||
159 | // do string replacements | 159 | // do string replacements |
160 | if (!empty($this->config->find_string)) { | 160 | if (!empty($this->config->find_string)) { |
161 | if (count($this->config->find_string) == count($this->config->replace_string)) { | 161 | if (count($this->config->find_string) == count($this->config->replace_string)) { |
162 | $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); | 162 | $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); |
163 | $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); | 163 | $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); |
164 | } else { | 164 | } else { |
165 | $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); | 165 | $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); |
166 | } | 166 | } |
167 | unset($_count); | 167 | unset($_count); |
168 | } | 168 | } |
169 | 169 | ||
170 | // use tidy (if it exists)? | 170 | // use tidy (if it exists)? |
171 | // This fixes problems with some sites which would otherwise | 171 | // This fixes problems with some sites which would otherwise |
172 | // trouble DOMDocument's HTML parsing. (Although sometimes it | 172 | // trouble DOMDocument's HTML parsing. (Although sometimes it |
173 | // makes matters worse, which is why you can override it in site config files.) | 173 | // makes matters worse, which is why you can override it in site config files.) |
174 | $tidied = false; | 174 | $tidied = false; |
175 | if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { | 175 | if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { |
176 | $this->debug('Using Tidy'); | 176 | $this->debug('Using Tidy'); |
177 | $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); | 177 | $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); |
178 | if (tidy_clean_repair($tidy)) { | 178 | if (tidy_clean_repair($tidy)) { |
179 | $original_html = $html; | 179 | $original_html = $html; |
180 | $tidied = true; | 180 | $tidied = true; |
181 | $html = $tidy->value; | 181 | $html = $tidy->value; |
182 | } | 182 | } |
183 | unset($tidy); | 183 | unset($tidy); |
184 | } | 184 | } |
185 | 185 | ||
186 | // load and parse html | 186 | // load and parse html |
187 | $_parser = $this->config->parser(); | 187 | $_parser = $this->config->parser(); |
188 | if (!in_array($_parser, $this->allowedParsers)) { | 188 | if (!in_array($_parser, $this->allowedParsers)) { |
189 | $this->debug("HTML parser $_parser not listed, using libxml instead"); | 189 | $this->debug("HTML parser $_parser not listed, using libxml instead"); |
190 | $_parser = 'libxml'; | 190 | $_parser = 'libxml'; |
191 | } | 191 | } |
192 | $this->debug("Attempting to parse HTML with $_parser"); | 192 | $this->debug("Attempting to parse HTML with $_parser"); |
193 | $this->readability = new Readability($html, $url, $_parser); | 193 | $this->readability = new Readability($html, $url, $_parser); |
194 | 194 | ||
195 | // we use xpath to find elements in the given HTML document | 195 | // we use xpath to find elements in the given HTML document |
196 | // see http://en.wikipedia.org/wiki/XPath_1.0 | 196 | // see http://en.wikipedia.org/wiki/XPath_1.0 |
197 | $xpath = new DOMXPath($this->readability->dom); | 197 | $xpath = new DOMXPath($this->readability->dom); |
198 | 198 | ||
199 | // try to get next page link | 199 | // try to get next page link |
200 | foreach ($this->config->next_page_link as $pattern) { | 200 | foreach ($this->config->next_page_link as $pattern) { |
201 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 201 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
202 | if (is_string($elems)) { | 202 | if (is_string($elems)) { |
203 | $this->nextPageUrl = trim($elems); | 203 | $this->nextPageUrl = trim($elems); |
204 | break; | 204 | break; |
205 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 205 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
206 | foreach ($elems as $item) { | 206 | foreach ($elems as $item) { |
207 | if ($item instanceof DOMElement && $item->hasAttribute('href')) { | 207 | if ($item instanceof DOMElement && $item->hasAttribute('href')) { |
208 | $this->nextPageUrl = $item->getAttribute('href'); | 208 | $this->nextPageUrl = $item->getAttribute('href'); |
209 | break 2; | 209 | break 2; |
210 | } elseif ($item instanceof DOMAttr && $item->value) { | 210 | } elseif ($item instanceof DOMAttr && $item->value) { |
211 | $this->nextPageUrl = $item->value; | 211 | $this->nextPageUrl = $item->value; |
212 | break 2; | 212 | break 2; |
213 | } | 213 | } |
214 | } | 214 | } |
215 | } | 215 | } |
216 | } | 216 | } |
217 | 217 | ||
218 | // try to get title | 218 | // try to get title |
219 | foreach ($this->config->title as $pattern) { | 219 | foreach ($this->config->title as $pattern) { |
220 | // $this->debug("Trying $pattern"); | 220 | // $this->debug("Trying $pattern"); |
221 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 221 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
222 | if (is_string($elems)) { | 222 | if (is_string($elems)) { |
223 | $this->title = trim($elems); | 223 | $this->title = trim($elems); |
224 | $this->debug('Title expression evaluated as string: '.$this->title); | 224 | $this->debug('Title expression evaluated as string: '.$this->title); |
225 | $this->debug("...XPath match: $pattern"); | 225 | $this->debug("...XPath match: $pattern"); |
226 | break; | 226 | break; |
227 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 227 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
228 | $this->title = $elems->item(0)->textContent; | 228 | $this->title = $elems->item(0)->textContent; |
229 | $this->debug('Title matched: '.$this->title); | 229 | $this->debug('Title matched: '.$this->title); |
230 | $this->debug("...XPath match: $pattern"); | 230 | $this->debug("...XPath match: $pattern"); |
231 | // remove title from document | 231 | // remove title from document |
232 | try { | 232 | try { |
233 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 233 | @$elems->item(0)->parentNode->removeChild($elems->item(0)); |
234 | } catch (DOMException $e) { | 234 | } catch (DOMException $e) { |
235 | // do nothing | 235 | // do nothing |
236 | } | 236 | } |
237 | break; | 237 | break; |
238 | } | 238 | } |
239 | } | 239 | } |
240 | 240 | ||
241 | // try to get author (if it hasn't already been set) | 241 | // try to get author (if it hasn't already been set) |
242 | if (empty($this->author)) { | 242 | if (empty($this->author)) { |
243 | foreach ($this->config->author as $pattern) { | 243 | foreach ($this->config->author as $pattern) { |
244 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 244 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
245 | if (is_string($elems)) { | 245 | if (is_string($elems)) { |
246 | if (trim($elems) != '') { | 246 | if (trim($elems) != '') { |
247 | $this->author[] = trim($elems); | 247 | $this->author[] = trim($elems); |
248 | $this->debug('Author expression evaluated as string: '.trim($elems)); | 248 | $this->debug('Author expression evaluated as string: '.trim($elems)); |
249 | $this->debug("...XPath match: $pattern"); | 249 | $this->debug("...XPath match: $pattern"); |
250 | break; | 250 | break; |
251 | } | 251 | } |
252 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 252 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
253 | foreach ($elems as $elem) { | 253 | foreach ($elems as $elem) { |
254 | if (!isset($elem->parentNode)) continue; | 254 | if (!isset($elem->parentNode)) continue; |
255 | $this->author[] = trim($elem->textContent); | 255 | $this->author[] = trim($elem->textContent); |
256 | $this->debug('Author matched: '.trim($elem->textContent)); | 256 | $this->debug('Author matched: '.trim($elem->textContent)); |
257 | } | 257 | } |
258 | if (!empty($this->author)) { | 258 | if (!empty($this->author)) { |
259 | $this->debug("...XPath match: $pattern"); | 259 | $this->debug("...XPath match: $pattern"); |
260 | break; | 260 | break; |
261 | } | 261 | } |
262 | } | 262 | } |
263 | } | 263 | } |
264 | } | 264 | } |
265 | 265 | ||
266 | // try to get language | 266 | // try to get language |
267 | $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); | 267 | $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); |
268 | foreach ($_lang_xpath as $pattern) { | 268 | foreach ($_lang_xpath as $pattern) { |
269 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 269 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
270 | if (is_string($elems)) { | 270 | if (is_string($elems)) { |
271 | if (trim($elems) != '') { | 271 | if (trim($elems) != '') { |
272 | $this->language = trim($elems); | 272 | $this->language = trim($elems); |
273 | $this->debug('Language matched: '.$this->language); | 273 | $this->debug('Language matched: '.$this->language); |
274 | break; | 274 | break; |
275 | } | 275 | } |
276 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 276 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
277 | foreach ($elems as $elem) { | 277 | foreach ($elems as $elem) { |
278 | if (!isset($elem->parentNode)) continue; | 278 | if (!isset($elem->parentNode)) continue; |
279 | $this->language = trim($elem->textContent); | 279 | $this->language = trim($elem->textContent); |
280 | $this->debug('Language matched: '.$this->language); | 280 | $this->debug('Language matched: '.$this->language); |
281 | } | 281 | } |
282 | if ($this->language) break; | 282 | if ($this->language) break; |
283 | } | 283 | } |
284 | } | 284 | } |
285 | 285 | ||
286 | // try to get date | 286 | // try to get date |
287 | foreach ($this->config->date as $pattern) { | 287 | foreach ($this->config->date as $pattern) { |
288 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 288 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
289 | if (is_string($elems)) { | 289 | if (is_string($elems)) { |
290 | $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); | 290 | $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); |
291 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 291 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
292 | $this->date = $elems->item(0)->textContent; | 292 | $this->date = $elems->item(0)->textContent; |
293 | $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); | 293 | $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); |
294 | // remove date from document | 294 | // remove date from document |
295 | // $elems->item(0)->parentNode->removeChild($elems->item(0)); | 295 | // $elems->item(0)->parentNode->removeChild($elems->item(0)); |
296 | } | 296 | } |
297 | if (!$this->date) { | 297 | if (!$this->date) { |
298 | $this->date = null; | 298 | $this->date = null; |
299 | } else { | 299 | } else { |
300 | $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); | 300 | $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); |
301 | $this->debug("...XPath match: $pattern"); | 301 | $this->debug("...XPath match: $pattern"); |
302 | break; | 302 | break; |
303 | } | 303 | } |
304 | } | 304 | } |
305 | 305 | ||
306 | // strip elements (using xpath expressions) | 306 | // strip elements (using xpath expressions) |
307 | foreach ($this->config->strip as $pattern) { | 307 | foreach ($this->config->strip as $pattern) { |
308 | $elems = @$xpath->query($pattern, $this->readability->dom); | 308 | $elems = @$xpath->query($pattern, $this->readability->dom); |
309 | // check for matches | 309 | // check for matches |
310 | if ($elems && $elems->length > 0) { | 310 | if ($elems && $elems->length > 0) { |
311 | $this->debug('Stripping '.$elems->length.' elements (strip)'); | 311 | $this->debug('Stripping '.$elems->length.' elements (strip)'); |
312 | for ($i=$elems->length-1; $i >= 0; $i--) { | 312 | for ($i=$elems->length-1; $i >= 0; $i--) { |
313 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 313 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
314 | } | 314 | } |
315 | } | 315 | } |
316 | } | 316 | } |
317 | 317 | ||
318 | // strip elements (using id and class attribute values) | 318 | // strip elements (using id and class attribute values) |
319 | foreach ($this->config->strip_id_or_class as $string) { | 319 | foreach ($this->config->strip_id_or_class as $string) { |
320 | $string = strtr($string, array("'"=>'', '"'=>'')); | 320 | $string = strtr($string, array("'"=>'', '"'=>'')); |
321 | $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); | 321 | $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); |
322 | // check for matches | 322 | // check for matches |
323 | if ($elems && $elems->length > 0) { | 323 | if ($elems && $elems->length > 0) { |
324 | $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); | 324 | $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); |
325 | for ($i=$elems->length-1; $i >= 0; $i--) { | 325 | for ($i=$elems->length-1; $i >= 0; $i--) { |
326 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 326 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
327 | } | 327 | } |
328 | } | 328 | } |
329 | } | 329 | } |
330 | 330 | ||
331 | // strip images (using src attribute values) | 331 | // strip images (using src attribute values) |
332 | foreach ($this->config->strip_image_src as $string) { | 332 | foreach ($this->config->strip_image_src as $string) { |
333 | $string = strtr($string, array("'"=>'', '"'=>'')); | 333 | $string = strtr($string, array("'"=>'', '"'=>'')); |
334 | $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); | 334 | $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); |
335 | // check for matches | 335 | // check for matches |
336 | if ($elems && $elems->length > 0) { | 336 | if ($elems && $elems->length > 0) { |
337 | $this->debug('Stripping '.$elems->length.' image elements'); | 337 | $this->debug('Stripping '.$elems->length.' image elements'); |
338 | for ($i=$elems->length-1; $i >= 0; $i--) { | 338 | for ($i=$elems->length-1; $i >= 0; $i--) { |
339 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 339 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
340 | } | 340 | } |
341 | } | 341 | } |
342 | } | 342 | } |
343 | // strip elements using Readability.com and Instapaper.com ignore class names | 343 | // strip elements using Readability.com and Instapaper.com ignore class names |
344 | // .entry-unrelated and .instapaper_ignore | 344 | // .entry-unrelated and .instapaper_ignore |
345 | // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines | 345 | // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines |
346 | // and http://blog.instapaper.com/post/730281947 | 346 | // and http://blog.instapaper.com/post/730281947 |
347 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); | 347 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); |
348 | // check for matches | 348 | // check for matches |
349 | if ($elems && $elems->length > 0) { | 349 | if ($elems && $elems->length > 0) { |
350 | $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); | 350 | $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); |
351 | for ($i=$elems->length-1; $i >= 0; $i--) { | 351 | for ($i=$elems->length-1; $i >= 0; $i--) { |
352 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 352 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
353 | } | 353 | } |
354 | } | 354 | } |
355 | 355 | ||
356 | // strip elements that contain style="display: none;" | 356 | // strip elements that contain style="display: none;" |
357 | $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); | 357 | $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); |
358 | // check for matches | 358 | // check for matches |
359 | if ($elems && $elems->length > 0) { | 359 | if ($elems && $elems->length > 0) { |
360 | $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); | 360 | $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); |
361 | for ($i=$elems->length-1; $i >= 0; $i--) { | 361 | for ($i=$elems->length-1; $i >= 0; $i--) { |
362 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 362 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
363 | } | 363 | } |
364 | } | 364 | } |
365 | 365 | ||
366 | // try to get body | 366 | // try to get body |
367 | foreach ($this->config->body as $pattern) { | 367 | foreach ($this->config->body as $pattern) { |
368 | $elems = @$xpath->query($pattern, $this->readability->dom); | 368 | $elems = @$xpath->query($pattern, $this->readability->dom); |
369 | // check for matches | 369 | // check for matches |
370 | if ($elems && $elems->length > 0) { | 370 | if ($elems && $elems->length > 0) { |
371 | $this->debug('Body matched'); | 371 | $this->debug('Body matched'); |
372 | $this->debug("...XPath match: $pattern"); | 372 | $this->debug("...XPath match: $pattern"); |
373 | if ($elems->length == 1) { | 373 | if ($elems->length == 1) { |
374 | $this->body = $elems->item(0); | 374 | $this->body = $elems->item(0); |
375 | // prune (clean up elements that may not be content) | 375 | // prune (clean up elements that may not be content) |
376 | if ($this->config->prune()) { | 376 | if ($this->config->prune()) { |
377 | $this->debug('...pruning content'); | 377 | $this->debug('...pruning content'); |
378 | $this->readability->prepArticle($this->body); | 378 | $this->readability->prepArticle($this->body); |
379 | } | 379 | } |
380 | break; | 380 | break; |
381 | } else { | 381 | } else { |
382 | $this->body = $this->readability->dom->createElement('div'); | 382 | $this->body = $this->readability->dom->createElement('div'); |
383 | $this->debug($elems->length.' body elems found'); | 383 | $this->debug($elems->length.' body elems found'); |
384 | foreach ($elems as $elem) { | 384 | foreach ($elems as $elem) { |
385 | if (!isset($elem->parentNode)) continue; | 385 | if (!isset($elem->parentNode)) continue; |
386 | $isDescendant = false; | 386 | $isDescendant = false; |
387 | foreach ($this->body->childNodes as $parent) { | 387 | foreach ($this->body->childNodes as $parent) { |
388 | if ($this->isDescendant($parent, $elem)) { | 388 | if ($this->isDescendant($parent, $elem)) { |
389 | $isDescendant = true; | 389 | $isDescendant = true; |
390 | break; | 390 | break; |
391 | } | 391 | } |
392 | } | 392 | } |
393 | if ($isDescendant) { | 393 | if ($isDescendant) { |
394 | $this->debug('...element is child of another body element, skipping.'); | 394 | $this->debug('...element is child of another body element, skipping.'); |
395 | } else { | 395 | } else { |
396 | // prune (clean up elements that may not be content) | 396 | // prune (clean up elements that may not be content) |
397 | if ($this->config->prune()) { | 397 | if ($this->config->prune()) { |
398 | $this->debug('Pruning content'); | 398 | $this->debug('Pruning content'); |
399 | $this->readability->prepArticle($elem); | 399 | $this->readability->prepArticle($elem); |
400 | } | 400 | } |
401 | $this->debug('...element added to body'); | 401 | $this->debug('...element added to body'); |
402 | $this->body->appendChild($elem); | 402 | $this->body->appendChild($elem); |
403 | } | 403 | } |
404 | } | 404 | } |
405 | if ($this->body->hasChildNodes()) break; | 405 | if ($this->body->hasChildNodes()) break; |
406 | } | 406 | } |
407 | } | 407 | } |
408 | } | 408 | } |
409 | 409 | ||
410 | // auto detect? | 410 | // auto detect? |
411 | $detect_title = $detect_body = $detect_author = $detect_date = false; | 411 | $detect_title = $detect_body = $detect_author = $detect_date = false; |
412 | // detect title? | 412 | // detect title? |
413 | if (!isset($this->title)) { | 413 | if (!isset($this->title)) { |
414 | if (empty($this->config->title) || $this->config->autodetect_on_failure()) { | 414 | if (empty($this->config->title) || $this->config->autodetect_on_failure()) { |
415 | $detect_title = true; | 415 | $detect_title = true; |
416 | } | 416 | } |
417 | } | 417 | } |
418 | // detect body? | 418 | // detect body? |
419 | if (!isset($this->body)) { | 419 | if (!isset($this->body)) { |
420 | if (empty($this->config->body) || $this->config->autodetect_on_failure()) { | 420 | if (empty($this->config->body) || $this->config->autodetect_on_failure()) { |
421 | $detect_body = true; | 421 | $detect_body = true; |
422 | } | 422 | } |
423 | } | 423 | } |
424 | // detect author? | 424 | // detect author? |
425 | if (empty($this->author)) { | 425 | if (empty($this->author)) { |
426 | if (empty($this->config->author) || $this->config->autodetect_on_failure()) { | 426 | if (empty($this->config->author) || $this->config->autodetect_on_failure()) { |
427 | $detect_author = true; | 427 | $detect_author = true; |
428 | } | 428 | } |
429 | } | 429 | } |
430 | // detect date? | 430 | // detect date? |
431 | if (!isset($this->date)) { | 431 | if (!isset($this->date)) { |
432 | if (empty($this->config->date) || $this->config->autodetect_on_failure()) { | 432 | if (empty($this->config->date) || $this->config->autodetect_on_failure()) { |
433 | $detect_date = true; | 433 | $detect_date = true; |
434 | } | 434 | } |
435 | } | 435 | } |
436 | 436 | ||
437 | // check for hNews | 437 | // check for hNews |
438 | if ($detect_title || $detect_body) { | 438 | if ($detect_title || $detect_body) { |
439 | // check for hentry | 439 | // check for hentry |
440 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); | 440 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); |
441 | if ($elems && $elems->length > 0) { | 441 | if ($elems && $elems->length > 0) { |
442 | $this->debug('hNews: found hentry'); | 442 | $this->debug('hNews: found hentry'); |
443 | $hentry = $elems->item(0); | 443 | $hentry = $elems->item(0); |
444 | 444 | ||
445 | if ($detect_title) { | 445 | if ($detect_title) { |
446 | // check for entry-title | 446 | // check for entry-title |
447 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); | 447 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); |
448 | if ($elems && $elems->length > 0) { | 448 | if ($elems && $elems->length > 0) { |
449 | $this->title = $elems->item(0)->textContent; | 449 | $this->title = $elems->item(0)->textContent; |
450 | $this->debug('hNews: found entry-title: '.$this->title); | 450 | $this->debug('hNews: found entry-title: '.$this->title); |
451 | // remove title from document | 451 | // remove title from document |
452 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 452 | $elems->item(0)->parentNode->removeChild($elems->item(0)); |
453 | $detect_title = false; | 453 | $detect_title = false; |
454 | } | 454 | } |
455 | } | 455 | } |
456 | 456 | ||
457 | if ($detect_date) { | 457 | if ($detect_date) { |
458 | // check for time element with pubdate attribute | 458 | // check for time element with pubdate attribute |
459 | $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); | 459 | $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); |
460 | if ($elems && $elems->length > 0) { | 460 | if ($elems && $elems->length > 0) { |
461 | $this->date = strtotime(trim($elems->item(0)->textContent)); | 461 | $this->date = strtotime(trim($elems->item(0)->textContent)); |
462 | // remove date from document | 462 | // remove date from document |
463 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); | 463 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); |
464 | if ($this->date) { | 464 | if ($this->date) { |
465 | $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); | 465 | $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); |
466 | $detect_date = false; | 466 | $detect_date = false; |
467 | } else { | 467 | } else { |
468 | $this->date = null; | 468 | $this->date = null; |
469 | } | 469 | } |
470 | } | 470 | } |
471 | } | 471 | } |
472 | 472 | ||
473 | if ($detect_author) { | 473 | if ($detect_author) { |
474 | // check for time element with pubdate attribute | 474 | // check for time element with pubdate attribute |
475 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); | 475 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); |
476 | if ($elems && $elems->length > 0) { | 476 | if ($elems && $elems->length > 0) { |
477 | $author = $elems->item(0); | 477 | $author = $elems->item(0); |
478 | $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); | 478 | $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); |
479 | if ($fn && $fn->length > 0) { | 479 | if ($fn && $fn->length > 0) { |
480 | foreach ($fn as $_fn) { | 480 | foreach ($fn as $_fn) { |
481 | if (trim($_fn->textContent) != '') { | 481 | if (trim($_fn->textContent) != '') { |
482 | $this->author[] = trim($_fn->textContent); | 482 | $this->author[] = trim($_fn->textContent); |
483 | $this->debug('hNews: found author: '.trim($_fn->textContent)); | 483 | $this->debug('hNews: found author: '.trim($_fn->textContent)); |
484 | } | 484 | } |
485 | } | 485 | } |
486 | } else { | 486 | } else { |
487 | if (trim($author->textContent) != '') { | 487 | if (trim($author->textContent) != '') { |
488 | $this->author[] = trim($author->textContent); | 488 | $this->author[] = trim($author->textContent); |
489 | $this->debug('hNews: found author: '.trim($author->textContent)); | 489 | $this->debug('hNews: found author: '.trim($author->textContent)); |
490 | } | 490 | } |
491 | } | 491 | } |
492 | $detect_author = empty($this->author); | 492 | $detect_author = empty($this->author); |
493 | } | 493 | } |
494 | } | 494 | } |
495 | 495 | ||
496 | // check for entry-content. | 496 | // check for entry-content. |
497 | // according to hAtom spec, if there are multiple elements marked entry-content, | 497 | // according to hAtom spec, if there are multiple elements marked entry-content, |
498 | // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content | 498 | // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content |
499 | if ($detect_body) { | 499 | if ($detect_body) { |
500 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); | 500 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); |
501 | if ($elems && $elems->length > 0) { | 501 | if ($elems && $elems->length > 0) { |
502 | $this->debug('hNews: found entry-content'); | 502 | $this->debug('hNews: found entry-content'); |
503 | if ($elems->length == 1) { | 503 | if ($elems->length == 1) { |
504 | // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) | 504 | // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) |
505 | $e = $elems->item(0); | 505 | $e = $elems->item(0); |
506 | if (($e->tagName == 'img') || (trim($e->textContent) != '')) { | 506 | if (($e->tagName == 'img') || (trim($e->textContent) != '')) { |
507 | $this->body = $elems->item(0); | 507 | $this->body = $elems->item(0); |
508 | // prune (clean up elements that may not be content) | 508 | // prune (clean up elements that may not be content) |
509 | if ($this->config->prune()) { | 509 | if ($this->config->prune()) { |
510 | $this->debug('Pruning content'); | 510 | $this->debug('Pruning content'); |
511 | $this->readability->prepArticle($this->body); | 511 | $this->readability->prepArticle($this->body); |
512 | } | 512 | } |
513 | $detect_body = false; | 513 | $detect_body = false; |
514 | } else { | 514 | } else { |
515 | $this->debug('hNews: skipping entry-content - appears not to contain content'); | 515 | $this->debug('hNews: skipping entry-content - appears not to contain content'); |
516 | } | 516 | } |
517 | unset($e); | 517 | unset($e); |
518 | } else { | 518 | } else { |
519 | $this->body = $this->readability->dom->createElement('div'); | 519 | $this->body = $this->readability->dom->createElement('div'); |
520 | $this->debug($elems->length.' entry-content elems found'); | 520 | $this->debug($elems->length.' entry-content elems found'); |
521 | foreach ($elems as $elem) { | 521 | foreach ($elems as $elem) { |
522 | if (!isset($elem->parentNode)) continue; | 522 | if (!isset($elem->parentNode)) continue; |
523 | $isDescendant = false; | 523 | $isDescendant = false; |
524 | foreach ($this->body->childNodes as $parent) { | 524 | foreach ($this->body->childNodes as $parent) { |
525 | if ($this->isDescendant($parent, $elem)) { | 525 | if ($this->isDescendant($parent, $elem)) { |
526 | $isDescendant = true; | 526 | $isDescendant = true; |
527 | break; | 527 | break; |
528 | } | 528 | } |
529 | } | 529 | } |
530 | if ($isDescendant) { | 530 | if ($isDescendant) { |
531 | $this->debug('Element is child of another body element, skipping.'); | 531 | $this->debug('Element is child of another body element, skipping.'); |
532 | } else { | 532 | } else { |
533 | // prune (clean up elements that may not be content) | 533 | // prune (clean up elements that may not be content) |
534 | if ($this->config->prune()) { | 534 | if ($this->config->prune()) { |
535 | $this->debug('Pruning content'); | 535 | $this->debug('Pruning content'); |
536 | $this->readability->prepArticle($elem); | 536 | $this->readability->prepArticle($elem); |
537 | } | 537 | } |
538 | $this->debug('Element added to body'); | 538 | $this->debug('Element added to body'); |
539 | $this->body->appendChild($elem); | 539 | $this->body->appendChild($elem); |
540 | } | 540 | } |
541 | } | 541 | } |
542 | $detect_body = false; | 542 | $detect_body = false; |
543 | } | 543 | } |
544 | } | 544 | } |
545 | } | 545 | } |
546 | } | 546 | } |
547 | } | 547 | } |
548 | 548 | ||
549 | // check for elements marked with instapaper_title | 549 | // check for elements marked with instapaper_title |
550 | if ($detect_title) { | 550 | if ($detect_title) { |
551 | // check for instapaper_title | 551 | // check for instapaper_title |
552 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); | 552 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); |
553 | if ($elems && $elems->length > 0) { | 553 | if ($elems && $elems->length > 0) { |
554 | $this->title = $elems->item(0)->textContent; | 554 | $this->title = $elems->item(0)->textContent; |
555 | $this->debug('Title found (.instapaper_title): '.$this->title); | 555 | $this->debug('Title found (.instapaper_title): '.$this->title); |
556 | // remove title from document | 556 | // remove title from document |
557 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 557 | $elems->item(0)->parentNode->removeChild($elems->item(0)); |
558 | $detect_title = false; | 558 | $detect_title = false; |
559 | } | 559 | } |
560 | } | 560 | } |
561 | // check for elements marked with instapaper_body | 561 | // check for elements marked with instapaper_body |
562 | if ($detect_body) { | 562 | if ($detect_body) { |
563 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); | 563 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); |
564 | if ($elems && $elems->length > 0) { | 564 | if ($elems && $elems->length > 0) { |
565 | $this->debug('body found (.instapaper_body)'); | 565 | $this->debug('body found (.instapaper_body)'); |
566 | $this->body = $elems->item(0); | 566 | $this->body = $elems->item(0); |
567 | // prune (clean up elements that may not be content) | 567 | // prune (clean up elements that may not be content) |
568 | if ($this->config->prune()) { | 568 | if ($this->config->prune()) { |
569 | $this->debug('Pruning content'); | 569 | $this->debug('Pruning content'); |
570 | $this->readability->prepArticle($this->body); | 570 | $this->readability->prepArticle($this->body); |
571 | } | 571 | } |
572 | $detect_body = false; | 572 | $detect_body = false; |
573 | } | 573 | } |
574 | } | 574 | } |
575 | 575 | ||
576 | // Find author in rel="author" marked element | 576 | // Find author in rel="author" marked element |
577 | // We only use this if there's exactly one. | 577 | // We only use this if there's exactly one. |
578 | // If there's more than one, it could indicate more than | 578 | // If there's more than one, it could indicate more than |
579 | // one author, but it could also indicate that we're processing | 579 | // one author, but it could also indicate that we're processing |
580 | // a page listing different articles with different authors. | 580 | // a page listing different articles with different authors. |
581 | if ($detect_author) { | 581 | if ($detect_author) { |
582 | $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); | 582 | $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); |
583 | if ($elems && $elems->length == 1) { | 583 | if ($elems && $elems->length == 1) { |
584 | $author = trim($elems->item(0)->textContent); | 584 | $author = trim($elems->item(0)->textContent); |
585 | if ($author != '') { | 585 | if ($author != '') { |
586 | $this->debug("Author found (rel=\"author\"): $author"); | 586 | $this->debug("Author found (rel=\"author\"): $author"); |
587 | $this->author[] = $author; | 587 | $this->author[] = $author; |
588 | $detect_author = false; | 588 | $detect_author = false; |
589 | } | 589 | } |
590 | } | 590 | } |
591 | } | 591 | } |
592 | 592 | ||
593 | // Find date in pubdate marked time element | 593 | // Find date in pubdate marked time element |
594 | // For the same reason given above, we only use this | 594 | // For the same reason given above, we only use this |
595 | // if there's exactly one element. | 595 | // if there's exactly one element. |
596 | if ($detect_date) { | 596 | if ($detect_date) { |
597 | $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); | 597 | $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); |
598 | if ($elems && $elems->length == 1) { | 598 | if ($elems && $elems->length == 1) { |
599 | $this->date = strtotime(trim($elems->item(0)->textContent)); | 599 | $this->date = strtotime(trim($elems->item(0)->textContent)); |
600 | // remove date from document | 600 | // remove date from document |
601 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); | 601 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); |
602 | if ($this->date) { | 602 | if ($this->date) { |
603 | $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); | 603 | $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); |
604 | $detect_date = false; | 604 | $detect_date = false; |
605 | } else { | 605 | } else { |
606 | $this->date = null; | 606 | $this->date = null; |
607 | } | 607 | } |
608 | } | 608 | } |
609 | } | 609 | } |
610 | 610 | ||
611 | // still missing title or body, so we detect using Readability | 611 | // still missing title or body, so we detect using Readability |
612 | if ($detect_title || $detect_body) { | 612 | if ($detect_title || $detect_body) { |
613 | $this->debug('Using Readability'); | 613 | $this->debug('Using Readability'); |
614 | // clone body if we're only using Readability for title (otherwise it may interfere with body element) | 614 | // clone body if we're only using Readability for title (otherwise it may interfere with body element) |
615 | if (isset($this->body)) $this->body = $this->body->cloneNode(true); | 615 | if (isset($this->body)) $this->body = $this->body->cloneNode(true); |
616 | $success = $this->readability->init(); | 616 | $success = $this->readability->init(); |
617 | } | 617 | } |
618 | if ($detect_title) { | 618 | if ($detect_title) { |
619 | $this->debug('Detecting title'); | 619 | $this->debug('Detecting title'); |
620 | $this->title = $this->readability->getTitle()->textContent; | 620 | $this->title = $this->readability->getTitle()->textContent; |
621 | } | 621 | } |
622 | if ($detect_body && $success) { | 622 | if ($detect_body && $success) { |
623 | $this->debug('Detecting body'); | 623 | $this->debug('Detecting body'); |
624 | $this->body = $this->readability->getContent(); | 624 | $this->body = $this->readability->getContent(); |
625 | if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { | 625 | if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { |
626 | $this->body = $this->body->firstChild; | 626 | $this->body = $this->body->firstChild; |
627 | } | 627 | } |
628 | // prune (clean up elements that may not be content) | 628 | // prune (clean up elements that may not be content) |
629 | if ($this->config->prune()) { | 629 | if ($this->config->prune()) { |
630 | $this->debug('Pruning content'); | 630 | $this->debug('Pruning content'); |
631 | $this->readability->prepArticle($this->body); | 631 | $this->readability->prepArticle($this->body); |
632 | } | 632 | } |
633 | } | 633 | } |
634 | if (isset($this->body)) { | 634 | if (isset($this->body)) { |
635 | // remove scripts | 635 | // remove scripts |
636 | $this->readability->removeScripts($this->body); | 636 | $this->readability->removeScripts($this->body); |
637 | // remove any h1-h6 elements that appear as first thing in the body | 637 | // remove any h1-h6 elements that appear as first thing in the body |
638 | // and which match our title | 638 | // and which match our title |
639 | if (isset($this->title) && ($this->title != '')) { | 639 | if (isset($this->title) && ($this->title != '')) { |
640 | $firstChild = $this->body->firstChild; | 640 | $firstChild = $this->body->firstChild; |
641 | while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { | 641 | while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { |
642 | $firstChild = $firstChild->nextSibling; | 642 | $firstChild = $firstChild->nextSibling; |
643 | } | 643 | } |
644 | if (($firstChild->nodeType === XML_ELEMENT_NODE) | 644 | if (($firstChild->nodeType === XML_ELEMENT_NODE) |
645 | && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) | 645 | && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) |
646 | && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { | 646 | && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { |
647 | $this->body->removeChild($firstChild); | 647 | $this->body->removeChild($firstChild); |
648 | } | 648 | } |
649 | } | 649 | } |
650 | // prevent self-closing iframes | 650 | // prevent self-closing iframes |
651 | $elems = $this->body->getElementsByTagName('iframe'); | 651 | $elems = $this->body->getElementsByTagName('iframe'); |
652 | for ($i = $elems->length-1; $i >= 0; $i--) { | 652 | for ($i = $elems->length-1; $i >= 0; $i--) { |
653 | $e = $elems->item($i); | 653 | $e = $elems->item($i); |
654 | if (!$e->hasChildNodes()) { | 654 | if (!$e->hasChildNodes()) { |
655 | $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); | 655 | $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); |
656 | } | 656 | } |
657 | } | 657 | } |
658 | // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ | 658 | // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ |
659 | // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src | 659 | // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src |
660 | // inside the data-lazy-src attribute. It also places the original image inside a noscript element | 660 | // inside the data-lazy-src attribute. It also places the original image inside a noscript element |
661 | // next to the amended one. | 661 | // next to the amended one. |
662 | $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); | 662 | $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); |
663 | for ($i = $elems->length-1; $i >= 0; $i--) { | 663 | for ($i = $elems->length-1; $i >= 0; $i--) { |
664 | $e = $elems->item($i); | 664 | $e = $elems->item($i); |
665 | // let's see if we can grab image from noscript | 665 | // let's see if we can grab image from noscript |
666 | if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { | 666 | if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { |
667 | $_new_elem = $e->ownerDocument->createDocumentFragment(); | 667 | $_new_elem = $e->ownerDocument->createDocumentFragment(); |
668 | @$_new_elem->appendXML($e->nextSibling->innerHTML); | 668 | @$_new_elem->appendXML($e->nextSibling->innerHTML); |
669 | $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); | 669 | $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); |
670 | $e->parentNode->removeChild($e); | 670 | $e->parentNode->removeChild($e); |
671 | } else { | 671 | } else { |
672 | // Use data-lazy-src as src value | 672 | // Use data-lazy-src as src value |
673 | $e->setAttribute('src', $e->getAttribute('data-lazy-src')); | 673 | $e->setAttribute('src', $e->getAttribute('data-lazy-src')); |
674 | $e->removeAttribute('data-lazy-src'); | 674 | $e->removeAttribute('data-lazy-src'); |
675 | } | 675 | } |
676 | } | 676 | } |
677 | 677 | ||
678 | $this->success = true; | 678 | $this->success = true; |
679 | } | 679 | } |
680 | 680 | ||
681 | // if we've had no success and we've used tidy, there's a chance | 681 | // if we've had no success and we've used tidy, there's a chance |
682 | // that tidy has messed up. So let's try again without tidy... | 682 | // that tidy has messed up. So let's try again without tidy... |
683 | if (!$this->success && $tidied && $smart_tidy) { | 683 | if (!$this->success && $tidied && $smart_tidy) { |
684 | $this->debug('Trying again without tidy'); | 684 | $this->debug('Trying again without tidy'); |
685 | $this->process($original_html, $url, false); | 685 | $this->process($original_html, $url, false); |
686 | } | 686 | } |
687 | 687 | ||
688 | return $this->success; | 688 | return $this->success; |
689 | } | 689 | } |
690 | 690 | ||
691 | private function isDescendant(DOMElement $parent, DOMElement $child) { | 691 | private function isDescendant(DOMElement $parent, DOMElement $child) { |
692 | $node = $child->parentNode; | 692 | $node = $child->parentNode; |
693 | while ($node != null) { | 693 | while ($node != null) { |
694 | if ($node->isSameNode($parent)) return true; | 694 | if ($node->isSameNode($parent)) return true; |
695 | $node = $node->parentNode; | 695 | $node = $node->parentNode; |
696 | } | 696 | } |
697 | return false; | 697 | return false; |
698 | } | 698 | } |
699 | 699 | ||
700 | public function getContent() { | 700 | public function getContent() { |
701 | return $this->body; | 701 | return $this->body; |
702 | } | 702 | } |
703 | 703 | ||
704 | public function getTitle() { | 704 | public function getTitle() { |
705 | return $this->title; | 705 | return $this->title; |
706 | } | 706 | } |
707 | 707 | ||
708 | public function getAuthors() { | 708 | public function getAuthors() { |
709 | return $this->author; | 709 | return $this->author; |
710 | } | 710 | } |
711 | 711 | ||
712 | public function getLanguage() { | 712 | public function getLanguage() { |
713 | return $this->language; | 713 | return $this->language; |
714 | } | 714 | } |
715 | 715 | ||
716 | public function getDate() { | 716 | public function getDate() { |
717 | return $this->date; | 717 | return $this->date; |
718 | } | 718 | } |
719 | 719 | ||
720 | public function getSiteConfig() { | 720 | public function getSiteConfig() { |
721 | return $this->config; | 721 | return $this->config; |
722 | } | 722 | } |
723 | 723 | ||
724 | public function getNextPageUrl() { | 724 | public function getNextPageUrl() { |
725 | return $this->nextPageUrl; | 725 | return $this->nextPageUrl; |
726 | } | 726 | } |
727 | } | 727 | } \ No newline at end of file |
728 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php index c5e300d7..1f6a7603 100644 --- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php +++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php | |||
@@ -1,338 +1,343 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Site Config | 3 | * Site Config |
4 | * | 4 | * |
5 | * Each instance of this class should hold extraction patterns and other directives | 5 | * Each instance of this class should hold extraction patterns and other directives |
6 | * for a website. See ContentExtractor class to see how it's used. | 6 | * for a website. See ContentExtractor class to see how it's used. |
7 | * | 7 | * |
8 | * @version 0.7 | 8 | * @version 0.8 |
9 | * @date 2012-08-27 | 9 | * @date 2013-04-16 |
10 | * @author Keyvan Minoukadeh | 10 | * @author Keyvan Minoukadeh |
11 | * @copyright 2012 Keyvan Minoukadeh | 11 | * @copyright 2013 Keyvan Minoukadeh |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
13 | */ | 13 | */ |
14 | 14 | ||
15 | class SiteConfig | 15 | class SiteConfig |
16 | { | 16 | { |
17 | // Use first matching element as title (0 or more xpath expressions) | 17 | // Use first matching element as title (0 or more xpath expressions) |
18 | public $title = array(); | 18 | public $title = array(); |
19 | 19 | ||
20 | // Use first matching element as body (0 or more xpath expressions) | 20 | // Use first matching element as body (0 or more xpath expressions) |
21 | public $body = array(); | 21 | public $body = array(); |
22 | 22 | ||
23 | // Use first matching element as author (0 or more xpath expressions) | 23 | // Use first matching element as author (0 or more xpath expressions) |
24 | public $author = array(); | 24 | public $author = array(); |
25 | 25 | ||
26 | // Use first matching element as date (0 or more xpath expressions) | 26 | // Use first matching element as date (0 or more xpath expressions) |
27 | public $date = array(); | 27 | public $date = array(); |
28 | 28 | ||
29 | // Strip elements matching these xpath expressions (0 or more) | 29 | // Strip elements matching these xpath expressions (0 or more) |
30 | public $strip = array(); | 30 | public $strip = array(); |
31 | 31 | ||
32 | // Strip elements which contain these strings (0 or more) in the id or class attribute | 32 | // Strip elements which contain these strings (0 or more) in the id or class attribute |
33 | public $strip_id_or_class = array(); | 33 | public $strip_id_or_class = array(); |
34 | 34 | ||
35 | // Strip images which contain these strings (0 or more) in the src attribute | 35 | // Strip images which contain these strings (0 or more) in the src attribute |
36 | public $strip_image_src = array(); | 36 | public $strip_image_src = array(); |
37 | 37 | ||
38 | // Additional HTTP headers to send | 38 | // Additional HTTP headers to send |
39 | // NOT YET USED | 39 | // NOT YET USED |
40 | public $http_header = array(); | 40 | public $http_header = array(); |
41 | 41 | ||
42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) | 42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) |
43 | public $tidy = null; | 43 | public $tidy = null; |
44 | 44 | ||
45 | protected $default_tidy = true; // used if undeclared | 45 | protected $default_tidy = true; // used if undeclared |
46 | 46 | ||
47 | // Autodetect title/body if xpath expressions fail to produce results. | 47 | // Autodetect title/body if xpath expressions fail to produce results. |
48 | // Note that this applies to title and body separately, i.e. | 48 | // Note that this applies to title and body separately, i.e. |
49 | // * if we get a body match but no title match, this option will determine whether we autodetect title | 49 | // * if we get a body match but no title match, this option will determine whether we autodetect title |
50 | // * if neither match, this determines whether we autodetect title and body. | 50 | // * if neither match, this determines whether we autodetect title and body. |
51 | // Also note that this only applies when there is at least one xpath expression in title or body, i.e. | 51 | // Also note that this only applies when there is at least one xpath expression in title or body, i.e. |
52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) | 52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) |
53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. | 53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. |
54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). | 54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). |
55 | // bool or null if undeclared | 55 | // bool or null if undeclared |
56 | public $autodetect_on_failure = null; | 56 | public $autodetect_on_failure = null; |
57 | protected $default_autodetect_on_failure = true; // used if undeclared | 57 | protected $default_autodetect_on_failure = true; // used if undeclared |
58 | 58 | ||
59 | // Clean up content block - attempt to remove elements that appear to be superfluous | 59 | // Clean up content block - attempt to remove elements that appear to be superfluous |
60 | // bool or null if undeclared | 60 | // bool or null if undeclared |
61 | public $prune = null; | 61 | public $prune = null; |
62 | protected $default_prune = true; // used if undeclared | 62 | protected $default_prune = true; // used if undeclared |
63 | 63 | ||
64 | // Test URL - if present, can be used to test the config above | 64 | // Test URL - if present, can be used to test the config above |
65 | public $test_url = array(); | 65 | public $test_url = array(); |
66 | 66 | ||
67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article | 67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article |
68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to | 68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to |
69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page | 69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page |
70 | // which displays the entire article on one page (e.g. 'print view'). | 70 | // which displays the entire article on one page (e.g. 'print view'). |
71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, | 71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, |
72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. | 72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. |
73 | public $single_page_link = array(); | 73 | public $single_page_link = array(); |
74 | 74 | ||
75 | public $next_page_link = array(); | 75 | public $next_page_link = array(); |
76 | 76 | ||
77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed | 77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed |
78 | public $single_page_link_in_feed = array(); | 78 | public $single_page_link_in_feed = array(); |
79 | 79 | ||
80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') | 80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') |
81 | // string or null if undeclared | 81 | // string or null if undeclared |
82 | public $parser = null; | 82 | public $parser = null; |
83 | protected $default_parser = 'libxml'; // used if undeclared | 83 | protected $default_parser = 'libxml'; // used if undeclared |
84 | 84 | ||
85 | // Strings to search for in HTML before processing begins (used with $replace_string) | 85 | // Strings to search for in HTML before processing begins (used with $replace_string) |
86 | public $find_string = array(); | 86 | public $find_string = array(); |
87 | // Strings to replace those found in $find_string before HTML processing begins | 87 | // Strings to replace those found in $find_string before HTML processing begins |
88 | public $replace_string = array(); | 88 | public $replace_string = array(); |
89 | 89 | ||
90 | // the options below cannot be set in the config files which this class represents | 90 | // the options below cannot be set in the config files which this class represents |
91 | 91 | ||
92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not | 92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not |
93 | public $cache_key = null; | 93 | public $cache_key = null; |
94 | public static $debug = false; | 94 | public static $debug = false; |
95 | protected static $apc = false; | 95 | protected static $apc = false; |
96 | protected static $config_path; | 96 | protected static $config_path; |
97 | protected static $config_path_fallback; | 97 | protected static $config_path_fallback; |
98 | protected static $config_cache = array(); | 98 | protected static $config_cache = array(); |
99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; | 99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; |
100 | 100 | ||
101 | protected static function debug($msg) { | 101 | protected static function debug($msg) { |
102 | if (self::$debug) { | 102 | if (self::$debug) { |
103 | //$mem = round(memory_get_usage()/1024, 2); | 103 | //$mem = round(memory_get_usage()/1024, 2); |
104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); | 104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); |
105 | echo '* ',$msg; | 105 | echo '* ',$msg; |
106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; | 106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; |
107 | echo "\n"; | 107 | echo "\n"; |
108 | ob_flush(); | 108 | ob_flush(); |
109 | flush(); | 109 | flush(); |
110 | } | 110 | } |
111 | } | 111 | } |
112 | 112 | ||
113 | // enable APC caching of certain site config files? | 113 | // enable APC caching of certain site config files? |
114 | // If enabled the following site config files will be | 114 | // If enabled the following site config files will be |
115 | // cached in APC cache (when requested for first time): | 115 | // cached in APC cache (when requested for first time): |
116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ | 116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ |
117 | // * the site config files associated with HTML fingerprints | 117 | // * the site config files associated with HTML fingerprints |
118 | // * the global site config file | 118 | // * the global site config file |
119 | // returns true if enabled, false otherwise | 119 | // returns true if enabled, false otherwise |
120 | public static function use_apc($apc=true) { | 120 | public static function use_apc($apc=true) { |
121 | if (!function_exists('apc_add')) { | 121 | if (!function_exists('apc_add')) { |
122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); | 122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); |
123 | return false; | 123 | return false; |
124 | } | 124 | } |
125 | self::$apc = $apc; | 125 | self::$apc = $apc; |
126 | return $apc; | 126 | return $apc; |
127 | } | 127 | } |
128 | 128 | ||
129 | // return bool or null | 129 | // return bool or null |
130 | public function tidy($use_default=true) { | 130 | public function tidy($use_default=true) { |
131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; | 131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; |
132 | return $this->tidy; | 132 | return $this->tidy; |
133 | } | 133 | } |
134 | 134 | ||
135 | // return bool or null | 135 | // return bool or null |
136 | public function prune($use_default=true) { | 136 | public function prune($use_default=true) { |
137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; | 137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; |
138 | return $this->prune; | 138 | return $this->prune; |
139 | } | 139 | } |
140 | 140 | ||
141 | // return string or null | 141 | // return string or null |
142 | public function parser($use_default=true) { | 142 | public function parser($use_default=true) { |
143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; | 143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; |
144 | return $this->parser; | 144 | return $this->parser; |
145 | } | 145 | } |
146 | 146 | ||
147 | // return bool or null | 147 | // return bool or null |
148 | public function autodetect_on_failure($use_default=true) { | 148 | public function autodetect_on_failure($use_default=true) { |
149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; | 149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; |
150 | return $this->autodetect_on_failure; | 150 | return $this->autodetect_on_failure; |
151 | } | 151 | } |
152 | 152 | ||
153 | public static function set_config_path($path, $fallback=null) { | 153 | public static function set_config_path($path, $fallback=null) { |
154 | self::$config_path = $path; | 154 | self::$config_path = $path; |
155 | self::$config_path_fallback = $fallback; | 155 | self::$config_path_fallback = $fallback; |
156 | } | 156 | } |
157 | 157 | ||
158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { | 158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { |
159 | $key = strtolower($key); | 159 | $key = strtolower($key); |
160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
161 | if ($config->cache_key) $key = $config->cache_key; | 161 | if ($config->cache_key) $key = $config->cache_key; |
162 | self::$config_cache[$key] = $config; | 162 | self::$config_cache[$key] = $config; |
163 | if (self::$apc && $use_apc) { | 163 | if (self::$apc && $use_apc) { |
164 | self::debug("Adding site config to APC cache with key sc.$key"); | 164 | self::debug("Adding site config to APC cache with key sc.$key"); |
165 | apc_add("sc.$key", $config); | 165 | apc_add("sc.$key", $config); |
166 | } | 166 | } |
167 | self::debug("Cached site config with key $key"); | 167 | self::debug("Cached site config with key $key"); |
168 | } | 168 | } |
169 | 169 | ||
170 | public static function is_cached($key) { | 170 | public static function is_cached($key) { |
171 | $key = strtolower($key); | 171 | $key = strtolower($key); |
172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
173 | if (array_key_exists($key, self::$config_cache)) { | 173 | if (array_key_exists($key, self::$config_cache)) { |
174 | return true; | 174 | return true; |
175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { | 175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { |
176 | return true; | 176 | return true; |
177 | } | 177 | } |
178 | return false; | 178 | return false; |
179 | } | 179 | } |
180 | 180 | ||
181 | public function append(SiteConfig $newconfig) { | 181 | public function append(SiteConfig $newconfig) { |
182 | // check for commands where we accept multiple statements (no test_url) | 182 | // check for commands where we accept multiple statements (no test_url) |
183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { | 183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { |
184 | // append array elements for this config variable from $newconfig to this config | 184 | // append array elements for this config variable from $newconfig to this config |
185 | //$this->$var = $this->$var + $newconfig->$var; | 185 | //$this->$var = $this->$var + $newconfig->$var; |
186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); | 186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); |
187 | } | 187 | } |
188 | // check for single statement commands | 188 | // check for single statement commands |
189 | // we do not overwrite existing non null values | 189 | // we do not overwrite existing non null values |
190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { | 190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { |
191 | if ($this->$var === null) $this->$var = $newconfig->$var; | 191 | if ($this->$var === null) $this->$var = $newconfig->$var; |
192 | } | 192 | } |
193 | } | 193 | // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) |
194 | 194 | foreach (array('find_string', 'replace_string') as $var) { | |
195 | // returns SiteConfig instance if an appropriate one is found, false otherwise | 195 | // append array elements for this config variable from $newconfig to this config |
196 | // if $exact_host_match is true, we will not look for wildcard config matches | 196 | //$this->$var = $this->$var + $newconfig->$var; |
197 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists | 197 | $this->$var = array_merge($this->$var, $newconfig->$var); |
198 | public static function build($host, $exact_host_match=false) { | 198 | } |
199 | $host = strtolower($host); | 199 | } |
200 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); | 200 | |
201 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; | 201 | // returns SiteConfig instance if an appropriate one is found, false otherwise |
202 | // check for site configuration | 202 | // if $exact_host_match is true, we will not look for wildcard config matches |
203 | $try = array($host); | 203 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists |
204 | // should we look for wildcard matches | 204 | public static function build($host, $exact_host_match=false) { |
205 | if (!$exact_host_match) { | 205 | $host = strtolower($host); |
206 | $split = explode('.', $host); | 206 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); |
207 | if (count($split) > 1) { | 207 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; |
208 | array_shift($split); | 208 | // check for site configuration |
209 | $try[] = '.'.implode('.', $split); | 209 | $try = array($host); |
210 | } | 210 | // should we look for wildcard matches |
211 | } | 211 | if (!$exact_host_match) { |
212 | 212 | $split = explode('.', $host); | |
213 | // look for site config file in primary folder | 213 | if (count($split) > 1) { |
214 | self::debug(". looking for site config for $host in primary folder"); | 214 | array_shift($split); |
215 | foreach ($try as $h) { | 215 | $try[] = '.'.implode('.', $split); |
216 | if (array_key_exists($h, self::$config_cache)) { | 216 | } |
217 | self::debug("... site config for $h already loaded in this request"); | 217 | } |
218 | return self::$config_cache[$h]; | 218 | |
219 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { | 219 | // look for site config file in primary folder |
220 | self::debug("... site config for $h in APC cache"); | 220 | self::debug(". looking for site config for $host in primary folder"); |
221 | return $sconfig; | 221 | foreach ($try as $h) { |
222 | } elseif (file_exists(self::$config_path."/$h.txt")) { | 222 | if (array_key_exists($h, self::$config_cache)) { |
223 | self::debug("... found site config ($h.txt)"); | 223 | self::debug("... site config for $h already loaded in this request"); |
224 | $file_primary = self::$config_path."/$h.txt"; | 224 | return self::$config_cache[$h]; |
225 | $matched_name = $h; | 225 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { |
226 | break; | 226 | self::debug("... site config for $h in APC cache"); |
227 | } | 227 | return $sconfig; |
228 | } | 228 | } elseif (file_exists(self::$config_path."/$h.txt")) { |
229 | 229 | self::debug("... found site config ($h.txt)"); | |
230 | // if we found site config, process it | 230 | $file_primary = self::$config_path."/$h.txt"; |
231 | if (isset($file_primary)) { | 231 | $matched_name = $h; |
232 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 232 | break; |
233 | if (!$config_lines || !is_array($config_lines)) return false; | 233 | } |
234 | $config = self::build_from_array($config_lines); | 234 | } |
235 | // if APC caching is available and enabled, mark this for cache | 235 | |
236 | //$config->cache_in_apc = true; | 236 | // if we found site config, process it |
237 | $config->cache_key = $matched_name; | 237 | if (isset($file_primary)) { |
238 | 238 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | |
239 | // if autodetect on failure is off (on by default) we do not need to look | 239 | if (!$config_lines || !is_array($config_lines)) return false; |
240 | // in secondary folder | 240 | $config = self::build_from_array($config_lines); |
241 | if (!$config->autodetect_on_failure()) { | 241 | // if APC caching is available and enabled, mark this for cache |
242 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); | 242 | //$config->cache_in_apc = true; |
243 | return $config; | 243 | $config->cache_key = $matched_name; |
244 | } | 244 | |
245 | } | 245 | // if autodetect on failure is off (on by default) we do not need to look |
246 | 246 | // in secondary folder | |
247 | // look for site config file in secondary folder | 247 | if (!$config->autodetect_on_failure()) { |
248 | if (isset(self::$config_path_fallback)) { | 248 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); |
249 | self::debug(". looking for site config for $host in secondary folder"); | 249 | return $config; |
250 | foreach ($try as $h) { | 250 | } |
251 | if (file_exists(self::$config_path_fallback."/$h.txt")) { | 251 | } |
252 | self::debug("... found site config in secondary folder ($h.txt)"); | 252 | |
253 | $file_secondary = self::$config_path_fallback."/$h.txt"; | 253 | // look for site config file in secondary folder |
254 | $matched_name = $h; | 254 | if (isset(self::$config_path_fallback)) { |
255 | break; | 255 | self::debug(". looking for site config for $host in secondary folder"); |
256 | } | 256 | foreach ($try as $h) { |
257 | } | 257 | if (file_exists(self::$config_path_fallback."/$h.txt")) { |
258 | if (!isset($file_secondary)) { | 258 | self::debug("... found site config in secondary folder ($h.txt)"); |
259 | self::debug("... no site config match in secondary folder"); | 259 | $file_secondary = self::$config_path_fallback."/$h.txt"; |
260 | } | 260 | $matched_name = $h; |
261 | } | 261 | break; |
262 | 262 | } | |
263 | // return false if no config file found | 263 | } |
264 | if (!isset($file_primary) && !isset($file_secondary)) { | 264 | if (!isset($file_secondary)) { |
265 | self::debug("... no site config match for $host"); | 265 | self::debug("... no site config match in secondary folder"); |
266 | return false; | 266 | } |
267 | } | 267 | } |
268 | 268 | ||
269 | // return primary config if secondary not found | 269 | // return false if no config file found |
270 | if (!isset($file_secondary) && isset($config)) { | 270 | if (!isset($file_primary) && !isset($file_secondary)) { |
271 | return $config; | 271 | self::debug("... no site config match for $host"); |
272 | } | 272 | return false; |
273 | 273 | } | |
274 | // process secondary config file | 274 | |
275 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 275 | // return primary config if secondary not found |
276 | if (!$config_lines || !is_array($config_lines)) { | 276 | if (!isset($file_secondary) && isset($config)) { |
277 | // failed to process secondary | 277 | return $config; |
278 | if (isset($config)) { | 278 | } |
279 | // return primary config | 279 | |
280 | return $config; | 280 | // process secondary config file |
281 | } else { | 281 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
282 | return false; | 282 | if (!$config_lines || !is_array($config_lines)) { |
283 | } | 283 | // failed to process secondary |
284 | } | 284 | if (isset($config)) { |
285 | 285 | // return primary config | |
286 | // merge with primary and return | 286 | return $config; |
287 | if (isset($config)) { | 287 | } else { |
288 | self::debug('. merging config files'); | 288 | return false; |
289 | $config->append(self::build_from_array($config_lines)); | 289 | } |
290 | return $config; | 290 | } |
291 | } else { | 291 | |
292 | // return just secondary | 292 | // merge with primary and return |
293 | $config = self::build_from_array($config_lines); | 293 | if (isset($config)) { |
294 | // if APC caching is available and enabled, mark this for cache | 294 | self::debug('. merging config files'); |
295 | //$config->cache_in_apc = true; | 295 | $config->append(self::build_from_array($config_lines)); |
296 | $config->cache_key = $matched_name; | 296 | return $config; |
297 | return $config; | 297 | } else { |
298 | } | 298 | // return just secondary |
299 | } | 299 | $config = self::build_from_array($config_lines); |
300 | 300 | // if APC caching is available and enabled, mark this for cache | |
301 | public static function build_from_array(array $lines) { | 301 | //$config->cache_in_apc = true; |
302 | $config = new SiteConfig(); | 302 | $config->cache_key = $matched_name; |
303 | foreach ($lines as $line) { | 303 | return $config; |
304 | $line = trim($line); | 304 | } |
305 | 305 | } | |
306 | // skip comments, empty lines | 306 | |
307 | if ($line == '' || $line[0] == '#') continue; | 307 | public static function build_from_array(array $lines) { |
308 | 308 | $config = new SiteConfig(); | |
309 | // get command | 309 | foreach ($lines as $line) { |
310 | $command = explode(':', $line, 2); | 310 | $line = trim($line); |
311 | // if there's no colon ':', skip this line | 311 | |
312 | if (count($command) != 2) continue; | 312 | // skip comments, empty lines |
313 | $val = trim($command[1]); | 313 | if ($line == '' || $line[0] == '#') continue; |
314 | $command = trim($command[0]); | 314 | |
315 | if ($command == '' || $val == '') continue; | 315 | // get command |
316 | 316 | $command = explode(':', $line, 2); | |
317 | // check for commands where we accept multiple statements | 317 | // if there's no colon ':', skip this line |
318 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { | 318 | if (count($command) != 2) continue; |
319 | array_push($config->$command, $val); | 319 | $val = trim($command[1]); |
320 | // check for single statement commands that evaluate to true or false | 320 | $command = trim($command[0]); |
321 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { | 321 | if ($command == '' || $val == '') continue; |
322 | $config->$command = ($val == 'yes'); | 322 | |
323 | // check for single statement commands stored as strings | 323 | // check for commands where we accept multiple statements |
324 | } elseif (in_array($command, array('parser'))) { | 324 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { |
325 | $config->$command = $val; | 325 | array_push($config->$command, $val); |
326 | // check for replace_string(find): replace | 326 | // check for single statement commands that evaluate to true or false |
327 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { | 327 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { |
328 | if (in_array($match[1], array('replace_string'))) { | 328 | $config->$command = ($val == 'yes'); |
329 | $command = $match[1]; | 329 | // check for single statement commands stored as strings |
330 | array_push($config->find_string, $match[2]); | 330 | } elseif (in_array($command, array('parser'))) { |
331 | array_push($config->$command, $val); | 331 | $config->$command = $val; |
332 | } | 332 | // check for replace_string(find): replace |
333 | } | 333 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { |
334 | } | 334 | if (in_array($match[1], array('replace_string'))) { |
335 | return $config; | 335 | $command = $match[1]; |
336 | } | 336 | array_push($config->find_string, $match[2]); |
337 | } | 337 | array_push($config->$command, $val); |
338 | ?> \ No newline at end of file | 338 | } |
339 | } | ||
340 | } | ||
341 | return $config; | ||
342 | } | ||
343 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php index 3487423f..40786598 100644..100755 --- a/inc/3rdparty/libraries/feedwriter/FeedItem.php +++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php | |||
@@ -1,7 +1,7 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Univarsel Feed Writer | 3 | * Univarsel Feed Writer |
4 | * | 4 | * |
5 | * FeedItem class - Used as feed element in FeedWriter class | 5 | * FeedItem class - Used as feed element in FeedWriter class |
6 | * | 6 | * |
7 | * @package UnivarselFeedWriter | 7 | * @package UnivarselFeedWriter |
@@ -12,20 +12,20 @@ | |||
12 | { | 12 | { |
13 | private $elements = array(); //Collection of feed elements | 13 | private $elements = array(); //Collection of feed elements |
14 | private $version; | 14 | private $version; |
15 | 15 | ||
16 | /** | 16 | /** |
17 | * Constructor | 17 | * Constructor |
18 | * | 18 | * |
19 | * @param contant (RSS1/RSS2/ATOM) RSS2 is default. | 19 | * @param contant (RSS1/RSS2/ATOM) RSS2 is default. |
20 | */ | 20 | */ |
21 | function __construct($version = RSS2) | 21 | function __construct($version = RSS2) |
22 | { | 22 | { |
23 | $this->version = $version; | 23 | $this->version = $version; |
24 | } | 24 | } |
25 | 25 | ||
26 | /** | 26 | /** |
27 | * Set element (overwrites existing elements with $elementName) | 27 | * Set element (overwrites existing elements with $elementName) |
28 | * | 28 | * |
29 | * @access public | 29 | * @access public |
30 | * @param srting The tag name of an element | 30 | * @param srting The tag name of an element |
31 | * @param srting The content of tag | 31 | * @param srting The content of tag |
@@ -38,11 +38,11 @@ | |||
38 | unset($this->elements[$elementName]); | 38 | unset($this->elements[$elementName]); |
39 | } | 39 | } |
40 | $this->addElement($elementName, $content, $attributes); | 40 | $this->addElement($elementName, $content, $attributes); |
41 | } | 41 | } |
42 | 42 | ||
43 | /** | 43 | /** |
44 | * Add an element to elements array | 44 | * Add an element to elements array |
45 | * | 45 | * |
46 | * @access public | 46 | * @access public |
47 | * @param srting The tag name of an element | 47 | * @param srting The tag name of an element |
48 | * @param srting The content of tag | 48 | * @param srting The content of tag |
@@ -61,11 +61,11 @@ | |||
61 | $this->elements[$elementName][$i]['content'] = $content; | 61 | $this->elements[$elementName][$i]['content'] = $content; |
62 | $this->elements[$elementName][$i]['attributes'] = $attributes; | 62 | $this->elements[$elementName][$i]['attributes'] = $attributes; |
63 | } | 63 | } |
64 | 64 | ||
65 | /** | 65 | /** |
66 | * Set multiple feed elements from an array. | 66 | * Set multiple feed elements from an array. |
67 | * Elements which have attributes cannot be added by this method | 67 | * Elements which have attributes cannot be added by this method |
68 | * | 68 | * |
69 | * @access public | 69 | * @access public |
70 | * @param array array of elements in 'tagName' => 'tagContent' format. | 70 | * @param array array of elements in 'tagName' => 'tagContent' format. |
71 | * @return void | 71 | * @return void |
@@ -73,15 +73,15 @@ | |||
73 | public function addElementArray($elementArray) | 73 | public function addElementArray($elementArray) |
74 | { | 74 | { |
75 | if(! is_array($elementArray)) return; | 75 | if(! is_array($elementArray)) return; |
76 | foreach ($elementArray as $elementName => $content) | 76 | foreach ($elementArray as $elementName => $content) |
77 | { | 77 | { |
78 | $this->addElement($elementName, $content); | 78 | $this->addElement($elementName, $content); |
79 | } | 79 | } |
80 | } | 80 | } |
81 | 81 | ||
82 | /** | 82 | /** |
83 | * Return the collection of elements in this feed item | 83 | * Return the collection of elements in this feed item |
84 | * | 84 | * |
85 | * @access public | 85 | * @access public |
86 | * @return array | 86 | * @return array |
87 | */ | 87 | */ |
@@ -89,68 +89,74 @@ | |||
89 | { | 89 | { |
90 | return $this->elements; | 90 | return $this->elements; |
91 | } | 91 | } |
92 | 92 | ||
93 | // Wrapper functions ------------------------------------------------------ | 93 | // Wrapper functions ------------------------------------------------------ |
94 | 94 | ||
95 | /** | 95 | /** |
96 | * Set the 'dscription' element of feed item | 96 | * Set the 'dscription' element of feed item |
97 | * | 97 | * |
98 | * @access public | 98 | * @access public |
99 | * @param string The content of 'description' element | 99 | * @param string The content of 'description' element |
100 | * @return void | 100 | * @return void |
101 | */ | 101 | */ |
102 | public function setDescription($description) | 102 | public function setDescription($description) |
103 | { | 103 | { |
104 | $this->setElement('description', $description); | 104 | $tag = ($this->version == ATOM)? 'summary' : 'description'; |
105 | $this->setElement($tag, $description); | ||
105 | } | 106 | } |
106 | 107 | ||
107 | /** | 108 | /** |
108 | * @desc Set the 'title' element of feed item | 109 | * @desc Set the 'title' element of feed item |
109 | * @access public | 110 | * @access public |
110 | * @param string The content of 'title' element | 111 | * @param string The content of 'title' element |
111 | * @return void | 112 | * @return void |
112 | */ | 113 | */ |
113 | public function setTitle($title) | 114 | public function setTitle($title) |
114 | { | 115 | { |
115 | $this->setElement('title', $title); | 116 | $this->setElement('title', $title); |
116 | } | 117 | } |
117 | 118 | ||
118 | /** | 119 | /** |
119 | * Set the 'date' element of feed item | 120 | * Set the 'date' element of feed item |
120 | * | 121 | * |
121 | * @access public | 122 | * @access public |
122 | * @param string The content of 'date' element | 123 | * @param string The content of 'date' element |
123 | * @return void | 124 | * @return void |
124 | */ | 125 | */ |
125 | public function setDate($date) | 126 | public function setDate($date) |
126 | { | 127 | { |
127 | if(! is_numeric($date)) | 128 | if(! is_numeric($date)) |
128 | { | 129 | { |
129 | $date = strtotime($date); | 130 | $date = strtotime($date); |
130 | } | 131 | } |
131 | 132 | ||
132 | if($this->version == RSS2) | 133 | if($this->version == ATOM) |
134 | { | ||
135 | $tag = 'updated'; | ||
136 | $value = date(DATE_ATOM, $date); | ||
137 | } | ||
138 | elseif($this->version == RSS2) | ||
133 | { | 139 | { |
134 | $tag = 'pubDate'; | 140 | $tag = 'pubDate'; |
135 | $value = date(DATE_RSS, $date); | 141 | $value = date(DATE_RSS, $date); |
136 | } | 142 | } |
137 | else | 143 | else |
138 | { | 144 | { |
139 | $tag = 'dc:date'; | 145 | $tag = 'dc:date'; |
140 | $value = date("Y-m-d", $date); | 146 | $value = date("Y-m-d", $date); |
141 | } | 147 | } |
142 | 148 | ||
143 | $this->setElement($tag, $value); | 149 | $this->setElement($tag, $value); |
144 | } | 150 | } |
145 | 151 | ||
146 | /** | 152 | /** |
147 | * Set the 'link' element of feed item | 153 | * Set the 'link' element of feed item |
148 | * | 154 | * |
149 | * @access public | 155 | * @access public |
150 | * @param string The content of 'link' element | 156 | * @param string The content of 'link' element |
151 | * @return void | 157 | * @return void |
152 | */ | 158 | */ |
153 | public function setLink($link) | 159 | public function setLink($link) |
154 | { | 160 | { |
155 | if($this->version == RSS2 || $this->version == RSS1) | 161 | if($this->version == RSS2 || $this->version == RSS1) |
156 | { | 162 | { |
@@ -161,26 +167,27 @@ | |||
161 | { | 167 | { |
162 | $this->setElement('link','',array('href'=>$link)); | 168 | $this->setElement('link','',array('href'=>$link)); |
163 | $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); | 169 | $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); |
164 | } | 170 | } |
165 | 171 | ||
166 | } | 172 | } |
167 | 173 | ||
168 | /** | 174 | /** |
169 | * Set the 'source' element of feed item | 175 | * Set the 'source' element of feed item |
170 | * | 176 | * |
171 | * @access public | 177 | * @access public |
172 | * @param string The content of 'source' element | 178 | * @param string The content of 'source' element |
173 | * @return void | 179 | * @return void |
174 | */ | 180 | */ |
175 | public function setSource($link) | 181 | public function setSource($link) |
176 | { | 182 | { |
177 | $this->setElement('source', $link); | 183 | $attributes = array('url'=>$link); |
184 | $this->setElement('source', "wallabag",$attributes); | ||
178 | } | 185 | } |
179 | 186 | ||
180 | /** | 187 | /** |
181 | * Set the 'encloser' element of feed item | 188 | * Set the 'encloser' element of feed item |
182 | * For RSS 2.0 only | 189 | * For RSS 2.0 only |
183 | * | 190 | * |
184 | * @access public | 191 | * @access public |
185 | * @param string The url attribute of encloser tag | 192 | * @param string The url attribute of encloser tag |
186 | * @param string The length attribute of encloser tag | 193 | * @param string The length attribute of encloser tag |
@@ -192,6 +199,6 @@ | |||
192 | $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); | 199 | $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); |
193 | $this->setElement('enclosure','',$attributes); | 200 | $this->setElement('enclosure','',$attributes); |
194 | } | 201 | } |
195 | 202 | ||
196 | } // end of class FeedItem | 203 | } // end of class FeedItem |
197 | ?> \ No newline at end of file | 204 | ?> \ No newline at end of file |
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index df4c8b4b..aa064afb 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php | |||
@@ -87,20 +87,26 @@ define('JSONP', 3, true); | |||
87 | * @access public | 87 | * @access public |
88 | * @return void | 88 | * @return void |
89 | */ | 89 | */ |
90 | public function genarateFeed() | 90 | public function genarateFeed($withHeaders = true) |
91 | { | 91 | { |
92 | if ($this->version == RSS2) { | 92 | if ($withHeaders) { |
93 | // header('Content-type: text/xml; charset=UTF-8'); | 93 | if ($this->version == RSS2) { |
94 | // this line prevents Chrome 20 from prompting download | 94 | header('Content-type: text/xml; charset=UTF-8'); |
95 | // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss | 95 | // this line prevents Chrome 20 from prompting download |
96 | // header('X-content-type-options: nosniff'); | 96 | // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss |
97 | } elseif ($this->version == JSON) { | 97 | header('X-content-type-options: nosniff'); |
98 | // header('Content-type: application/json; charset=UTF-8'); | 98 | } elseif ($this->version == JSON) { |
99 | $this->json = new stdClass(); | 99 | header('Content-type: application/json; charset=UTF-8'); |
100 | } elseif ($this->version == JSONP) { | 100 | } elseif ($this->version == JSONP) { |
101 | // header('Content-type: application/javascript; charset=UTF-8'); | 101 | header('Content-type: application/javascript; charset=UTF-8'); |
102 | $this->json = new stdClass(); | 102 | } |
103 | } | 103 | } |
104 | |||
105 | if ($this->version == JSON || $this->version == JSONP) { | ||
106 | $this->json = new stdClass(); | ||
107 | } | ||
108 | |||
109 | |||
104 | $this->printHead(); | 110 | $this->printHead(); |
105 | $this->printChannels(); | 111 | $this->printChannels(); |
106 | $this->printItems(); | 112 | $this->printItems(); |
@@ -110,6 +116,11 @@ define('JSONP', 3, true); | |||
110 | } | 116 | } |
111 | } | 117 | } |
112 | 118 | ||
119 | public function &getItems() | ||
120 | { | ||
121 | return $this->items; | ||
122 | } | ||
123 | |||
113 | /** | 124 | /** |
114 | * Create a new FeedItem. | 125 | * Create a new FeedItem. |
115 | * | 126 | * |
@@ -193,7 +204,8 @@ define('JSONP', 3, true); | |||
193 | */ | 204 | */ |
194 | public function setDescription($description) | 205 | public function setDescription($description) |
195 | { | 206 | { |
196 | $this->setChannelElement('description', $description); | 207 | $tag = ($this->version == ATOM)? 'subtitle' : 'description'; |
208 | $this->setChannelElement($tag, $description); | ||
197 | } | 209 | } |
198 | 210 | ||
199 | /** | 211 | /** |
@@ -238,7 +250,7 @@ define('JSONP', 3, true); | |||
238 | { | 250 | { |
239 | $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; | 251 | $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; |
240 | if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; | 252 | if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; |
241 | $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; | 253 | $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; |
242 | echo $out; | 254 | echo $out; |
243 | } | 255 | } |
244 | elseif ($this->version == JSON || $this->version == JSONP) | 256 | elseif ($this->version == JSON || $this->version == JSONP) |
diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php index 2f5244f9..c4a48b21 100644 --- a/inc/3rdparty/libraries/html5/TreeBuilder.php +++ b/inc/3rdparty/libraries/html5/TreeBuilder.php | |||
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder { | |||
134 | 134 | ||
135 | // Namespaces for foreign content | 135 | // Namespaces for foreign content |
136 | const NS_HTML = null; // to prevent DOM from requiring NS on everything | 136 | const NS_HTML = null; // to prevent DOM from requiring NS on everything |
137 | const NS_XHTML = 'http://www.w3.org/1999/xhtml'; | ||
137 | const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; | 138 | const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; |
138 | const NS_SVG = 'http://www.w3.org/2000/svg'; | 139 | const NS_SVG = 'http://www.w3.org/2000/svg'; |
139 | const NS_XLINK = 'http://www.w3.org/1999/xlink'; | 140 | const NS_XLINK = 'http://www.w3.org/1999/xlink'; |
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder { | |||
3157 | } | 3158 | } |
3158 | 3159 | ||
3159 | private function insertElement($token, $append = true) { | 3160 | private function insertElement($token, $append = true) { |
3160 | $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); | 3161 | //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']); |
3162 | $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML; | ||
3163 | $el = $this->dom->createElementNS($namespaceURI, $token['name']); | ||
3161 | 3164 | ||
3162 | if (!empty($token['attr'])) { | 3165 | if (!empty($token['attr'])) { |
3163 | foreach($token['attr'] as $attr) { | 3166 | foreach($token['attr'] as $attr) { |
3164 | if(!$el->hasAttribute($attr['name'])) { | 3167 | |
3168 | // mike@macgirvin.com 2011-11-17, check attribute name for | ||
3169 | // validity (ignoring extenders and combiners) as illegal chars in names | ||
3170 | // causes everything to abort | ||
3171 | |||
3172 | $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']); | ||
3173 | if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) { | ||
3165 | $el->setAttribute($attr['name'], $attr['value']); | 3174 | $el->setAttribute($attr['name'], $attr['value']); |
3166 | } | 3175 | } |
3167 | } | 3176 | } |
diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php index 83e94f14..e4d5f495 100644 --- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php +++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php | |||
@@ -1,404 +1,403 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Cookie Jar | 3 | * Cookie Jar |
4 | * | 4 | * |
5 | * PHP class for handling cookies, as defined by the Netscape spec: | 5 | * PHP class for handling cookies, as defined by the Netscape spec: |
6 | * <http://curl.haxx.se/rfc/cookie_spec.html> | 6 | * <http://curl.haxx.se/rfc/cookie_spec.html> |
7 | * | 7 | * |
8 | * This class should be used to handle cookies (storing cookies from HTTP response messages, and | 8 | * This class should be used to handle cookies (storing cookies from HTTP response messages, and |
9 | * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org | 9 | * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org |
10 | * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ | 10 | * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ |
11 | * | 11 | * |
12 | * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ | 12 | * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ |
13 | * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. | 13 | * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. |
14 | * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. | 14 | * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. |
15 | * | 15 | * |
16 | * @version 0.5 | 16 | * @version 0.5 |
17 | * @date 2011-03-15 | 17 | * @date 2011-03-15 |
18 | * @see http://php.net/HttpRequestPool | 18 | * @see http://php.net/HttpRequestPool |
19 | * @author Keyvan Minoukadeh | 19 | * @author Keyvan Minoukadeh |
20 | * @copyright 2011 Keyvan Minoukadeh | 20 | * @copyright 2011 Keyvan Minoukadeh |
21 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 21 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
22 | */ | 22 | */ |
23 | 23 | ||
24 | class CookieJar | 24 | class CookieJar |
25 | { | 25 | { |
26 | /** | 26 | /** |
27 | * Cookies - array containing all cookies. | 27 | * Cookies - array containing all cookies. |
28 | * | 28 | * |
29 | * <pre> | 29 | * <pre> |
30 | * Cookies are stored like this: | 30 | * Cookies are stored like this: |
31 | * [domain][path][name] = array | 31 | * [domain][path][name] = array |
32 | * where array is: | 32 | * where array is: |
33 | * 0 => value, 1 => secure, 2 => expires | 33 | * 0 => value, 1 => secure, 2 => expires |
34 | * </pre> | 34 | * </pre> |
35 | * @var array | 35 | * @var array |
36 | * @access private | 36 | * @access private |
37 | */ | 37 | */ |
38 | public $cookies = array(); | 38 | public $cookies = array(); |
39 | public $debug = false; | 39 | public $debug = false; |
40 | 40 | ||
41 | /** | 41 | /** |
42 | * Constructor | 42 | * Constructor |
43 | */ | 43 | */ |
44 | function __construct() { | 44 | function __construct() { |
45 | } | 45 | } |
46 | 46 | ||
47 | protected function debug($msg, $file=null, $line=null) { | 47 | protected function debug($msg, $file=null, $line=null) { |
48 | if ($this->debug) { | 48 | if ($this->debug) { |
49 | $mem = round(memory_get_usage()/1024, 2); | 49 | $mem = round(memory_get_usage()/1024, 2); |
50 | $memPeak = round(memory_get_peak_usage()/1024, 2); | 50 | $memPeak = round(memory_get_peak_usage()/1024, 2); |
51 | echo '* ',$msg; | 51 | echo '* ',$msg; |
52 | if (isset($file, $line)) echo " ($file line $line)"; | 52 | if (isset($file, $line)) echo " ($file line $line)"; |
53 | echo ' - mem used: ',$mem," (peak: $memPeak)\n"; | 53 | echo ' - mem used: ',$mem," (peak: $memPeak)\n"; |
54 | ob_flush(); | 54 | ob_flush(); |
55 | flush(); | 55 | flush(); |
56 | } | 56 | } |
57 | } | 57 | } |
58 | 58 | ||
59 | /** | 59 | /** |
60 | * Get matching cookies | 60 | * Get matching cookies |
61 | * | 61 | * |
62 | * Only use this method if you cannot use add_cookie_header(), for example, if you want to use | 62 | * Only use this method if you cannot use add_cookie_header(), for example, if you want to use |
63 | * this cookie jar class without using the request class. | 63 | * this cookie jar class without using the request class. |
64 | * | 64 | * |
65 | * @param array $param associative array containing 'domain', 'path', 'secure' keys | 65 | * @param array $param associative array containing 'domain', 'path', 'secure' keys |
66 | * @return string | 66 | * @return string |
67 | * @see add_cookie_header() | 67 | * @see add_cookie_header() |
68 | */ | 68 | */ |
69 | public function getMatchingCookies($url) | 69 | public function getMatchingCookies($url) |
70 | { | 70 | { |
71 | if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { | 71 | if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { |
72 | $param['domain'] = $parts['host']; | 72 | $param['domain'] = $parts['host']; |
73 | $param['path'] = $parts['path']; | 73 | $param['path'] = $parts['path']; |
74 | $param['secure'] = (strtolower($parts['scheme']) == 'https'); | 74 | $param['secure'] = (strtolower($parts['scheme']) == 'https'); |
75 | unset($parts); | 75 | unset($parts); |
76 | } else { | 76 | } else { |
77 | return false; | 77 | return false; |
78 | } | 78 | } |
79 | // RFC 2965 notes: | 79 | // RFC 2965 notes: |
80 | // If multiple cookies satisfy the criteria above, they are ordered in | 80 | // If multiple cookies satisfy the criteria above, they are ordered in |
81 | // the Cookie header such that those with more specific Path attributes | 81 | // the Cookie header such that those with more specific Path attributes |
82 | // precede those with less specific. Ordering with respect to other | 82 | // precede those with less specific. Ordering with respect to other |
83 | // attributes (e.g., Domain) is unspecified. | 83 | // attributes (e.g., Domain) is unspecified. |
84 | $domain = $param['domain']; | 84 | $domain = $param['domain']; |
85 | if (strpos($domain, '.') === false) $domain .= '.local'; | 85 | if (strpos($domain, '.') === false) $domain .= '.local'; |
86 | $request_path = $param['path']; | 86 | $request_path = $param['path']; |
87 | if ($request_path == '') $request_path = '/'; | 87 | if ($request_path == '') $request_path = '/'; |
88 | $request_secure = $param['secure']; | 88 | $request_secure = $param['secure']; |
89 | $now = time(); | 89 | $now = time(); |
90 | $matched_cookies = array(); | 90 | $matched_cookies = array(); |
91 | // domain - find matching domains | 91 | // domain - find matching domains |
92 | $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); | 92 | $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); |
93 | while (strpos($domain, '.') !== false) { | 93 | while (strpos($domain, '.') !== false) { |
94 | if (isset($this->cookies[$domain])) { | 94 | if (isset($this->cookies[$domain])) { |
95 | $this->debug(' domain match found: '.$domain); | 95 | $this->debug(' domain match found: '.$domain); |
96 | $cookies =& $this->cookies[$domain]; | 96 | $cookies =& $this->cookies[$domain]; |
97 | } else { | 97 | } else { |
98 | $domain = $this->_reduce_domain($domain); | 98 | $domain = $this->_reduce_domain($domain); |
99 | continue; | 99 | continue; |
100 | } | 100 | } |
101 | // paths - find matching paths starting from most specific | 101 | // paths - find matching paths starting from most specific |
102 | $this->debug(' - Finding matching paths for '.$request_path); | 102 | $this->debug(' - Finding matching paths for '.$request_path); |
103 | $paths = array_keys($cookies); | 103 | $paths = array_keys($cookies); |
104 | usort($paths, array($this, '_cmp_length')); | 104 | usort($paths, array($this, '_cmp_length')); |
105 | foreach ($paths as $path) { | 105 | foreach ($paths as $path) { |
106 | // continue to next cookie if request path does not path-match cookie path | 106 | // continue to next cookie if request path does not path-match cookie path |
107 | if (!$this->_path_match($request_path, $path)) continue; | 107 | if (!$this->_path_match($request_path, $path)) continue; |
108 | // loop through cookie names | 108 | // loop through cookie names |
109 | $this->debug(' path match found: '.$path); | 109 | $this->debug(' path match found: '.$path); |
110 | foreach ($cookies[$path] as $name => $values) { | 110 | foreach ($cookies[$path] as $name => $values) { |
111 | // if this cookie is secure but request isn't, continue to next cookie | 111 | // if this cookie is secure but request isn't, continue to next cookie |
112 | if ($values[1] && !$request_secure) continue; | 112 | if ($values[1] && !$request_secure) continue; |
113 | // if cookie is not a session cookie and has expired, continue to next cookie | 113 | // if cookie is not a session cookie and has expired, continue to next cookie |
114 | if (is_int($values[2]) && ($values[2] < $now)) continue; | 114 | if (is_int($values[2]) && ($values[2] < $now)) continue; |
115 | // cookie matches request | 115 | // cookie matches request |
116 | $this->debug(' cookie match: '.$name.'='.$values[0]); | 116 | $this->debug(' cookie match: '.$name.'='.$values[0]); |
117 | $matched_cookies[] = $name.'='.$values[0]; | 117 | $matched_cookies[] = $name.'='.$values[0]; |
118 | } | 118 | } |
119 | } | 119 | } |
120 | $domain = $this->_reduce_domain($domain); | 120 | $domain = $this->_reduce_domain($domain); |
121 | } | 121 | } |
122 | // return cookies | 122 | // return cookies |
123 | return implode('; ', $matched_cookies); | 123 | return implode('; ', $matched_cookies); |
124 | } | 124 | } |
125 | 125 | ||
/**
 * Parse Set-Cookie values and store the accepted cookies in the jar.
 *
 * Only use this method if you cannot use extract_cookies(), for example, if you want to use
 * this cookie jar class without using the response class.
 *
 * Each header value is split on ';'. The first name=value pair is the cookie itself; of the
 * remaining attributes only expires, path, domain and secure are kept. A cookie is skipped
 * when its domain attribute is not domain-matched by the request host, or when its path
 * attribute is not a prefix of the request path.
 *
 * @param string $url URL of the request the headers belong to (must contain a host)
 * @param array $set_cookies array holding 1 or more "Set-Cookie" header values
 * @return void
 * @see extractCookies()
 */
public function storeCookies($url, $set_cookies)
{
    if (!is_array($set_cookies) || count($set_cookies) == 0) return;
    $url_parts = @parse_url($url);
    if (!is_array($url_parts) || !isset($url_parts['host'])) return;
    $request_host = $url_parts['host'];
    // single-label hosts are treated as members of the ".local" domain
    if (strpos($request_host, '.') === false) $request_host .= '.local';
    $request_path = isset($url_parts['path']) ? $url_parts['path'] : '';
    if ($request_path == '') $request_path = '/';
    //
    // loop through set-cookie headers
    //
    foreach ($set_cookies as $set_cookie) {
        $this->debug('Parsing: '.$set_cookie);
        // temporary cookie store (before adding to jar)
        $tmp_cookie = array();
        // loop through params
        foreach (explode(';', $set_cookie) as $x => $attribute) {
            $key_val = explode('=', $attribute, 2);
            if (count($key_val) != 2) {
                // if the first param isn't a name=value pair, continue to the next set-cookie
                // header
                if ($x == 0) continue 2;
                // check for secure flag
                if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
                // continue to next param
                continue;
            }
            list($key, $val) = array_map('trim', $key_val);
            // first name=value pair is the cookie name and value
            // the name and value are stored under 'name' and 'value' to avoid conflicts
            // with later parameters.
            if ($x == 0) {
                $tmp_cookie = array('name'=>$key, 'value'=>$val);
                continue;
            }
            $key = strtolower($key);
            if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
                $tmp_cookie[$key] = $val;
            }
        }
        //
        // set cookie
        //
        // check domain
        if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
                ($tmp_cookie['domain'] != ".$request_host")) {
            $domain = $tmp_cookie['domain'];
            if ((strpos($domain, '.') === false) && ($domain != 'local')) {
                $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
                continue;
            }
            if (preg_match('/\.[0-9]+$/', $domain)) {
                $this->debug(' - domain "'.$domain.'" appears to be an ip address');
                continue;
            }
            if (substr($domain, 0, 1) != '.') $domain = ".$domain";
            if (!$this->_domain_match($request_host, $domain)) {
                $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
                continue;
            }
        } else {
            // if domain is not specified in the set-cookie header, domain will default to
            // the request host
            $domain = $request_host;
        }
        // check path
        if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
            $path = urldecode($tmp_cookie['path']);
            if (!$this->_path_match($request_path, $path)) {
                $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
                continue;
            }
        } else {
            // default path is the request path up to (not including) its last '/'
            $path = $request_path;
            $path = substr($path, 0, strrpos($path, '/'));
            if ($path == '') $path = '/';
        }
        // check if secure
        $secure = isset($tmp_cookie['secure']);
        // check expiry
        $expires = null;
        if (isset($tmp_cookie['expires'])) {
            $timestamp = strtotime($tmp_cookie['expires']);
            // strtotime() returns false when the date cannot be parsed; such a cookie is
            // kept as a session cookie (null). A valid timestamp <= 0 is passed on as-is
            // so that set_cookie() treats the cookie as expired and removes it.
            if ($timestamp !== false) $expires = $timestamp;
        }
        // set cookie
        $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
    }
}
230 | 230 | ||
// Return an array of Set-Cookie values extracted from raw HTTP response headers (string $h).
// Scanning stops at the first empty line found after the first line. Folded continuation
// lines (lines starting with whitespace) are appended to the preceding Set-Cookie value.
public function extractCookies($h) {
    $cookies = array();
    // true while the previous header line was a Set-Cookie header
    $in_set_cookie = false;
    foreach (explode("\n", $h) as $index => $raw_line) {
        $line = rtrim($raw_line);
        // the very first line is never treated as a terminator or a continuation
        if ($index > 0) {
            // an empty line marks the end of the headers
            // (some servers will not use CRLF, so the CR has already been trimmed away)
            if (trim($line) == '') {
                break;
            }
            // RFC 2616 Section 2.2: header field values can be folded onto multiple lines
            // if the continuation line begins with a space or horizontal tab; the fold is
            // replaced with a single space here.
            if ($in_set_cookie && preg_match('/^\s+(.*)/', $line, $folded)) {
                $cookies[count($cookies) - 1] .= ' '.rtrim($folded[1]);
                continue;
            }
        }
        // split header name and value
        if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $header)) {
            $cookies[] = rtrim($header[1]);
            $in_set_cookie = true;
        } else {
            $in_set_cookie = false;
        }
    }
    return $cookies;
}
279 | 279 | ||
/**
 * Set Cookie
 *
 * Stores array($value, $secure, $expires) under cookies[domain][path][name].
 * Nothing is stored when domain, path, name or value is empty.
 *
 * @param string $domain
 * @param string $path
 * @param string $name cookie name
 * @param string $value cookie value
 * @param bool $secure
 * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
 * @return void
 */
function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
{
    if ($domain == '' || $path == '' || $name == '') return;
    // an expiry time that is set but not positive means "remove this cookie"
    $must_delete = isset($expires) && ($expires <= 0);
    if ($must_delete) {
        if (isset($this->cookies[$domain][$path][$name])) {
            unset($this->cookies[$domain][$path][$name]);
        }
        return;
    }
    if ($value == '') return;
    $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
}
304 | 304 | ||
/**
 * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.
 *
 * The first argument that is not set decides the scope: everything, one domain,
 * one path within a domain, or a single named cookie.
 *
 * @param string $domain
 * @param string $path
 * @param string $name
 * @return void
 */
function clear($domain=null, $path=null, $name=null)
{
    // no domain: wipe the whole jar
    if (!isset($domain)) {
        $this->cookies = array();
        return;
    }
    // domain only: drop every cookie of that domain
    if (!isset($path)) {
        if (isset($this->cookies[$domain])) unset($this->cookies[$domain]);
        return;
    }
    // domain + path: drop every cookie stored under that path
    if (!isset($name)) {
        if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]);
        return;
    }
    // domain + path + name: drop a single cookie
    if (isset($this->cookies[$domain][$path][$name])) {
        unset($this->cookies[$domain][$path][$name]);
    }
}
324 | 324 | ||
/**
 * Compare string length - used for sorting (longest string first)
 * @access private
 * @param string $a
 * @param string $b
 * @return int -1 if $a is longer than $b, 1 if it is shorter, 0 if both have the same length
 */
function _cmp_length($a, $b)
{
    $len_a = strlen($a);
    $len_b = strlen($b);
    if ($len_a > $len_b) return -1;
    if ($len_a < $len_b) return 1;
    return 0;
}
336 | 336 | ||
/**
 * Reduce domain
 *
 * A leading dot is removed ('.foo.com' becomes 'foo.com'); otherwise everything before
 * the first dot is removed ('www.foo.com' becomes '.foo.com'). A domain without any dot
 * is returned unchanged.
 *
 * @param string $domain
 * @return string
 * @access private
 */
function _reduce_domain($domain)
{
    if ($domain == '') return '';
    $first_dot = strpos($domain, '.');
    // leading dot: strip it
    if ($first_dot === 0) return substr($domain, 1);
    // no dot at all: nothing to cut off
    if ($first_dot === false) return substr($domain, 0);
    // drop the left-most label, keeping the dot
    return substr($domain, $first_dot);
}
349 | 349 | ||
/**
 * Path match - check if path1 path-matches path2
 *
 * From RFC 2965:
 * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
 * if P2 is a prefix of P1 (including the case where P1 and P2 string-
 * compare equal). Thus, the string /tec/waldo path-matches /tec.</i>
 *
 * The prefix test is a byte-wise string comparison, so numeric-looking values
 * (e.g. '1e1' and '010') are never considered equal.
 *
 * @param string $path1
 * @param string $path2
 * @return bool
 * @access private
 */
function _path_match($path1, $path2)
{
    // strncmp() compares bytes; a loose == on the extracted prefix would compare
    // numeric strings by value instead
    return strncmp($path1, $path2, strlen($path2)) === 0;
}
366 | 366 | ||
/**
 * Domain match - check if domain1 domain-matches domain2
 *
 * A few extracts from RFC 2965:
 * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
 * would be rejected, because H is y.x and contains a dot.
 *
 * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
 * would be accepted.
 *
 * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be
 * rejected, because there is no embedded dot.
 *
 * - A Set-Cookie2 from request-host example for Domain=.local will
 * be accepted, because the effective host name for the request-
 * host is example.local, and example.local domain-matches .local.
 *
 * I'm ignoring the first point for now (must check to see how other browsers handle
 * this rule for Set-Cookie headers)
 *
 * Both domains are lower-cased, then domain1 is reduced step by step with
 * _reduce_domain() and compared with domain2 for as long as it still contains a dot.
 *
 * @param string $domain1
 * @param string $domain2
 * @return bool
 * @access private
 */
function _domain_match($domain1, $domain2)
{
    $domain1 = strtolower($domain1);
    $domain2 = strtolower($domain2);
    while (strpos($domain1, '.') !== false) {
        // strict comparison: with == numeric-looking strings such as '.0' and '.00'
        // would be compared by value and wrongly match
        if ($domain1 === $domain2) return true;
        $domain1 = $this->_reduce_domain($domain1);
    }
    return false;
}
403 | } | 403 | } \ No newline at end of file |
404 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php index e4f1b3b3..963f0c05 100644 --- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php | |||
@@ -1,779 +1,810 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Humble HTTP Agent | 3 | * Humble HTTP Agent |
4 | * | 4 | * |
5 | * This class is designed to take advantage of parallel HTTP requests | 5 | * This class is designed to take advantage of parallel HTTP requests |
6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. | 6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. |
7 | * For environments which do not have these options, it reverts to standard sequential | 7 | * For environments which do not have these options, it reverts to standard sequential |
8 | * requests (using file_get_contents()) | 8 | * requests (using file_get_contents()) |
9 | * | 9 | * |
10 | * @version 1.1 | 10 | * @version 1.4 |
11 | * @date 2012-08-20 | 11 | * @date 2013-05-10 |
12 | * @see http://php.net/HttpRequestPool | 12 | * @see http://php.net/HttpRequestPool |
13 | * @author Keyvan Minoukadeh | 13 | * @author Keyvan Minoukadeh |
14 | * @copyright 2011-2012 Keyvan Minoukadeh | 14 | * @copyright 2011-2013 Keyvan Minoukadeh |
15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
16 | */ | 16 | */ |
17 | 17 | ||
18 | class HumbleHttpAgent | 18 | class HumbleHttpAgent |
19 | { | 19 | { |
20 | const METHOD_REQUEST_POOL = 1; | 20 | const METHOD_REQUEST_POOL = 1; |
21 | const METHOD_CURL_MULTI = 2; | 21 | const METHOD_CURL_MULTI = 2; |
22 | const METHOD_FILE_GET_CONTENTS = 4; | 22 | const METHOD_FILE_GET_CONTENTS = 4; |
23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; | 23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; |
24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; | 24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; |
25 | const UA_PHP = 'PHP/5.2'; | 25 | const UA_PHP = 'PHP/5.4'; |
26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; | 26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; |
27 | 27 | ||
28 | protected $requests = array(); | 28 | protected $requests = array(); |
29 | protected $redirectQueue = array(); | 29 | protected $redirectQueue = array(); |
30 | protected $requestOptions; | 30 | protected $requestOptions; |
31 | protected $maxParallelRequests = 5; | 31 | protected $maxParallelRequests = 5; |
32 | protected $cache = null; //TODO | 32 | protected $cache = null; //TODO |
33 | protected $httpContext; | 33 | protected $httpContext; |
34 | protected $minimiseMemoryUse = false; //TODO | 34 | protected $minimiseMemoryUse = false; //TODO |
35 | protected $method; | 35 | protected $method; |
36 | protected $cookieJar; | 36 | protected $cookieJar; |
37 | public $debug = false; | 37 | public $debug = false; |
38 | public $debugVerbose = false; | 38 | public $debugVerbose = false; |
39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html | 39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html |
40 | public $maxRedirects = 5; | 40 | public $maxRedirects = 5; |
41 | public $userAgentMap = array(); | 41 | public $userAgentMap = array(); |
42 | public $rewriteUrls = array(); | 42 | public $rewriteUrls = array(); |
43 | public $userAgentDefault; | 43 | public $userAgentDefault; |
44 | public $referer; | 44 | public $referer; |
45 | //public $userAgent = 'Mozilla/5.0'; | 45 | //public $userAgent = 'Mozilla/5.0'; |
46 | 46 | ||
47 | // Prevent certain file/mime types | 47 | // Prevent certain file/mime types |
48 | // HTTP responses which match these content types will | 48 | // HTTP responses which match these content types will |
49 | // be returned without body. | 49 | // be returned without body. |
50 | public $headerOnlyTypes = array(); | 50 | public $headerOnlyTypes = array(); |
51 | // URLs ending with one of these extensions will | 51 | // URLs ending with one of these extensions will |
52 | // prompt Humble HTTP Agent to send a HEAD request first | 52 | // prompt Humble HTTP Agent to send a HEAD request first |
53 | // to see if returned content type matches $headerOnlyTypes. | 53 | // to see if returned content type matches $headerOnlyTypes. |
54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); | 54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); |
55 | // AJAX triggers to search for. | 55 | // AJAX triggers to search for. |
56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); | 57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); |
58 | 58 | ||
59 | //TODO: set max file size | 59 | //TODO: set max file size |
60 | //TODO: normalise headers | 60 | //TODO: normalise headers |
61 | 61 | ||
/**
 * Create the agent: choose the request method, create the cookie jar and
 * prepare the default request options and stream context settings.
 *
 * @param array|null $requestOptions options merged over the defaults below
 * @param int|null $method one of the METHOD_* constants; anything else triggers auto-detection
 */
function __construct($requestOptions=null, $method=null) {
    $this->userAgentDefault = self::UA_BROWSER;
    $this->referer = self::REF_GOOGLE;
    // set the request method: use the requested one if it is known, otherwise take
    // the first one available in this environment
    $known_methods = array(
        self::METHOD_REQUEST_POOL,
        self::METHOD_CURL_MULTI,
        self::METHOD_FILE_GET_CONTENTS
    );
    if (in_array($method, $known_methods)) {
        $this->method = $method;
    } elseif (class_exists('HttpRequestPool')) {
        $this->method = self::METHOD_REQUEST_POOL;
    } elseif (function_exists('curl_multi_init')) {
        $this->method = self::METHOD_CURL_MULTI;
    } else {
        $this->method = self::METHOD_FILE_GET_CONTENTS;
    }
    if ($this->method == self::METHOD_CURL_MULTI) {
        require_once(dirname(__FILE__).'/RollingCurl.php');
    }
    // create cookie jar
    $this->cookieJar = new CookieJar();
    // set request options (redirect must be 0)
    $defaults = array(
        'timeout' => 15,
        'connecttimeout' => 15,
        'dns_cache_timeout' => 300,
        'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
        // TODO: test onprogress?
    );
    $this->requestOptions = is_array($requestOptions)
        ? array_merge($defaults, $requestOptions)
        : $defaults;
    // context options for the file_get_contents() method
    $http = array();
    $http['ignore_errors'] = true;
    $http['timeout'] = $this->requestOptions['timeout'];
    $http['max_redirects'] = $this->requestOptions['redirect'];
    $http['header'] = "Accept: */*\r\n";
    $this->httpContext = array('http' => $http);
}
102 | if ($this->debug) { | 102 | |
/**
 * Print a debug message (only when $this->debug is enabled) and flush the output.
 * With $this->debugVerbose the current and peak memory usage in KB are appended.
 *
 * @param string $msg
 * @return void
 */
protected function debug($msg) {
    if (!$this->debug) return;
    echo '* ',$msg;
    if ($this->debugVerbose) {
        // memory figures are only computed when they are actually printed
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo ' - mem used: ',$mem," (peak: $memPeak)";
    }
    echo "\n";
    // ob_flush() raises a notice when no output buffer is active
    if (ob_get_level() > 0) ob_flush();
    flush();
}
114 | $host = @parse_url($url, PHP_URL_HOST); | 114 | |
/**
 * Pick the User-Agent for a URL.
 *
 * The host (without a leading 'www.') is looked up in $this->userAgentMap first, then
 * the host with its first label removed (e.g. '.example.org'). If neither is mapped,
 * $this->userAgentDefault is used.
 *
 * @param string $url
 * @param bool $asArray return array('User-Agent' => ...) instead of a header line
 * @return array|string
 */
protected function getUserAgent($url, $asArray=false) {
    $host = @parse_url($url, PHP_URL_HOST);
    if (strtolower(substr($host, 0, 4)) == 'www.') {
        $host = substr($host, 4);
    }
    $ua = $this->userAgentDefault;
    if ($host) {
        $candidates = array($host);
        // also try the parent domain, written with a leading dot
        $first_dot = strpos($host, '.');
        if ($first_dot !== false) {
            $candidates[] = substr($host, $first_dot);
        }
        foreach ($candidates as $candidate) {
            if (isset($this->userAgentMap[$candidate])) {
                $ua = $this->userAgentMap[$candidate];
                break;
            }
        }
    }
    return $asArray ? array('User-Agent' => $ua) : 'User-Agent: '.$ua;
}
141 | // return $url if there's no '#!' | 141 | |
/**
 * Rewrite a '#!' (hashbang) URL: the fragment, minus its first character, is moved
 * into an _escaped_fragment_ query parameter
 * (see http://code.google.com/web/ajaxcrawling/docs/specification.html).
 * URLs without '#!' are returned unchanged.
 *
 * @param string $url
 * @return string
 */
public function rewriteHashbangFragment($url) {
    // nothing to rewrite if there's no '#!'
    if (strpos($url, '#!') === false) return $url;
    // split $url and rewrite
    // TODO: is SimplePie_IRI included?
    $iri = new SimplePie_IRI($url);
    $escaped = (string)substr($iri->fragment, 1); // strip '!'
    $iri->fragment = null;
    $params = array();
    if (isset($iri->query)) {
        parse_str($iri->query, $params);
    }
    $params['_escaped_fragment_'] = $escaped;
    $encoded = http_build_query($params);
    $iri->query = str_replace('%2F', '/', $encoded); // needed for some sites
    return $iri->get_iri();
}
159 | if ($html == '') return false; | 159 | |
160 | $found = false; | 160 | public function getRedirectURLfromHTML($url, $html) { |
161 | foreach ($this->ajaxTriggers as $string) { | 161 | $redirect_url = $this->getMetaRefreshURL($url, $html); |
162 | if (stripos($html, $string)) { | 162 | if (!$redirect_url) { |
163 | $found = true; | 163 | $redirect_url = $this->getUglyURL($url, $html); |
164 | break; | 164 | } |
165 | } | 165 | return $redirect_url; |
166 | } | 166 | } |
167 | if (!$found) return false; | 167 | |
168 | $iri = new SimplePie_IRI($url); | 168 | public function getMetaRefreshURL($url, $html) { |
169 | if (isset($iri->query)) { | 169 | if ($html == '') return false; |
170 | parse_str($iri->query, $query); | 170 | // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513"> |
171 | } else { | 171 | if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) { |
172 | $query = array(); | 172 | return false; |
173 | } | 173 | } |
174 | $query['_escaped_fragment_'] = ''; | 174 | $redirect_url = $match[1]; |
175 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites | 175 | if (preg_match('!^https?://!i', $redirect_url)) { |
176 | return $iri->get_iri(); | 176 | // already absolute |
177 | } | 177 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); |
178 | 178 | return $redirect_url; | |
179 | public function removeFragment($url) { | 179 | } |
180 | $pos = strpos($url, '#'); | 180 | // absolutize redirect URL |
181 | if ($pos === false) { | 181 | $base = new SimplePie_IRI($url); |
182 | return $url; | 182 | // remove '//' in URL path (causes URLs not to resolve properly) |
183 | } else { | 183 | if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); |
184 | return substr($url, 0, $pos); | 184 | if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { |
185 | } | 185 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); |
186 | } | 186 | return $absolute; |
187 | 187 | } | |
188 | public function rewriteUrls($url) { | 188 | return false; |
189 | foreach ($this->rewriteUrls as $find => $action) { | 189 | } |
190 | if (strpos($url, $find) !== false) { | 190 | |
191 | if (is_array($action)) { | 191 | public function getUglyURL($url, $html) { |
192 | return strtr($url, $action); | 192 | if ($html == '') return false; |
193 | } | 193 | $found = false; |
194 | } | 194 | foreach ($this->ajaxTriggers as $string) { |
195 | } | 195 | if (stripos($html, $string)) { |
196 | return $url; | 196 | $found = true; |
197 | } | 197 | break; |
198 | 198 | } | |
199 | public function enableDebug($bool=true) { | 199 | } |
200 | $this->debug = (bool)$bool; | 200 | if (!$found) return false; |
201 | } | 201 | $iri = new SimplePie_IRI($url); |
202 | 202 | if (isset($iri->query)) { | |
203 | public function minimiseMemoryUse($bool = true) { | 203 | parse_str($iri->query, $query); |
204 | $this->minimiseMemoryUse = $bool; | 204 | } else { |
205 | } | 205 | $query = array(); |
206 | 206 | } | |
207 | public function setMaxParallelRequests($max) { | 207 | $query['_escaped_fragment_'] = ''; |
208 | $this->maxParallelRequests = $max; | 208 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites |
209 | } | 209 | $ugly_url = $iri->get_iri(); |
210 | 210 | $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url); | |
211 | public function validateUrl($url) { | 211 | return $ugly_url; |
212 | $url = filter_var($url, FILTER_SANITIZE_URL); | 212 | } |
213 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 213 | |
214 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) | 214 | public function removeFragment($url) { |
215 | if ($test === false) { | 215 | $pos = strpos($url, '#'); |
216 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 216 | if ($pos === false) { |
217 | } | 217 | return $url; |
218 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { | 218 | } else { |
219 | return $url; | 219 | return substr($url, 0, $pos); |
220 | } else { | 220 | } |
221 | return false; | 221 | } |
222 | } | 222 | |
223 | } | 223 | public function rewriteUrls($url) { |
224 | 224 | foreach ($this->rewriteUrls as $find => $action) { | |
225 | public function fetchAll(array $urls) { | 225 | if (strpos($url, $find) !== false) { |
226 | $this->fetchAllOnce($urls, $isRedirect=false); | 226 | if (is_array($action)) { |
227 | $redirects = 0; | 227 | return strtr($url, $action); |
228 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { | 228 | } |
229 | $this->debug("Following redirects #$redirects..."); | 229 | } |
230 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); | 230 | } |
231 | } | 231 | return $url; |
232 | } | 232 | } |
233 | 233 | ||
234 | // fetch all URLs without following redirects | 234 | public function enableDebug($bool=true) { |
235 | public function fetchAllOnce(array $urls, $isRedirect=false) { | 235 | $this->debug = (bool)$bool; |
236 | if (!$isRedirect) $urls = array_unique($urls); | 236 | } |
237 | if (empty($urls)) return; | 237 | |
238 | 238 | public function minimiseMemoryUse($bool = true) { | |
239 | ////////////////////////////////////////////////////// | 239 | $this->minimiseMemoryUse = $bool; |
240 | // parallel (HttpRequestPool) | 240 | } |
241 | if ($this->method == self::METHOD_REQUEST_POOL) { | 241 | |
242 | $this->debug('Starting parallel fetch (HttpRequestPool)'); | 242 | public function setMaxParallelRequests($max) { |
243 | try { | 243 | $this->maxParallelRequests = $max; |
244 | while (count($urls) > 0) { | 244 | } |
245 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 245 | |
246 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 246 | public function validateUrl($url) { |
247 | $pool = new HttpRequestPool(); | 247 | $url = filter_var($url, FILTER_SANITIZE_URL); |
248 | foreach ($subset as $orig => $url) { | 248 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
249 | if (!$isRedirect) $orig = $url; | 249 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) |
250 | unset($this->redirectQueue[$orig]); | 250 | if ($test === false) { |
251 | $this->debug("...$url"); | 251 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
252 | if (!$isRedirect && isset($this->requests[$url])) { | 252 | } |
253 | $this->debug("......in memory"); | 253 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { |
254 | /* | 254 | return $url; |
255 | } elseif ($this->isCached($url)) { | 255 | } else { |
256 | $this->debug("......is cached"); | 256 | return false; |
257 | if (!$this->minimiseMemoryUse) { | 257 | } |
258 | $this->requests[$url] = $this->getCached($url); | 258 | } |
259 | } | 259 | |
260 | */ | 260 | public function fetchAll(array $urls) { |
261 | } else { | 261 | $this->fetchAllOnce($urls, $isRedirect=false); |
262 | $this->debug("......adding to pool"); | 262 | $redirects = 0; |
263 | $req_url = $this->rewriteUrls($url); | 263 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { |
264 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 264 | $this->debug("Following redirects #$redirects..."); |
265 | $req_url = $this->removeFragment($req_url); | 265 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); |
266 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 266 | } |
267 | $_meth = HttpRequest::METH_HEAD; | 267 | } |
268 | } else { | 268 | |
269 | $_meth = HttpRequest::METH_GET; | 269 | // fetch all URLs without following redirects |
270 | unset($this->requests[$orig]['wrongGuess']); | 270 | public function fetchAllOnce(array $urls, $isRedirect=false) { |
271 | } | 271 | if (!$isRedirect) $urls = array_unique($urls); |
272 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); | 272 | if (empty($urls)) return; |
273 | // send cookies, if we have any | 273 | |
274 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 274 | ////////////////////////////////////////////////////// |
275 | $this->debug("......sending cookies: $cookies"); | 275 | // parallel (HttpRequestPool) |
276 | $httpRequest->addHeaders(array('Cookie' => $cookies)); | 276 | if ($this->method == self::METHOD_REQUEST_POOL) { |
277 | } | 277 | $this->debug('Starting parallel fetch (HttpRequestPool)'); |
278 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); | 278 | try { |
279 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); | 279 | while (count($urls) > 0) { |
280 | // add referer for picky sites | 280 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
281 | $httpRequest->addheaders(array('Referer' => $this->referer)); | 281 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
282 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 282 | $pool = new HttpRequestPool(); |
283 | $this->requests[$orig]['original_url'] = $orig; | 283 | foreach ($subset as $orig => $url) { |
284 | $pool->attach($httpRequest); | 284 | if (!$isRedirect) $orig = $url; |
285 | } | 285 | unset($this->redirectQueue[$orig]); |
286 | } | 286 | $this->debug("...$url"); |
287 | // did we get anything into the pool? | 287 | if (!$isRedirect && isset($this->requests[$url])) { |
288 | if (count($pool) > 0) { | 288 | $this->debug("......in memory"); |
289 | $this->debug('Sending request...'); | 289 | /* |
290 | try { | 290 | } elseif ($this->isCached($url)) { |
291 | $pool->send(); | 291 | $this->debug("......is cached"); |
292 | } catch (HttpRequestPoolException $e) { | 292 | if (!$this->minimiseMemoryUse) { |
293 | // do nothing | 293 | $this->requests[$url] = $this->getCached($url); |
294 | } | 294 | } |
295 | $this->debug('Received responses'); | 295 | */ |
296 | foreach($subset as $orig => $url) { | 296 | } else { |
297 | if (!$isRedirect) $orig = $url; | 297 | $this->debug("......adding to pool"); |
298 | $request = $this->requests[$orig]['httpRequest']; | 298 | $req_url = $this->rewriteUrls($url); |
299 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); | 299 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
300 | // getResponseHeader() doesn't return status line, so, for consistency... | 300 | $req_url = $this->removeFragment($req_url); |
301 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); | 301 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
302 | // check content type | 302 | $_meth = HttpRequest::METH_HEAD; |
303 | // TODO: use getResponseHeader('content-type') or getResponseInfo() | 303 | } else { |
304 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 304 | $_meth = HttpRequest::METH_GET; |
305 | $this->requests[$orig]['body'] = ''; | 305 | unset($this->requests[$orig]['wrongGuess']); |
306 | $_header_only_type = true; | 306 | } |
307 | $this->debug('Header only type returned'); | 307 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); |
308 | } else { | 308 | // send cookies, if we have any |
309 | $this->requests[$orig]['body'] = $request->getResponseBody(); | 309 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
310 | $_header_only_type = false; | 310 | $this->debug("......sending cookies: $cookies"); |
311 | } | 311 | $httpRequest->addHeaders(array('Cookie' => $cookies)); |
312 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); | 312 | } |
313 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); | 313 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); |
314 | // is redirect? | 314 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); |
315 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { | 315 | // add referer for picky sites |
316 | $redirectURL = $request->getResponseHeader('location'); | 316 | $httpRequest->addheaders(array('Referer' => $this->referer)); |
317 | if (!preg_match('!^https?://!i', $redirectURL)) { | 317 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
318 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 318 | $this->requests[$orig]['original_url'] = $orig; |
319 | } | 319 | $pool->attach($httpRequest); |
320 | if ($this->validateURL($redirectURL)) { | 320 | } |
321 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 321 | } |
322 | // store any cookies | 322 | // did we get anything into the pool? |
323 | $cookies = $request->getResponseHeader('set-cookie'); | 323 | if (count($pool) > 0) { |
324 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); | 324 | $this->debug('Sending request...'); |
325 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); | 325 | try { |
326 | $this->redirectQueue[$orig] = $redirectURL; | 326 | $pool->send(); |
327 | } else { | 327 | } catch (HttpRequestPoolException $e) { |
328 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 328 | // do nothing |
329 | } | 329 | } |
330 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { | 330 | $this->debug('Received responses'); |
331 | // the response content-type did not match our 'header only' types, | 331 | foreach($subset as $orig => $url) { |
332 | // but we'd issues a HEAD request because we assumed it would. So | 332 | if (!$isRedirect) $orig = $url; |
333 | // let's queue a proper GET request for this item... | 333 | $request = $this->requests[$orig]['httpRequest']; |
334 | $this->debug('Wrong guess at content-type, queing GET request'); | 334 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); |
335 | $this->requests[$orig]['wrongGuess'] = true; | 335 | // getResponseHeader() doesn't return status line, so, for consistency... |
336 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 336 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); |
337 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 337 | // check content type |
338 | // check for <meta name='fragment' content='!'/> | 338 | // TODO: use getResponseHeader('content-type') or getResponseInfo() |
339 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 339 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
340 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 340 | $this->requests[$orig]['body'] = ''; |
341 | if (isset($this->requests[$orig]['body'])) { | 341 | $_header_only_type = true; |
342 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 342 | $this->debug('Header only type returned'); |
343 | if ($redirectURL) { | 343 | } else { |
344 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 344 | $this->requests[$orig]['body'] = $request->getResponseBody(); |
345 | $this->redirectQueue[$orig] = $redirectURL; | 345 | $_header_only_type = false; |
346 | } | 346 | } |
347 | } | 347 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); |
348 | } | 348 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); |
349 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 349 | // is redirect? |
350 | $pool->detach($request); | 350 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { |
351 | unset($this->requests[$orig]['httpRequest'], $request); | 351 | $redirectURL = $request->getResponseHeader('location'); |
352 | /* | 352 | if (!preg_match('!^https?://!i', $redirectURL)) { |
353 | if ($this->minimiseMemoryUse) { | 353 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
354 | if ($this->cache($url)) { | 354 | } |
355 | unset($this->requests[$url]); | 355 | if ($this->validateURL($redirectURL)) { |
356 | } | 356 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
357 | } | 357 | // store any cookies |
358 | */ | 358 | $cookies = $request->getResponseHeader('set-cookie'); |
359 | } | 359 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); |
360 | } | 360 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); |
361 | } | 361 | $this->redirectQueue[$orig] = $redirectURL; |
362 | } catch (HttpException $e) { | 362 | } else { |
363 | $this->debug($e); | 363 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
364 | return false; | 364 | } |
365 | } | 365 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { |
366 | } | 366 | // the response content-type did not match our 'header only' types, |
367 | 367 | // but we'd issues a HEAD request because we assumed it would. So | |
368 | ////////////////////////////////////////////////////////// | 368 | // let's queue a proper GET request for this item... |
369 | // parallel (curl_multi_*) | 369 | $this->debug('Wrong guess at content-type, queing GET request'); |
370 | elseif ($this->method == self::METHOD_CURL_MULTI) { | 370 | $this->requests[$orig]['wrongGuess'] = true; |
371 | $this->debug('Starting parallel fetch (curl_multi_*)'); | 371 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
372 | while (count($urls) > 0) { | 372 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
373 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 373 | // check for <meta name='fragment' content='!'/> |
374 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 374 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
375 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); | 375 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
376 | $pool->window_size = count($subset); | 376 | if (isset($this->requests[$orig]['body'])) { |
377 | 377 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | |
378 | foreach ($subset as $orig => $url) { | 378 | if ($redirectURL) { |
379 | if (!$isRedirect) $orig = $url; | 379 | $this->redirectQueue[$orig] = $redirectURL; |
380 | unset($this->redirectQueue[$orig]); | 380 | } |
381 | $this->debug("...$url"); | 381 | } |
382 | if (!$isRedirect && isset($this->requests[$url])) { | 382 | } |
383 | $this->debug("......in memory"); | 383 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
384 | /* | 384 | $pool->detach($request); |
385 | } elseif ($this->isCached($url)) { | 385 | unset($this->requests[$orig]['httpRequest'], $request); |
386 | $this->debug("......is cached"); | 386 | /* |
387 | if (!$this->minimiseMemoryUse) { | 387 | if ($this->minimiseMemoryUse) { |
388 | $this->requests[$url] = $this->getCached($url); | 388 | if ($this->cache($url)) { |
389 | } | 389 | unset($this->requests[$url]); |
390 | */ | 390 | } |
391 | } else { | 391 | } |
392 | $this->debug("......adding to pool"); | 392 | */ |
393 | $req_url = $this->rewriteUrls($url); | 393 | } |
394 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 394 | } |
395 | $req_url = $this->removeFragment($req_url); | 395 | } |
396 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 396 | } catch (HttpException $e) { |
397 | $_meth = 'HEAD'; | 397 | $this->debug($e); |
398 | } else { | 398 | return false; |
399 | $_meth = 'GET'; | 399 | } |
400 | unset($this->requests[$orig]['wrongGuess']); | 400 | } |
401 | } | 401 | |
402 | $headers = array(); | 402 | ////////////////////////////////////////////////////////// |
403 | //$headers[] = 'User-Agent: '.$this->userAgent; | 403 | // parallel (curl_multi_*) |
404 | $headers[] = $this->getUserAgent($req_url); | 404 | elseif ($this->method == self::METHOD_CURL_MULTI) { |
405 | // add referer for picky sites | 405 | $this->debug('Starting parallel fetch (curl_multi_*)'); |
406 | $headers[] = 'Referer: '.$this->referer; | 406 | while (count($urls) > 0) { |
407 | // send cookies, if we have any | 407 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
408 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 408 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
409 | $this->debug("......sending cookies: $cookies"); | 409 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); |
410 | $headers[] = 'Cookie: '.$cookies; | 410 | $pool->window_size = count($subset); |
411 | } | 411 | |
412 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( | 412 | foreach ($subset as $orig => $url) { |
413 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], | 413 | if (!$isRedirect) $orig = $url; |
414 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] | 414 | unset($this->redirectQueue[$orig]); |
415 | )); | 415 | $this->debug("...$url"); |
416 | $httpRequest->set_original_url($orig); | 416 | if (!$isRedirect && isset($this->requests[$url])) { |
417 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 417 | $this->debug("......in memory"); |
418 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? | 418 | /* |
419 | $pool->add($httpRequest); | 419 | } elseif ($this->isCached($url)) { |
420 | } | 420 | $this->debug("......is cached"); |
421 | } | 421 | if (!$this->minimiseMemoryUse) { |
422 | // did we get anything into the pool? | 422 | $this->requests[$url] = $this->getCached($url); |
423 | if (count($pool) > 0) { | 423 | } |
424 | $this->debug('Sending request...'); | 424 | */ |
425 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] | 425 | } else { |
426 | $this->debug('Received responses'); | 426 | $this->debug("......adding to pool"); |
427 | foreach($subset as $orig => $url) { | 427 | $req_url = $this->rewriteUrls($url); |
428 | if (!$isRedirect) $orig = $url; | 428 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
429 | // $this->requests[$orig]['headers'] | 429 | $req_url = $this->removeFragment($req_url); |
430 | // $this->requests[$orig]['body'] | 430 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
431 | // $this->requests[$orig]['effective_url'] | 431 | $_meth = 'HEAD'; |
432 | // check content type | 432 | } else { |
433 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 433 | $_meth = 'GET'; |
434 | $this->requests[$orig]['body'] = ''; | 434 | unset($this->requests[$orig]['wrongGuess']); |
435 | $_header_only_type = true; | 435 | } |
436 | $this->debug('Header only type returned'); | 436 | $headers = array(); |
437 | } else { | 437 | //$headers[] = 'User-Agent: '.$this->userAgent; |
438 | $_header_only_type = false; | 438 | $headers[] = $this->getUserAgent($req_url); |
439 | } | 439 | // add referer for picky sites |
440 | $status_code = $this->requests[$orig]['status_code']; | 440 | $headers[] = 'Referer: '.$this->referer; |
441 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 441 | // send cookies, if we have any |
442 | $redirectURL = $this->requests[$orig]['location']; | 442 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
443 | if (!preg_match('!^https?://!i', $redirectURL)) { | 443 | $this->debug("......sending cookies: $cookies"); |
444 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 444 | $headers[] = 'Cookie: '.$cookies; |
445 | } | 445 | } |
446 | if ($this->validateURL($redirectURL)) { | 446 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( |
447 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 447 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], |
448 | // store any cookies | 448 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] |
449 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 449 | )); |
450 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 450 | $httpRequest->set_original_url($orig); |
451 | $this->redirectQueue[$orig] = $redirectURL; | 451 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
452 | } else { | 452 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? |
453 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 453 | $pool->add($httpRequest); |
454 | } | 454 | } |
455 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { | 455 | } |
456 | // the response content-type did not match our 'header only' types, | 456 | // did we get anything into the pool? |
457 | // but we'd issues a HEAD request because we assumed it would. So | 457 | if (count($pool) > 0) { |
458 | // let's queue a proper GET request for this item... | 458 | $this->debug('Sending request...'); |
459 | $this->debug('Wrong guess at content-type, queing GET request'); | 459 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] |
460 | $this->requests[$orig]['wrongGuess'] = true; | 460 | $this->debug('Received responses'); |
461 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 461 | foreach($subset as $orig => $url) { |
462 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 462 | if (!$isRedirect) $orig = $url; |
463 | // check for <meta name='fragment' content='!'/> | 463 | // $this->requests[$orig]['headers'] |
464 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 464 | // $this->requests[$orig]['body'] |
465 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 465 | // $this->requests[$orig]['effective_url'] |
466 | if (isset($this->requests[$orig]['body'])) { | 466 | // check content type |
467 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 467 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
468 | if ($redirectURL) { | 468 | $this->requests[$orig]['body'] = ''; |
469 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 469 | $_header_only_type = true; |
470 | $this->redirectQueue[$orig] = $redirectURL; | 470 | $this->debug('Header only type returned'); |
471 | } | 471 | } else { |
472 | } | 472 | $_header_only_type = false; |
473 | } | 473 | } |
474 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 474 | $status_code = $this->requests[$orig]['status_code']; |
475 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); | 475 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
476 | } | 476 | $redirectURL = $this->requests[$orig]['location']; |
477 | } | 477 | if (!preg_match('!^https?://!i', $redirectURL)) { |
478 | } | 478 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
479 | } | 479 | } |
480 | 480 | if ($this->validateURL($redirectURL)) { | |
481 | ////////////////////////////////////////////////////// | 481 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
482 | // sequential (file_get_contents) | 482 | // store any cookies |
483 | else { | 483 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
484 | $this->debug('Starting sequential fetch (file_get_contents)'); | 484 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
485 | $this->debug('Processing set of '.count($urls)); | 485 | $this->redirectQueue[$orig] = $redirectURL; |
486 | foreach ($urls as $orig => $url) { | 486 | } else { |
487 | if (!$isRedirect) $orig = $url; | 487 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
488 | unset($this->redirectQueue[$orig]); | 488 | } |
489 | $this->debug("...$url"); | 489 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { |
490 | if (!$isRedirect && isset($this->requests[$url])) { | 490 | // the response content-type did not match our 'header only' types, |
491 | $this->debug("......in memory"); | 491 | // but we'd issues a HEAD request because we assumed it would. So |
492 | /* | 492 | // let's queue a proper GET request for this item... |
493 | } elseif ($this->isCached($url)) { | 493 | $this->debug('Wrong guess at content-type, queing GET request'); |
494 | $this->debug("......is cached"); | 494 | $this->requests[$orig]['wrongGuess'] = true; |
495 | if (!$this->minimiseMemoryUse) { | 495 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
496 | $this->requests[$url] = $this->getCached($url); | 496 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
497 | } | 497 | // check for <meta name='fragment' content='!'/> |
498 | */ | 498 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
499 | } else { | 499 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
500 | $this->debug("Sending request for $url"); | 500 | if (isset($this->requests[$orig]['body'])) { |
501 | $this->requests[$orig]['original_url'] = $orig; | 501 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
502 | $req_url = $this->rewriteUrls($url); | 502 | if ($redirectURL) { |
503 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 503 | $this->redirectQueue[$orig] = $redirectURL; |
504 | $req_url = $this->removeFragment($req_url); | 504 | } |
505 | // send cookies, if we have any | 505 | } |
506 | $httpContext = $this->httpContext; | 506 | } |
507 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; | 507 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
508 | // add referer for picky sites | 508 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); |
509 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; | 509 | } |
510 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 510 | } |
511 | $this->debug("......sending cookies: $cookies"); | 511 | } |
512 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; | 512 | } |
513 | } | 513 | |
514 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { | 514 | ////////////////////////////////////////////////////// |
515 | $this->debug('Received response'); | 515 | // sequential (file_get_contents) |
516 | // get status code | 516 | else { |
517 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { | 517 | $this->debug('Starting sequential fetch (file_get_contents)'); |
518 | $this->debug('Error: no status code found'); | 518 | $this->debug('Processing set of '.count($urls)); |
519 | // TODO: handle error - no status code | 519 | foreach ($urls as $orig => $url) { |
520 | } else { | 520 | if (!$isRedirect) $orig = $url; |
521 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); | 521 | unset($this->redirectQueue[$orig]); |
522 | // check content type | 522 | $this->debug("...$url"); |
523 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 523 | if (!$isRedirect && isset($this->requests[$url])) { |
524 | $this->requests[$orig]['body'] = ''; | 524 | $this->debug("......in memory"); |
525 | } else { | 525 | /* |
526 | $this->requests[$orig]['body'] = $html; | 526 | } elseif ($this->isCached($url)) { |
527 | } | 527 | $this->debug("......is cached"); |
528 | $this->requests[$orig]['effective_url'] = $req_url; | 528 | if (!$this->minimiseMemoryUse) { |
529 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; | 529 | $this->requests[$url] = $this->getCached($url); |
530 | unset($match); | 530 | } |
531 | // handle redirect | 531 | */ |
532 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 532 | } else { |
533 | $this->requests[$orig]['location'] = trim($match[1]); | 533 | $this->debug("Sending request for $url"); |
534 | } | 534 | $this->requests[$orig]['original_url'] = $orig; |
535 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 535 | $req_url = $this->rewriteUrls($url); |
536 | $redirectURL = $this->requests[$orig]['location']; | 536 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
537 | if (!preg_match('!^https?://!i', $redirectURL)) { | 537 | $req_url = $this->removeFragment($req_url); |
538 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 538 | // send cookies, if we have any |
539 | } | 539 | $httpContext = $this->httpContext; |
540 | if ($this->validateURL($redirectURL)) { | 540 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; |
541 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 541 | // add referer for picky sites |
542 | // store any cookies | 542 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; |
543 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 543 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
544 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 544 | $this->debug("......sending cookies: $cookies"); |
545 | $this->redirectQueue[$orig] = $redirectURL; | 545 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; |
546 | } else { | 546 | } |
547 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 547 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { |
548 | } | 548 | $this->debug('Received response'); |
549 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 549 | // get status code |
550 | // check for <meta name='fragment' content='!'/> | 550 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { |
551 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 551 | $this->debug('Error: no status code found'); |
552 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 552 | // TODO: handle error - no status code |
553 | if (isset($this->requests[$orig]['body'])) { | 553 | } else { |
554 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 554 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); |
555 | if ($redirectURL) { | 555 | // check content type |
556 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 556 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
557 | $this->redirectQueue[$orig] = $redirectURL; | 557 | $this->requests[$orig]['body'] = ''; |
558 | } | 558 | } else { |
559 | } | 559 | $this->requests[$orig]['body'] = $html; |
560 | } | 560 | } |
561 | } | 561 | $this->requests[$orig]['effective_url'] = $req_url; |
562 | } else { | 562 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; |
563 | $this->debug('Error retrieving URL'); | 563 | unset($match); |
564 | //print_r($req_url); | 564 | // handle redirect |
565 | //print_r($http_response_header); | 565 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
566 | //print_r($html); | 566 | $this->requests[$orig]['location'] = trim($match[1]); |
567 | 567 | } | |
568 | // TODO: handle error - failed to retrieve URL | 568 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
569 | } | 569 | $redirectURL = $this->requests[$orig]['location']; |
570 | } | 570 | if (!preg_match('!^https?://!i', $redirectURL)) { |
571 | } | 571 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
572 | } | 572 | } |
573 | } | 573 | if ($this->validateURL($redirectURL)) { |
574 | 574 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | |
575 | public function handleCurlResponse($response, $info, $request) { | 575 | // store any cookies |
576 | $orig = $request->url_original; | 576 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
577 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); | 577 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
578 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); | 578 | $this->redirectQueue[$orig] = $redirectURL; |
579 | $this->requests[$orig]['method'] = $request->method; | 579 | } else { |
580 | $this->requests[$orig]['effective_url'] = $info['url']; | 580 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
581 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; | 581 | } |
582 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 582 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
583 | $this->requests[$orig]['location'] = trim($match[1]); | 583 | // check for <meta name='fragment' content='!'/> |
584 | } | 584 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
585 | } | 585 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
586 | 586 | if (isset($this->requests[$orig]['body'])) { | |
587 | protected function headersToString(array $headers, $associative=true) { | 587 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
588 | if (!$associative) { | 588 | if ($redirectURL) { |
589 | return implode("\n", $headers); | 589 | $this->redirectQueue[$orig] = $redirectURL; |
590 | } else { | 590 | } |
591 | $str = ''; | 591 | } |
592 | foreach ($headers as $key => $val) { | 592 | } |
593 | if (is_array($val)) { | 593 | } |
594 | foreach ($val as $v) $str .= "$key: $v\n"; | 594 | } else { |
595 | } else { | 595 | $this->debug('Error retrieving URL'); |
596 | $str .= "$key: $val\n"; | 596 | //print_r($req_url); |
597 | } | 597 | //print_r($http_response_header); |
598 | } | 598 | //print_r($html); |
599 | return rtrim($str); | 599 | |
600 | } | 600 | // TODO: handle error - failed to retrieve URL |
601 | } | 601 | } |
602 | 602 | } | |
603 | public function get($url, $remove=false, $gzdecode=true) { | 603 | } |
604 | $url = "$url"; | 604 | } |
605 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 605 | } |
606 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | 606 | |
607 | $response = $this->requests[$url]; | 607 | public function handleCurlResponse($response, $info, $request) { |
608 | /* | 608 | $orig = $request->url_original; |
609 | } elseif ($this->isCached($url)) { | 609 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); |
610 | $this->debug("URL already fetched - in disk cache ($url)"); | 610 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); |
611 | $response = $this->getCached($url); | 611 | $this->requests[$orig]['method'] = $request->method; |
612 | $this->requests[$url] = $response; | 612 | $this->requests[$orig]['effective_url'] = $info['url']; |
613 | */ | 613 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; |
614 | } else { | 614 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
615 | $this->debug("Fetching URL ($url)"); | 615 | $this->requests[$orig]['location'] = trim($match[1]); |
616 | $this->fetchAll(array($url)); | 616 | } |
617 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 617 | } |
618 | $response = $this->requests[$url]; | 618 | |
619 | } else { | 619 | protected function headersToString(array $headers, $associative=true) { |
620 | $this->debug("Request failed"); | 620 | if (!$associative) { |
621 | $response = false; | 621 | return implode("\n", $headers); |
622 | } | 622 | } else { |
623 | } | 623 | $str = ''; |
624 | /* | 624 | foreach ($headers as $key => $val) { |
625 | if ($this->minimiseMemoryUse && $response) { | 625 | if (is_array($val)) { |
626 | $this->cache($url); | 626 | foreach ($val as $v) $str .= "$key: $v\n"; |
627 | unset($this->requests[$url]); | 627 | } else { |
628 | } | 628 | $str .= "$key: $val\n"; |
629 | */ | 629 | } |
630 | if ($remove && $response) unset($this->requests[$url]); | 630 | } |
631 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { | 631 | return rtrim($str); |
632 | if ($html = gzdecode($response['body'])) { | 632 | } |
633 | $response['body'] = $html; | 633 | } |
634 | } | 634 | |
635 | } | 635 | public function get($url, $remove=false, $gzdecode=true) { |
636 | return $response; | 636 | $url = "$url"; |
637 | } | 637 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
638 | 638 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | |
639 | public function parallelSupport() { | 639 | $response = $this->requests[$url]; |
640 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); | 640 | /* |
641 | } | 641 | } elseif ($this->isCached($url)) { |
642 | 642 | $this->debug("URL already fetched - in disk cache ($url)"); | |
643 | private function headerOnlyType($headers) { | 643 | $response = $this->getCached($url); |
644 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { | 644 | $this->requests[$url] = $response; |
645 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) | 645 | */ |
646 | $match[1] = strtolower(trim($match[1])); | 646 | } else { |
647 | $match[2] = strtolower(trim($match[2])); | 647 | $this->debug("Fetching URL ($url)"); |
648 | foreach (array($match[1], $match[2]) as $mime) { | 648 | $this->fetchAll(array($url)); |
649 | if (in_array($mime, $this->headerOnlyTypes)) return true; | 649 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
650 | } | 650 | $response = $this->requests[$url]; |
651 | } | 651 | } else { |
652 | return false; | 652 | $this->debug("Request failed"); |
653 | } | 653 | $response = false; |
654 | 654 | } | |
655 | private function possibleUnsupportedType($url) { | 655 | } |
656 | $path = @parse_url($url, PHP_URL_PATH); | 656 | /* |
657 | if ($path && strpos($path, '.') !== false) { | 657 | if ($this->minimiseMemoryUse && $response) { |
658 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); | 658 | $this->cache($url); |
659 | return in_array($ext, $this->headerOnlyClues); | 659 | unset($this->requests[$url]); |
660 | } | 660 | } |
661 | return false; | 661 | */ |
662 | } | 662 | if ($remove && $response) unset($this->requests[$url]); |
663 | } | 663 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { |
664 | 664 | if ($html = gzdecode($response['body'])) { | |
665 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 | 665 | $response['body'] = $html; |
666 | if (!function_exists('gzdecode')) { | 666 | } |
667 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) | 667 | } |
668 | { | 668 | return $response; |
669 | $len = strlen($data); | 669 | } |
670 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { | 670 | |
671 | $error = "Not in GZIP format."; | 671 | public function parallelSupport() { |
672 | return null; // Not GZIP format (See RFC 1952) | 672 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); |
673 | } | 673 | } |
674 | $method = ord(substr($data,2,1)); // Compression method | 674 | |
675 | $flags = ord(substr($data,3,1)); // Flags | 675 | private function headerOnlyType($headers) { |
676 | if ($flags & 31 != $flags) { | 676 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { |
677 | $error = "Reserved bits not allowed."; | 677 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) |
678 | return null; | 678 | $match[1] = strtolower(trim($match[1])); |
679 | } | 679 | $match[2] = strtolower(trim($match[2])); |
680 | // NOTE: $mtime may be negative (PHP integer limitations) | 680 | foreach (array($match[1], $match[2]) as $mime) { |
681 | $mtime = unpack("V", substr($data,4,4)); | 681 | if (in_array($mime, $this->headerOnlyTypes)) return true; |
682 | $mtime = $mtime[1]; | 682 | } |
683 | $xfl = substr($data,8,1); | 683 | } |
684 | $os = substr($data,8,1); | 684 | return false; |
685 | $headerlen = 10; | 685 | } |
686 | $extralen = 0; | 686 | |
687 | $extra = ""; | 687 | private function possibleUnsupportedType($url) { |
688 | if ($flags & 4) { | 688 | $path = @parse_url($url, PHP_URL_PATH); |
689 | // 2-byte length prefixed EXTRA data in header | 689 | if ($path && strpos($path, '.') !== false) { |
690 | if ($len - $headerlen - 2 < 8) { | 690 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); |
691 | return false; // invalid | 691 | return in_array($ext, $this->headerOnlyClues); |
692 | } | 692 | } |
693 | $extralen = unpack("v",substr($data,8,2)); | 693 | return false; |
694 | $extralen = $extralen[1]; | 694 | } |
695 | if ($len - $headerlen - 2 - $extralen < 8) { | 695 | } |
696 | return false; // invalid | 696 | |
697 | } | 697 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 |
698 | $extra = substr($data,10,$extralen); | 698 | if (!function_exists('gzdecode')) { |
699 | $headerlen += 2 + $extralen; | 699 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) |
700 | } | 700 | { |
701 | $filenamelen = 0; | 701 | $len = strlen($data); |
702 | $filename = ""; | 702 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { |
703 | if ($flags & 8) { | 703 | $error = "Not in GZIP format."; |
704 | // C-style string | 704 | return null; // Not GZIP format (See RFC 1952) |
705 | if ($len - $headerlen - 1 < 8) { | 705 | } |
706 | return false; // invalid | 706 | $method = ord(substr($data,2,1)); // Compression method |
707 | } | 707 | $flags = ord(substr($data,3,1)); // Flags |
708 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); | 708 | if ($flags & 31 != $flags) { |
709 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { | 709 | $error = "Reserved bits not allowed."; |
710 | return false; // invalid | 710 | return null; |
711 | } | 711 | } |
712 | $filename = substr($data,$headerlen,$filenamelen); | 712 | // NOTE: $mtime may be negative (PHP integer limitations) |
713 | $headerlen += $filenamelen + 1; | 713 | $mtime = unpack("V", substr($data,4,4)); |
714 | } | 714 | $mtime = $mtime[1]; |
715 | $commentlen = 0; | 715 | $xfl = substr($data,8,1); |
716 | $comment = ""; | 716 | $os = substr($data,8,1); |
717 | if ($flags & 16) { | 717 | $headerlen = 10; |
718 | // C-style string COMMENT data in header | 718 | $extralen = 0; |
719 | if ($len - $headerlen - 1 < 8) { | 719 | $extra = ""; |
720 | return false; // invalid | 720 | if ($flags & 4) { |
721 | } | 721 | // 2-byte length prefixed EXTRA data in header |
722 | $commentlen = strpos(substr($data,$headerlen),chr(0)); | 722 | if ($len - $headerlen - 2 < 8) { |
723 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { | 723 | return false; // invalid |
724 | return false; // Invalid header format | 724 | } |
725 | } | 725 | $extralen = unpack("v",substr($data,8,2)); |
726 | $comment = substr($data,$headerlen,$commentlen); | 726 | $extralen = $extralen[1]; |
727 | $headerlen += $commentlen + 1; | 727 | if ($len - $headerlen - 2 - $extralen < 8) { |
728 | } | 728 | return false; // invalid |
729 | $headercrc = ""; | 729 | } |
730 | if ($flags & 2) { | 730 | $extra = substr($data,10,$extralen); |
731 | // 2-bytes (lowest order) of CRC32 on header present | 731 | $headerlen += 2 + $extralen; |
732 | if ($len - $headerlen - 2 < 8) { | 732 | } |
733 | return false; // invalid | 733 | $filenamelen = 0; |
734 | } | 734 | $filename = ""; |
735 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; | 735 | if ($flags & 8) { |
736 | $headercrc = unpack("v", substr($data,$headerlen,2)); | 736 | // C-style string |
737 | $headercrc = $headercrc[1]; | 737 | if ($len - $headerlen - 1 < 8) { |
738 | if ($headercrc != $calccrc) { | 738 | return false; // invalid |
739 | $error = "Header checksum failed."; | 739 | } |
740 | return false; // Bad header CRC | 740 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); |
741 | } | 741 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { |
742 | $headerlen += 2; | 742 | return false; // invalid |
743 | } | 743 | } |
744 | // GZIP FOOTER | 744 | $filename = substr($data,$headerlen,$filenamelen); |
745 | $datacrc = unpack("V",substr($data,-8,4)); | 745 | $headerlen += $filenamelen + 1; |
746 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); | 746 | } |
747 | $isize = unpack("V",substr($data,-4)); | 747 | $commentlen = 0; |
748 | $isize = $isize[1]; | 748 | $comment = ""; |
749 | // decompression: | 749 | if ($flags & 16) { |
750 | $bodylen = $len-$headerlen-8; | 750 | // C-style string COMMENT data in header |
751 | if ($bodylen < 1) { | 751 | if ($len - $headerlen - 1 < 8) { |
752 | // IMPLEMENTATION BUG! | 752 | return false; // invalid |
753 | return null; | 753 | } |
754 | } | 754 | $commentlen = strpos(substr($data,$headerlen),chr(0)); |
755 | $body = substr($data,$headerlen,$bodylen); | 755 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { |
756 | $data = ""; | 756 | return false; // Invalid header format |
757 | if ($bodylen > 0) { | 757 | } |
758 | switch ($method) { | 758 | $comment = substr($data,$headerlen,$commentlen); |
759 | case 8: | 759 | $headerlen += $commentlen + 1; |
760 | // Currently the only supported compression method: | 760 | } |
761 | $data = gzinflate($body,$maxlength); | 761 | $headercrc = ""; |
762 | break; | 762 | if ($flags & 2) { |
763 | default: | 763 | // 2-bytes (lowest order) of CRC32 on header present |
764 | $error = "Unknown compression method."; | 764 | if ($len - $headerlen - 2 < 8) { |
765 | return false; | 765 | return false; // invalid |
766 | } | 766 | } |
767 | } // zero-byte body content is allowed | 767 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; |
768 | // Verifiy CRC32 | 768 | $headercrc = unpack("v", substr($data,$headerlen,2)); |
769 | $crc = sprintf("%u",crc32($data)); | 769 | $headercrc = $headercrc[1]; |
770 | $crcOK = $crc == $datacrc; | 770 | if ($headercrc != $calccrc) { |
771 | $lenOK = $isize == strlen($data); | 771 | $error = "Header checksum failed."; |
772 | if (!$lenOK || !$crcOK) { | 772 | return false; // Bad header CRC |
773 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | 773 | } |
774 | return false; | 774 | $headerlen += 2; |
775 | } | 775 | } |
776 | return $data; | 776 | // GZIP FOOTER |
777 | } | 777 | $datacrc = unpack("V",substr($data,-8,4)); |
778 | } | 778 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); |
779 | ?> \ No newline at end of file | 779 | $isize = unpack("V",substr($data,-4)); |
780 | $isize = $isize[1]; | ||
781 | // decompression: | ||
782 | $bodylen = $len-$headerlen-8; | ||
783 | if ($bodylen < 1) { | ||
784 | // IMPLEMENTATION BUG! | ||
785 | return null; | ||
786 | } | ||
787 | $body = substr($data,$headerlen,$bodylen); | ||
788 | $data = ""; | ||
789 | if ($bodylen > 0) { | ||
790 | switch ($method) { | ||
791 | case 8: | ||
792 | // Currently the only supported compression method: | ||
793 | $data = gzinflate($body,$maxlength); | ||
794 | break; | ||
795 | default: | ||
796 | $error = "Unknown compression method."; | ||
797 | return false; | ||
798 | } | ||
799 | } // zero-byte body content is allowed | ||
800 | // Verifiy CRC32 | ||
801 | $crc = sprintf("%u",crc32($data)); | ||
802 | $crcOK = $crc == $datacrc; | ||
803 | $lenOK = $isize == strlen($data); | ||
804 | if (!$lenOK || !$crcOK) { | ||
805 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | ||
806 | return false; | ||
807 | } | ||
808 | return $data; | ||
809 | } | ||
810 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php index ecd46d5f..c524a1ee 100644 --- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php | |||
@@ -1,79 +1,78 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Humble HTTP Agent extension for SimplePie_File | 3 | * Humble HTTP Agent extension for SimplePie_File |
4 | * | 4 | * |
5 | * This class is designed to extend and override SimplePie_File | 5 | * This class is designed to extend and override SimplePie_File |
6 | * in order to prevent duplicate HTTP requests being sent out. | 6 | * in order to prevent duplicate HTTP requests being sent out. |
7 | * The idea is to initialise an instance of Humble HTTP Agent | 7 | * The idea is to initialise an instance of Humble HTTP Agent |
8 | * and attach it, to a static class variable, of this class. | 8 | * and attach it, to a static class variable, of this class. |
9 | * SimplePie will then automatically initialise this class | 9 | * SimplePie will then automatically initialise this class |
10 | * | 10 | * |
11 | * @date 2011-02-28 | 11 | * @date 2011-02-28 |
12 | */ | 12 | */ |
13 | 13 | ||
14 | class SimplePie_HumbleHttpAgent extends SimplePie_File | 14 | class SimplePie_HumbleHttpAgent extends SimplePie_File |
15 | { | 15 | { |
16 | protected static $agent; | 16 | protected static $agent; |
17 | var $url; | 17 | var $url; |
18 | var $useragent; | 18 | var $useragent; |
19 | var $success = true; | 19 | var $success = true; |
20 | var $headers = array(); | 20 | var $headers = array(); |
21 | var $body; | 21 | var $body; |
22 | var $status_code; | 22 | var $status_code; |
23 | var $redirects = 0; | 23 | var $redirects = 0; |
24 | var $error; | 24 | var $error; |
25 | var $method = SIMPLEPIE_FILE_SOURCE_NONE; | 25 | var $method = SIMPLEPIE_FILE_SOURCE_NONE; |
26 | 26 | ||
27 | public static function set_agent(HumbleHttpAgent $agent) { | 27 | public static function set_agent(HumbleHttpAgent $agent) { |
28 | self::$agent = $agent; | 28 | self::$agent = $agent; |
29 | } | 29 | } |
30 | 30 | ||
31 | public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { | 31 | public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { |
32 | if (class_exists('idna_convert')) | 32 | if (class_exists('idna_convert')) |
33 | { | 33 | { |
34 | $idn = new idna_convert(); | 34 | $idn = new idna_convert(); |
35 | $parsed = SimplePie_Misc::parse_url($url); | 35 | $parsed = SimplePie_Misc::parse_url($url); |
36 | $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); | 36 | $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); |
37 | } | 37 | } |
38 | $this->url = $url; | 38 | $this->url = $url; |
39 | $this->useragent = $useragent; | 39 | $this->useragent = $useragent; |
40 | if (preg_match('/^http(s)?:\/\//i', $url)) | 40 | if (preg_match('/^http(s)?:\/\//i', $url)) |
41 | { | 41 | { |
42 | if (!is_array($headers)) | 42 | if (!is_array($headers)) |
43 | { | 43 | { |
44 | $headers = array(); | 44 | $headers = array(); |
45 | } | 45 | } |
46 | $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; | 46 | $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; |
47 | $headers2 = array(); | 47 | $headers2 = array(); |
48 | foreach ($headers as $key => $value) { | 48 | foreach ($headers as $key => $value) { |
49 | $headers2[] = "$key: $value"; | 49 | $headers2[] = "$key: $value"; |
50 | } | 50 | } |
51 | //TODO: allow for HTTP headers | 51 | //TODO: allow for HTTP headers |
52 | // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); | 52 | // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); |
53 | 53 | ||
54 | $response = self::$agent->get($url); | 54 | $response = self::$agent->get($url); |
55 | 55 | ||
56 | if ($response === false || !isset($response['status_code'])) { | 56 | if ($response === false || !isset($response['status_code'])) { |
57 | $this->error = 'failed to fetch URL'; | 57 | $this->error = 'failed to fetch URL'; |
58 | $this->success = false; | 58 | $this->success = false; |
59 | } else { | 59 | } else { |
60 | // The extra lines at the end are there to satisfy SimplePie's HTTP parser. | 60 | // The extra lines at the end are there to satisfy SimplePie's HTTP parser. |
61 | // The class expects a full HTTP message, whereas we're giving it only | 61 | // The class expects a full HTTP message, whereas we're giving it only |
62 | // headers - the new lines indicate the start of the body. | 62 | // headers - the new lines indicate the start of the body. |
63 | $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); | 63 | $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); |
64 | if ($parser->parse()) { | 64 | if ($parser->parse()) { |
65 | $this->headers = $parser->headers; | 65 | $this->headers = $parser->headers; |
66 | //$this->body = $parser->body; | 66 | //$this->body = $parser->body; |
67 | $this->body = $response['body']; | 67 | $this->body = $response['body']; |
68 | $this->status_code = $parser->status_code; | 68 | $this->status_code = $parser->status_code; |
69 | } | 69 | } |
70 | } | 70 | } |
71 | } | 71 | } |
72 | else | 72 | else |
73 | { | 73 | { |
74 | $this->error = 'invalid URL'; | 74 | $this->error = 'invalid URL'; |
75 | $this->success = false; | 75 | $this->success = false; |
76 | } | 76 | } |
77 | } | 77 | } |
78 | } | 78 | } \ No newline at end of file |
79 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php index 09b11546..382d869c 100644 --- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php | |||
@@ -6,23 +6,24 @@ | |||
6 | * Attempts to detect the language of a sample of text by correlating ranked | 6 | * Attempts to detect the language of a sample of text by correlating ranked |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | 7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. |
8 | * | 8 | * |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | 9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle |
10 | * (1994): "N-Gram-Based Text Categorization" | 10 | * (1994): "N-Gram-Based Text Categorization" |
11 | * | 11 | * |
12 | * PHP versions 4 and 5 | 12 | * PHP version 5 |
13 | * | 13 | * |
14 | * @category Text | 14 | * @category Text |
15 | * @package Text_LanguageDetect | 15 | * @package Text_LanguageDetect |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
17 | * @copyright 2005-2006 Nicholas Pisarro | 17 | * @copyright 2005-2006 Nicholas Pisarro |
18 | * @license http://www.debian.org/misc/bsd.license BSD | 18 | * @license http://www.debian.org/misc/bsd.license BSD |
19 | * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ | 19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | 20 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
21 | * @link http://langdetect.blogspot.com/ | 21 | * @link http://langdetect.blogspot.com/ |
22 | */ | 22 | */ |
23 | 23 | ||
24 | //require_once 'PEAR.php'; | 24 | require_once 'LanguageDetect/Exception.php'; |
25 | require_once 'Parser.php'; | 25 | require_once 'LanguageDetect/Parser.php'; |
26 | require_once 'LanguageDetect/ISO639.php'; | ||
26 | 27 | ||
27 | /** | 28 | /** |
28 | * Language detection class | 29 | * Language detection class |
@@ -41,9 +42,10 @@ require_once 'Parser.php'; | |||
41 | * | 42 | * |
42 | * echo "Supported languages:\n"; | 43 | * echo "Supported languages:\n"; |
43 | * | 44 | * |
44 | * $langs = $l->getLanguages(); | 45 | * try { |
45 | * if (PEAR::isError($langs)) { | 46 | * $langs = $l->getLanguages(); |
46 | * die($langs->getMessage()); | 47 | * } catch (Text_LanguageDetect_Exception $e) { |
48 | * die($e->getMessage()); | ||
47 | * } | 49 | * } |
48 | * | 50 | * |
49 | * sort($langs); | 51 | * sort($langs); |
@@ -54,38 +56,38 @@ require_once 'Parser.php'; | |||
54 | * } | 56 | * } |
55 | * </code> | 57 | * </code> |
56 | * | 58 | * |
57 | * @category Text | 59 | * @category Text |
58 | * @package Text_LanguageDetect | 60 | * @package Text_LanguageDetect |
59 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
60 | * @copyright 2005 Nicholas Pisarro | 62 | * @copyright 2005 Nicholas Pisarro |
61 | * @license http://www.debian.org/misc/bsd.license BSD | 63 | * @license http://www.debian.org/misc/bsd.license BSD |
62 | * @version Release: @package_version@ | 64 | * @version Release: @package_version@ |
63 | * @todo allow users to generate their own language models | 65 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
66 | * @todo allow users to generate their own language models | ||
64 | */ | 67 | */ |
65 | |||
66 | class Text_LanguageDetect | 68 | class Text_LanguageDetect |
67 | { | 69 | { |
68 | /** | 70 | /** |
69 | * The filename that stores the trigram data for the detector | 71 | * The filename that stores the trigram data for the detector |
70 | * | 72 | * |
71 | * If this value starts with a slash (/) or a dot (.) the value of | 73 | * If this value starts with a slash (/) or a dot (.) the value of |
72 | * $this->_data_dir will be ignored | 74 | * $this->_data_dir will be ignored |
73 | * | 75 | * |
74 | * @var string | 76 | * @var string |
75 | * @access private | 77 | * @access private |
76 | */ | 78 | */ |
77 | var $_db_filename = './lang.dat'; | 79 | var $_db_filename = 'lang.dat'; |
78 | 80 | ||
79 | /** | 81 | /** |
80 | * The filename that stores the unicode block definitions | 82 | * The filename that stores the unicode block definitions |
81 | * | 83 | * |
82 | * If this value starts with a slash (/) or a dot (.) the value of | 84 | * If this value starts with a slash (/) or a dot (.) the value of |
83 | * $this->_data_dir will be ignored | 85 | * $this->_data_dir will be ignored |
84 | * | 86 | * |
85 | * @var string | 87 | * @var string |
86 | * @access private | 88 | * @access private |
87 | */ | 89 | */ |
88 | var $_unicode_db_filename = './unicode_blocks.dat'; | 90 | var $_unicode_db_filename = 'unicode_blocks.dat'; |
89 | 91 | ||
90 | /** | 92 | /** |
91 | * The data directory | 93 | * The data directory |
@@ -99,11 +101,8 @@ class Text_LanguageDetect | |||
99 | 101 | ||
100 | /** | 102 | /** |
101 | * The trigram data for comparison | 103 | * The trigram data for comparison |
102 | * | ||
103 | * Will be loaded on start from $this->_db_filename | ||
104 | * | 104 | * |
105 | * May be set to a PEAR_Error object if there is an error during its | 105 | * Will be loaded on start from $this->_db_filename |
106 | * initialization | ||
107 | * | 106 | * |
108 | * @var array | 107 | * @var array |
109 | * @access private | 108 | * @access private |
@@ -120,7 +119,7 @@ class Text_LanguageDetect | |||
120 | 119 | ||
121 | /** | 120 | /** |
122 | * The size of the trigram data arrays | 121 | * The size of the trigram data arrays |
123 | * | 122 | * |
124 | * @var int | 123 | * @var int |
125 | * @access private | 124 | * @access private |
126 | */ | 125 | */ |
@@ -140,7 +139,7 @@ class Text_LanguageDetect | |||
140 | 139 | ||
141 | /** | 140 | /** |
142 | * Whether or not to simulate perl's Language::Guess exactly | 141 | * Whether or not to simulate perl's Language::Guess exactly |
143 | * | 142 | * |
144 | * @access private | 143 | * @access private |
145 | * @var bool | 144 | * @var bool |
146 | * @see setPerlCompatible() | 145 | * @see setPerlCompatible() |
@@ -165,18 +164,24 @@ class Text_LanguageDetect | |||
165 | var $_clusters; | 164 | var $_clusters; |
166 | 165 | ||
167 | /** | 166 | /** |
167 | * Which type of "language names" are accepted and returned: | ||
168 | * | ||
169 | * 0 - language name ("english") | ||
170 | * 2 - 2-letter ISO 639-1 code ("en") | ||
171 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
172 | */ | ||
173 | var $_name_mode = 0; | ||
174 | |||
175 | /** | ||
168 | * Constructor | 176 | * Constructor |
169 | * | 177 | * |
170 | * Will attempt to load the language database. If it fails, you will get | 178 | * Will attempt to load the language database. If it fails, you will get |
171 | * a PEAR_Error object returned when you try to use detect() | 179 | * an exception. |
172 | * | ||
173 | */ | 180 | */ |
174 | function Text_LanguageDetect($db=null, $unicode_db=null) | 181 | function __construct() |
175 | { | 182 | { |
176 | if (isset($db)) $this->_db_filename = $db; | ||
177 | if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; | ||
178 | |||
179 | $data = $this->_readdb($this->_db_filename); | 183 | $data = $this->_readdb($this->_db_filename); |
184 | $this->_checkTrigram($data['trigram']); | ||
180 | $this->_lang_db = $data['trigram']; | 185 | $this->_lang_db = $data['trigram']; |
181 | 186 | ||
182 | if (isset($data['trigram-unicodemap'])) { | 187 | if (isset($data['trigram-unicodemap'])) { |
@@ -186,29 +191,32 @@ class Text_LanguageDetect | |||
186 | // Not yet implemented: | 191 | // Not yet implemented: |
187 | if (isset($data['trigram-clusters'])) { | 192 | if (isset($data['trigram-clusters'])) { |
188 | $this->_clusters = $data['trigram-clusters']; | 193 | $this->_clusters = $data['trigram-clusters']; |
189 | } | 194 | } |
190 | } | 195 | } |
191 | 196 | ||
192 | /** | 197 | /** |
193 | * Returns the path to the location of the database | 198 | * Returns the path to the location of the database |
194 | * | 199 | * |
195 | * @access private | 200 | * @param string $fname File name to load |
196 | * @return string expected path to the language model database | 201 | * |
202 | * @return string expected path to the language model database | ||
203 | * @access private | ||
197 | */ | 204 | */ |
198 | function _get_data_loc($fname) | 205 | function _get_data_loc($fname) |
199 | { | 206 | { |
200 | return $fname; | 207 | return dirname(__FILE__).'/'.$fname; |
201 | } | 208 | } |
202 | 209 | ||
203 | /** | 210 | /** |
204 | * Loads the language trigram database from filename | 211 | * Loads the language trigram database from filename |
205 | * | 212 | * |
206 | * Trigram database should be a serialize()'d array | 213 | * Trigram database should be a serialize()'d array |
207 | * | 214 | * |
208 | * @access private | 215 | * @param string $fname the filename where the data is stored |
209 | * @param string $fname the filename where the data is stored | 216 | * |
210 | * @return array the language model data | 217 | * @return array the language model data |
211 | * @throws PEAR_Error | 218 | * @throws Text_LanguageDetect_Exception |
219 | * @access private | ||
212 | */ | 220 | */ |
213 | function _readdb($fname) | 221 | function _readdb($fname) |
214 | { | 222 | { |
@@ -217,79 +225,74 @@ class Text_LanguageDetect | |||
217 | 225 | ||
218 | // input check | 226 | // input check |
219 | if (!file_exists($fname)) { | 227 | if (!file_exists($fname)) { |
220 | throw new Exception('Language database does not exist.'); | 228 | throw new Text_LanguageDetect_Exception( |
229 | 'Language database does not exist: ' . $fname, | ||
230 | Text_LanguageDetect_Exception::DB_NOT_FOUND | ||
231 | ); | ||
221 | } elseif (!is_readable($fname)) { | 232 | } elseif (!is_readable($fname)) { |
222 | throw new Exception('Language database is not readable.'); | 233 | throw new Text_LanguageDetect_Exception( |
234 | 'Language database is not readable: ' . $fname, | ||
235 | Text_LanguageDetect_Exception::DB_NOT_READABLE | ||
236 | ); | ||
223 | } | 237 | } |
224 | 238 | ||
225 | if (function_exists('file_get_contents')) { | 239 | return unserialize(file_get_contents($fname)); |
226 | return unserialize(file_get_contents($fname)); | ||
227 | } else { | ||
228 | // if you don't have file_get_contents(), | ||
229 | // then this is the next fastest way | ||
230 | ob_start(); | ||
231 | readfile($fname); | ||
232 | $contents = ob_get_contents(); | ||
233 | ob_end_clean(); | ||
234 | return unserialize($contents); | ||
235 | } | ||
236 | } | 240 | } |
237 | 241 | ||
238 | 242 | ||
239 | /** | 243 | /** |
240 | * Checks if this object is ready to detect languages | 244 | * Checks if this object is ready to detect languages |
241 | * | 245 | * |
242 | * @access private | 246 | * @param array $trigram Trigram data from database |
243 | * @param mixed &$err error object to be returned by reference, if any | 247 | * |
244 | * @return bool true if no errors | 248 | * @return void |
249 | * @access private | ||
245 | */ | 250 | */ |
246 | function _setup_ok(&$err) | 251 | function _checkTrigram($trigram) |
247 | { | 252 | { |
248 | if (!is_array($this->_lang_db)) { | 253 | if (!is_array($trigram)) { |
249 | if (ini_get('magic_quotes_runtime')) { | 254 | if (ini_get('magic_quotes_runtime')) { |
250 | throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); | 255 | throw new Text_LanguageDetect_Exception( |
251 | } else { | 256 | 'Error loading database. Try turning magic_quotes_runtime off.', |
252 | throw new Exception('Language database is not an array.'); | 257 | Text_LanguageDetect_Exception::MAGIC_QUOTES |
258 | ); | ||
253 | } | 259 | } |
254 | return false; | 260 | throw new Text_LanguageDetect_Exception( |
255 | 261 | 'Language database is not an array.', | |
256 | } elseif (empty($this->_lang_db)) { | 262 | Text_LanguageDetect_Exception::DB_NOT_ARRAY |
257 | throw new Exception('Language database has no elements.'); | 263 | ); |
258 | return false; | 264 | } elseif (empty($trigram)) { |
259 | 265 | throw new Text_LanguageDetect_Exception( | |
260 | } else { | 266 | 'Language database has no elements.', |
261 | return true; | 267 | Text_LanguageDetect_Exception::DB_EMPTY |
268 | ); | ||
262 | } | 269 | } |
263 | } | 270 | } |
264 | 271 | ||
265 | /** | 272 | /** |
266 | * Omits languages | 273 | * Omits languages |
267 | * | 274 | * |
268 | * Pass this function the name of or an array of names of | 275 | * Pass this function the name of or an array of names of |
269 | * languages that you don't want considered | 276 | * languages that you don't want considered |
270 | * | 277 | * |
271 | * If you're only expecting a limited set of languages, this can greatly | 278 | * If you're only expecting a limited set of languages, this can greatly |
272 | * speed up processing | 279 | * speed up processing |
273 | * | 280 | * |
274 | * @access public | 281 | * @param mixed $omit_list language name or array of names to omit |
275 | * @param mixed $omit_list language name or array of names to omit | 282 | * @param bool $include_only if true will include (rather than |
276 | * @param bool $include_only if true will include (rather than | 283 | * exclude) only those in the list |
277 | * exclude) only those in the list | 284 | * |
278 | * @return int number of languages successfully deleted | 285 | * @return int number of languages successfully deleted |
279 | * @throws PEAR_Error | 286 | * @throws Text_LanguageDetect_Exception |
280 | */ | 287 | */ |
281 | function omitLanguages($omit_list, $include_only = false) | 288 | public function omitLanguages($omit_list, $include_only = false) |
282 | { | 289 | { |
283 | |||
284 | // setup check | ||
285 | if (!$this->_setup_ok($err)) { | ||
286 | return $err; | ||
287 | } | ||
288 | |||
289 | $deleted = 0; | 290 | $deleted = 0; |
290 | 291 | ||
291 | // deleting the given languages | 292 | $omit_list = $this->_convertFromNameMode($omit_list); |
293 | |||
292 | if (!$include_only) { | 294 | if (!$include_only) { |
295 | // deleting the given languages | ||
293 | if (!is_array($omit_list)) { | 296 | if (!is_array($omit_list)) { |
294 | $omit_list = strtolower($omit_list); // case desensitize | 297 | $omit_list = strtolower($omit_list); // case desensitize |
295 | if (isset($this->_lang_db[$omit_list])) { | 298 | if (isset($this->_lang_db[$omit_list])) { |
@@ -301,12 +304,12 @@ class Text_LanguageDetect | |||
301 | if (isset($this->_lang_db[$omit_lang])) { | 304 | if (isset($this->_lang_db[$omit_lang])) { |
302 | unset($this->_lang_db[$omit_lang]); | 305 | unset($this->_lang_db[$omit_lang]); |
303 | $deleted++; | 306 | $deleted++; |
304 | } | 307 | } |
305 | } | 308 | } |
306 | } | 309 | } |
307 | 310 | ||
308 | // deleting all except the given languages | ||
309 | } else { | 311 | } else { |
312 | // deleting all except the given languages | ||
310 | if (!is_array($omit_list)) { | 313 | if (!is_array($omit_list)) { |
311 | $omit_list = array($omit_list); | 314 | $omit_list = array($omit_list); |
312 | } | 315 | } |
@@ -327,7 +330,7 @@ class Text_LanguageDetect | |||
327 | // reset the cluster cache if the number of languages changes | 330 | // reset the cluster cache if the number of languages changes |
328 | // this will then have to be recalculated | 331 | // this will then have to be recalculated |
329 | if (isset($this->_clusters) && $deleted > 0) { | 332 | if (isset($this->_clusters) && $deleted > 0) { |
330 | unset($this->_clusters); | 333 | $this->_clusters = null; |
331 | } | 334 | } |
332 | 335 | ||
333 | return $deleted; | 336 | return $deleted; |
@@ -339,49 +342,40 @@ class Text_LanguageDetect | |||
339 | * | 342 | * |
340 | * @access public | 343 | * @access public |
341 | * @return int the number of languages | 344 | * @return int the number of languages |
342 | * @throws PEAR_Error | 345 | * @throws Text_LanguageDetect_Exception |
343 | */ | 346 | */ |
344 | function getLanguageCount() | 347 | function getLanguageCount() |
345 | { | 348 | { |
346 | if (!$this->_setup_ok($err)) { | 349 | return count($this->_lang_db); |
347 | return $err; | ||
348 | } else { | ||
349 | return count($this->_lang_db); | ||
350 | } | ||
351 | } | 350 | } |
352 | 351 | ||
353 | /** | 352 | /** |
354 | * Returns true if a given language exists | 353 | * Checks if the language with the given name exists in the database |
355 | * | 354 | * |
356 | * If passed an array of names, will return true only if all exist | 355 | * @param mixed $lang Language name or array of language names |
357 | * | 356 | * |
358 | * @access public | 357 | * @return bool true if language model exists |
359 | * @param mixed $lang language name or array of language names | ||
360 | * @return bool true if language model exists | ||
361 | * @throws PEAR_Error | ||
362 | */ | 358 | */ |
363 | function languageExists($lang) | 359 | public function languageExists($lang) |
364 | { | 360 | { |
365 | if (!$this->_setup_ok($err)) { | 361 | $lang = $this->_convertFromNameMode($lang); |
366 | return $err; | ||
367 | } else { | ||
368 | // string | ||
369 | if (is_string($lang)) { | ||
370 | return isset($this->_lang_db[strtolower($lang)]); | ||
371 | |||
372 | // array | ||
373 | } elseif (is_array($lang)) { | ||
374 | foreach ($lang as $test_lang) { | ||
375 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
376 | return false; | ||
377 | } | ||
378 | } | ||
379 | return true; | ||
380 | 362 | ||
381 | // other (error) | 363 | if (is_string($lang)) { |
382 | } else { | 364 | return isset($this->_lang_db[strtolower($lang)]); |
383 | throw new Exception('Unknown type passed to languageExists()'); | 365 | |
366 | } elseif (is_array($lang)) { | ||
367 | foreach ($lang as $test_lang) { | ||
368 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
369 | return false; | ||
370 | } | ||
384 | } | 371 | } |
372 | return true; | ||
373 | |||
374 | } else { | ||
375 | throw new Text_LanguageDetect_Exception( | ||
376 | 'Unsupported parameter type passed to languageExists()', | ||
377 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
378 | ); | ||
385 | } | 379 | } |
386 | } | 380 | } |
387 | 381 | ||
@@ -389,25 +383,24 @@ class Text_LanguageDetect | |||
389 | * Returns the list of detectable languages | 383 | * Returns the list of detectable languages |
390 | * | 384 | * |
391 | * @access public | 385 | * @access public |
392 | * @return array the names of the languages known to this object | 386 | * @return array the names of the languages known to this object |
393 | * @throws PEAR_Error | 387 | * @throws Text_LanguageDetect_Exception |
394 | */ | 388 | */ |
395 | function getLanguages() | 389 | function getLanguages() |
396 | { | 390 | { |
397 | if (!$this->_setup_ok($err)) { | 391 | return $this->_convertToNameMode( |
398 | return $err; | 392 | array_keys($this->_lang_db) |
399 | } else { | 393 | ); |
400 | return array_keys($this->_lang_db); | ||
401 | } | ||
402 | } | 394 | } |
403 | 395 | ||
404 | /** | 396 | /** |
405 | * Make this object behave like Language::Guess | 397 | * Make this object behave like Language::Guess |
406 | * | 398 | * |
407 | * @access public | 399 | * @param bool $setting false to turn off perl compatibility |
408 | * @param bool $setting false to turn off perl compatibility | 400 | * |
401 | * @return void | ||
409 | */ | 402 | */ |
410 | function setPerlCompatible($setting = true) | 403 | public function setPerlCompatible($setting = true) |
411 | { | 404 | { |
412 | if (is_bool($setting)) { // input check | 405 | if (is_bool($setting)) { // input check |
413 | $this->_perl_compatible = $setting; | 406 | $this->_perl_compatible = $setting; |
@@ -422,6 +415,21 @@ class Text_LanguageDetect | |||
422 | } | 415 | } |
423 | 416 | ||
424 | /** | 417 | /** |
418 | * Sets how language names are accepted and returned. | ||
419 | * | ||
420 | * @param integer $name_mode One of the following modes: | ||
421 | * 0 - language name ("english") | ||
422 | * 2 - 2-letter ISO 639-1 code ("en") | ||
423 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
424 | * | ||
425 | * @return void | ||
426 | */ | ||
427 | function setNameMode($name_mode) | ||
428 | { | ||
429 | $this->_name_mode = $name_mode; | ||
430 | } | ||
431 | |||
432 | /** | ||
425 | * Whether to use unicode block ranges in detection | 433 | * Whether to use unicode block ranges in detection |
426 | * | 434 | * |
427 | * Should speed up most detections if turned on (default is on). In some | 435 | * Should speed up most detections if turned on (default is on). In some |
@@ -429,10 +437,11 @@ class Text_LanguageDetect | |||
429 | * in languages that use latin scripts. In other cases it should speed up | 437 | * in languages that use latin scripts. In other cases it should speed up |
430 | * detection noticeably. | 438 | * detection noticeably. |
431 | * | 439 | * |
432 | * @access public | 440 | * @param bool $setting false to turn off |
433 | * @param bool $setting false to turn off | 441 | * |
442 | * @return void | ||
434 | */ | 443 | */ |
435 | function useUnicodeBlocks($setting = true) | 444 | public function useUnicodeBlocks($setting = true) |
436 | { | 445 | { |
437 | if (is_bool($setting)) { | 446 | if (is_bool($setting)) { |
438 | $this->_use_unicode_narrowing = $setting; | 447 | $this->_use_unicode_narrowing = $setting; |
@@ -442,15 +451,15 @@ class Text_LanguageDetect | |||
442 | /** | 451 | /** |
443 | * Converts a piece of text into trigrams | 452 | * Converts a piece of text into trigrams |
444 | * | 453 | * |
445 | * Superseded by the Text_LanguageDetect_Parser class | 454 | * @param string $text text to convert |
446 | * | 455 | * |
447 | * @access private | 456 | * @return array array of trigram frequencies |
448 | * @param string $text text to convert | 457 | * @access private |
449 | * @return array array of trigram frequencies | 458 | * @deprecated Superseded by the Text_LanguageDetect_Parser class |
450 | */ | 459 | */ |
451 | function _trigram($text) | 460 | function _trigram($text) |
452 | { | 461 | { |
453 | $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); | 462 | $s = new Text_LanguageDetect_Parser($text); |
454 | $s->prepareTrigram(); | 463 | $s->prepareTrigram(); |
455 | $s->prepareUnicode(false); | 464 | $s->prepareUnicode(false); |
456 | $s->setPadStart(!$this->_perl_compatible); | 465 | $s->setPadStart(!$this->_perl_compatible); |
@@ -463,11 +472,12 @@ class Text_LanguageDetect | |||
463 | * | 472 | * |
464 | * Thresholds (cuts off) the list at $this->_threshold | 473 | * Thresholds (cuts off) the list at $this->_threshold |
465 | * | 474 | * |
466 | * @access protected | 475 | * @param array $arr array of trigram |
467 | * @param array $arr array of trgram | 476 | * |
468 | * @return array ranks of trigrams | 477 | * @return array ranks of trigrams |
478 | * @access protected | ||
469 | */ | 479 | */ |
470 | function _arr_rank(&$arr) | 480 | function _arr_rank($arr) |
471 | { | 481 | { |
472 | 482 | ||
473 | // sorts alphabetically first as a standard way of breaking rank ties | 483 | // sorts alphabetically first as a standard way of breaking rank ties |
@@ -494,14 +504,17 @@ class Text_LanguageDetect | |||
494 | 504 | ||
495 | /** | 505 | /** |
496 | * Sorts an array by value breaking ties alphabetically | 506 | * Sorts an array by value breaking ties alphabetically |
497 | * | 507 | * |
498 | * @access private | 508 | * @param array &$arr the array to sort |
499 | * @param array &$arr the array to sort | 509 | * |
510 | * @return void | ||
511 | * @access private | ||
500 | */ | 512 | */ |
501 | function _bub_sort(&$arr) | 513 | function _bub_sort(&$arr) |
502 | { | 514 | { |
503 | // should do the same as this perl statement: | 515 | // should do the same as this perl statement: |
504 | // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | 516 | // sort { $trigrams{$b} == $trigrams{$a} |
517 | // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | ||
505 | 518 | ||
506 | // needs to sort by both key and value at once | 519 | // needs to sort by both key and value at once |
507 | // using the key to break ties for the value | 520 | // using the key to break ties for the value |
@@ -528,13 +541,14 @@ class Text_LanguageDetect | |||
528 | /** | 541 | /** |
529 | * Sort function used by bubble sort | 542 | * Sort function used by bubble sort |
530 | * | 543 | * |
531 | * Callback function for usort(). | 544 | * Callback function for usort(). |
532 | * | 545 | * |
533 | * @access private | 546 | * @param array $a first param passed by usort() |
534 | * @param array first param passed by usort() | 547 | * @param array $b second param passed by usort() |
535 | * @param array second param passed by usort() | 548 | * |
536 | * @return int 1 if $a is greater, -1 if not | 549 | * @return int 1 if $a is greater, -1 if not |
537 | * @see _bub_sort() | 550 | * @see _bub_sort() |
551 | * @access private | ||
538 | */ | 552 | */ |
539 | function _sort_func($a, $b) | 553 | function _sort_func($a, $b) |
540 | { | 554 | { |
@@ -542,12 +556,12 @@ class Text_LanguageDetect | |||
542 | list($a_key, $a_value) = $a; | 556 | list($a_key, $a_value) = $a; |
543 | list($b_key, $b_value) = $b; | 557 | list($b_key, $b_value) = $b; |
544 | 558 | ||
545 | // if the values are the same, break ties using the key | ||
546 | if ($a_value == $b_value) { | 559 | if ($a_value == $b_value) { |
560 | // if the values are the same, break ties using the key | ||
547 | return strcmp($a_key, $b_key); | 561 | return strcmp($a_key, $b_key); |
548 | 562 | ||
549 | // if not, just sort normally | ||
550 | } else { | 563 | } else { |
564 | // if not, just sort normally | ||
551 | if ($a_value > $b_value) { | 565 | if ($a_value > $b_value) { |
552 | return -1; | 566 | return -1; |
553 | } else { | 567 | } else { |
@@ -559,23 +573,24 @@ class Text_LanguageDetect | |||
559 | } | 573 | } |
560 | 574 | ||
561 | /** | 575 | /** |
562 | * Calculates a linear rank-order distance statistic between two sets of | 576 | * Calculates a linear rank-order distance statistic between two sets of |
563 | * ranked trigrams | 577 | * ranked trigrams |
564 | * | 578 | * |
565 | * Sums the differences in rank for each trigram. If the trigram does not | 579 | * Sums the differences in rank for each trigram. If the trigram does not |
566 | * appear in both, consider it a difference of $this->_threshold. | 580 | * appear in both, consider it a difference of $this->_threshold. |
567 | * | 581 | * |
568 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | 582 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite |
569 | * its simplicity it has been shown to be highly accurate for language | 583 | * its simplicity it has been shown to be highly accurate for language |
570 | * identification tasks. | 584 | * identification tasks. |
571 | * | 585 | * |
572 | * @access private | 586 | * @param array $arr1 the reference set of trigram ranks |
573 | * @param array $arr1 the reference set of trigram ranks | 587 | * @param array $arr2 the target set of trigram ranks |
574 | * @param array $arr2 the target set of trigram ranks | 588 | * |
575 | * @return int the sum of the differences between the ranks of | 589 | * @return int the sum of the differences between the ranks of |
576 | * the two trigram sets | 590 | * the two trigram sets |
591 | * @access private | ||
577 | */ | 592 | */ |
578 | function _distance(&$arr1, &$arr2) | 593 | function _distance($arr1, $arr2) |
579 | { | 594 | { |
580 | $sumdist = 0; | 595 | $sumdist = 0; |
581 | 596 | ||
@@ -598,14 +613,15 @@ class Text_LanguageDetect | |||
598 | 613 | ||
599 | /** | 614 | /** |
600 | * Normalizes the score returned by _distance() | 615 | * Normalizes the score returned by _distance() |
601 | * | 616 | * |
602 | * Different if perl compatible or not | 617 | * Different if perl compatible or not |
603 | * | 618 | * |
604 | * @access private | 619 | * @param int $score the score from _distance() |
605 | * @param int $score the score from _distance() | 620 | * @param int $base_count the number of trigrams being considered |
606 | * @param int $base_count the number of trigrams being considered | 621 | * |
607 | * @return float the normalized score | 622 | * @return float the normalized score |
608 | * @see _distance() | 623 | * @see _distance() |
624 | * @access private | ||
609 | */ | 625 | */ |
610 | function _normalize_score($score, $base_count = null) | 626 | function _normalize_score($score, $base_count = null) |
611 | { | 627 | { |
@@ -630,29 +646,24 @@ class Text_LanguageDetect | |||
630 | * | 646 | * |
631 | * If perl compatible, the score is 300-0, 0 being most similar. | 647 | * If perl compatible, the score is 300-0, 0 being most similar. |
632 | * Otherwise, it's 0-1 with 1 being most similar. | 648 | * Otherwise, it's 0-1 with 1 being most similar. |
633 | * | 649 | * |
634 | * The $sample text should be at least a few sentences in length; | 650 | * The $sample text should be at least a few sentences in length; |
635 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension | 651 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension |
636 | * is present it will try to detect and convert. However, experience has | 652 | * is present it will try to detect and convert. However, experience has |
637 | * shown that mb_detect_encoding() *does not work very well* with at least | 653 | * shown that mb_detect_encoding() *does not work very well* with at least |
638 | * some types of encoding. | 654 | * some types of encoding. |
639 | * | 655 | * |
640 | * @access public | 656 | * @param string $sample a sample of text to compare. |
641 | * @param string $sample a sample of text to compare. | 657 | * @param int $limit if specified, return an array of the most likely |
642 | * @param int $limit if specified, return an array of the most likely | 658 | * $limit languages and their scores. |
643 | * $limit languages and their scores. | 659 | * |
644 | * @return mixed sorted array of language scores, blank array if no | 660 | * @return mixed sorted array of language scores, blank array if no |
645 | * useable text was found, or PEAR_Error if error | 661 | * useable text was found |
646 | * with the object setup | 662 | * @see _distance() |
647 | * @see _distance() | 663 | * @throws Text_LanguageDetect_Exception |
648 | * @throws PEAR_Error | ||
649 | */ | 664 | */ |
650 | function detect($sample, $limit = 0) | 665 | public function detect($sample, $limit = 0) |
651 | { | 666 | { |
652 | if (!$this->_setup_ok($err)) { | ||
653 | return $err; | ||
654 | } | ||
655 | |||
656 | // input check | 667 | // input check |
657 | if (!Text_LanguageDetect_Parser::validateString($sample)) { | 668 | if (!Text_LanguageDetect_Parser::validateString($sample)) { |
658 | return array(); | 669 | return array(); |
@@ -660,36 +671,27 @@ class Text_LanguageDetect | |||
660 | 671 | ||
661 | // check char encoding | 672 | // check char encoding |
662 | // (only if mbstring extension is compiled and PHP > 4.0.6) | 673 | // (only if mbstring extension is compiled and PHP > 4.0.6) |
663 | if (function_exists('mb_detect_encoding') | 674 | if (function_exists('mb_detect_encoding') |
664 | && function_exists('mb_convert_encoding')) { | 675 | && function_exists('mb_convert_encoding') |
665 | 676 | ) { | |
666 | // mb_detect_encoding isn't very reliable, to say the least | 677 | // mb_detect_encoding isn't very reliable, to say the least |
667 | // detection should still work with a sufficient sample of ascii characters | 678 | // detection should still work with a sufficient sample |
679 | // of ascii characters | ||
668 | $encoding = mb_detect_encoding($sample); | 680 | $encoding = mb_detect_encoding($sample); |
669 | 681 | ||
670 | // mb_detect_encoding() will return FALSE if detection fails | 682 | // mb_detect_encoding() will return FALSE if detection fails |
671 | // don't attempt conversion if that's the case | 683 | // don't attempt conversion if that's the case |
672 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { | 684 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' |
673 | 685 | && $encoding !== false | |
674 | if (function_exists('mb_list_encodings')) { | 686 | ) { |
675 | 687 | // verify the encoding exists in mb_list_encodings | |
676 | // verify the encoding exists in mb_list_encodings | 688 | if (in_array($encoding, mb_list_encodings())) { |
677 | if (in_array($encoding, mb_list_encodings())) { | 689 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); |
678 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
679 | } | ||
680 | |||
681 | // if the previous condition failed: | ||
682 | // somehow we detected an encoding that also we don't support | ||
683 | |||
684 | } else { | ||
685 | // php 4 doesnt have mb_list_encodings() | ||
686 | // so attempt with error suppression | ||
687 | $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
688 | } | 690 | } |
689 | } | 691 | } |
690 | } | 692 | } |
691 | 693 | ||
692 | $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); | 694 | $sample_obj = new Text_LanguageDetect_Parser($sample); |
693 | $sample_obj->prepareTrigram(); | 695 | $sample_obj->prepareTrigram(); |
694 | if ($this->_use_unicode_narrowing) { | 696 | if ($this->_use_unicode_narrowing) { |
695 | $sample_obj->prepareUnicode(); | 697 | $sample_obj->prepareUnicode(); |
@@ -713,7 +715,10 @@ class Text_LanguageDetect | |||
713 | if (is_array($blocks)) { | 715 | if (is_array($blocks)) { |
714 | $present_blocks = array_keys($blocks); | 716 | $present_blocks = array_keys($blocks); |
715 | } else { | 717 | } else { |
716 | throw new Exception('Error during block detection'); | 718 | throw new Text_LanguageDetect_Exception( |
719 | 'Error during block detection', | ||
720 | Text_LanguageDetect_Exception::BLOCK_DETECTION | ||
721 | ); | ||
717 | } | 722 | } |
718 | 723 | ||
719 | $possible_langs = array(); | 724 | $possible_langs = array(); |
@@ -731,30 +736,30 @@ class Text_LanguageDetect | |||
731 | } | 736 | } |
732 | 737 | ||
733 | // could also try an intersect operation rather than a union | 738 | // could also try an intersect operation rather than a union |
734 | // in other words, choose languages whose trigrams contain | 739 | // in other words, choose languages whose trigrams contain |
735 | // ALL of the unicode blocks found in this sample | 740 | // ALL of the unicode blocks found in this sample |
736 | // would improve speed but would be completely thrown off by an | 741 | // would improve speed but would be completely thrown off by an |
737 | // unexpected character, like an umlaut appearing in english text | 742 | // unexpected character, like an umlaut appearing in english text |
738 | 743 | ||
739 | $possible_langs = array_intersect( | 744 | $possible_langs = array_intersect( |
740 | array_keys($this->_lang_db), | 745 | array_keys($this->_lang_db), |
741 | array_unique($possible_langs) | 746 | array_unique($possible_langs) |
742 | ); | 747 | ); |
743 | 748 | ||
744 | // needs to intersect it with the keys of _lang_db in case | 749 | // needs to intersect it with the keys of _lang_db in case |
745 | // languages have been omitted | 750 | // languages have been omitted |
746 | 751 | ||
747 | // or just try 'em all | ||
748 | } else { | 752 | } else { |
753 | // or just try 'em all | ||
749 | $possible_langs = array_keys($this->_lang_db); | 754 | $possible_langs = array_keys($this->_lang_db); |
750 | } | 755 | } |
751 | 756 | ||
752 | 757 | ||
753 | foreach ($possible_langs as $lang) { | 758 | foreach ($possible_langs as $lang) { |
754 | $scores[$lang] = | 759 | $scores[$lang] = $this->_normalize_score( |
755 | $this->_normalize_score( | 760 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), |
756 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), | 761 | $trigram_count |
757 | $trigram_count); | 762 | ); |
758 | } | 763 | } |
759 | 764 | ||
760 | unset($sample_obj); | 765 | unset($sample_obj); |
@@ -772,7 +777,6 @@ class Text_LanguageDetect | |||
772 | $limited_scores = array(); | 777 | $limited_scores = array(); |
773 | 778 | ||
774 | $i = 0; | 779 | $i = 0; |
775 | |||
776 | foreach ($scores as $key => $value) { | 780 | foreach ($scores as $key => $value) { |
777 | if ($i++ >= $limit) { | 781 | if ($i++ >= $limit) { |
778 | break; | 782 | break; |
@@ -781,9 +785,9 @@ class Text_LanguageDetect | |||
781 | $limited_scores[$key] = $value; | 785 | $limited_scores[$key] = $value; |
782 | } | 786 | } |
783 | 787 | ||
784 | return $limited_scores; | 788 | return $this->_convertToNameMode($limited_scores, true); |
785 | } else { | 789 | } else { |
786 | return $scores; | 790 | return $this->_convertToNameMode($scores, true); |
787 | } | 791 | } |
788 | } | 792 | } |
789 | 793 | ||
@@ -791,35 +795,33 @@ class Text_LanguageDetect | |||
791 | * Returns only the most similar language to the text sample | 795 | * Returns only the most similar language to the text sample |
792 | * | 796 | * |
793 | * Calls $this->detect() and returns only the top result | 797 | * Calls $this->detect() and returns only the top result |
794 | * | 798 | * |
795 | * @access public | 799 | * @param string $sample text to detect the language of |
796 | * @param string $sample text to detect the language of | 800 | * |
797 | * @return string the name of the most likely language | 801 | * @return string the name of the most likely language |
798 | * or null if no language is similar | 802 | * or null if no language is similar |
799 | * @see detect() | 803 | * @see detect() |
800 | * @throws PEAR_Error | 804 | * @throws Text_LanguageDetect_Exception |
801 | */ | 805 | */ |
802 | function detectSimple($sample) | 806 | public function detectSimple($sample) |
803 | { | 807 | { |
804 | $scores = $this->detect($sample, 1); | 808 | $scores = $this->detect($sample, 1); |
805 | 809 | ||
806 | // if top language has the maximum possible score, | 810 | // if top language has the maximum possible score, |
807 | // then the top score will have been picked at random | 811 | // then the top score will have been picked at random |
808 | if ( !is_array($scores) | 812 | if (!is_array($scores) || empty($scores) |
809 | || empty($scores) | 813 | || current($scores) == $this->_max_score |
810 | || current($scores) == $this->_max_score) { | 814 | ) { |
811 | |||
812 | return null; | 815 | return null; |
813 | |||
814 | } else { | 816 | } else { |
815 | return ucfirst(key($scores)); | 817 | return key($scores); |
816 | } | 818 | } |
817 | } | 819 | } |
818 | 820 | ||
819 | /** | 821 | /** |
820 | * Returns an array containing the most similar language and a confidence | 822 | * Returns an array containing the most similar language and a confidence |
821 | * rating | 823 | * rating |
822 | * | 824 | * |
823 | * Confidence is a simple measure calculated from the similarity score | 825 | * Confidence is a simple measure calculated from the similarity score |
824 | * minus the similarity score from the next most similar language | 826 | * minus the similarity score from the next most similar language |
825 | * divided by the highest possible score. Languages that have closely | 827 | * divided by the highest possible score. Languages that have closely |
@@ -827,46 +829,43 @@ class Text_LanguageDetect | |||
827 | * confidence scores. | 829 | * confidence scores. |
828 | * | 830 | * |
829 | * The similarity score answers the question "How likely is the text the | 831 | * The similarity score answers the question "How likely is the text the |
830 | * returned language regardless of the other languages considered?" The | 832 | * returned language regardless of the other languages considered?" The |
831 | * confidence score is one way of answering the question "how likely is the | 833 | * confidence score is one way of answering the question "how likely is the |
832 | * text the detected language relative to the rest of the language model | 834 | * text the detected language relative to the rest of the language model |
833 | * set?" | 835 | * set?" |
834 | * | 836 | * |
835 | * To see how similar languages are a priori, see languageSimilarity() | 837 | * To see how similar languages are a priori, see languageSimilarity() |
836 | * | 838 | * |
837 | * @access public | 839 | * @param string $sample text for which language will be detected |
838 | * @param string $sample text for which language will be detected | 840 | * |
839 | * @return array most similar language, score and confidence rating | 841 | * @return array most similar language, score and confidence rating |
840 | * or null if no language is similar | 842 | * or null if no language is similar |
841 | * @see detect() | 843 | * @see detect() |
842 | * @throws PEAR_Error | 844 | * @throws Text_LanguageDetect_Exception |
843 | */ | 845 | */ |
844 | function detectConfidence($sample) | 846 | public function detectConfidence($sample) |
845 | { | 847 | { |
846 | $scores = $this->detect($sample, 2); | 848 | $scores = $this->detect($sample, 2); |
847 | 849 | ||
848 | // if most similar language has the max score, it | 850 | // if most similar language has the max score, it |
849 | // will have been picked at random | 851 | // will have been picked at random |
850 | if ( !is_array($scores) | 852 | if (!is_array($scores) || empty($scores) |
851 | || empty($scores) | 853 | || current($scores) == $this->_max_score |
852 | || current($scores) == $this->_max_score) { | 854 | ) { |
853 | |||
854 | return null; | 855 | return null; |
855 | } | 856 | } |
856 | 857 | ||
857 | $arr['language'] = ucfirst(key($scores)); | 858 | $arr['language'] = key($scores); |
858 | $arr['similarity'] = current($scores); | 859 | $arr['similarity'] = current($scores); |
859 | if (next($scores) !== false) { // if false then no next element | 860 | if (next($scores) !== false) { // if false then no next element |
860 | // the goal is to return a higher value if the distance between | 861 | // the goal is to return a higher value if the distance between |
861 | // the similarity of the first score and the second score is high | 862 | // the similarity of the first score and the second score is high |
862 | 863 | ||
863 | if ($this->_perl_compatible) { | 864 | if ($this->_perl_compatible) { |
864 | 865 | $arr['confidence'] = (current($scores) - $arr['similarity']) | |
865 | $arr['confidence'] = | 866 | / $this->_max_score; |
866 | (current($scores) - $arr['similarity']) / $this->_max_score; | ||
867 | 867 | ||
868 | } else { | 868 | } else { |
869 | |||
870 | $arr['confidence'] = $arr['similarity'] - current($scores); | 869 | $arr['confidence'] = $arr['similarity'] - current($scores); |
871 | 870 | ||
872 | } | 871 | } |
@@ -882,32 +881,26 @@ class Text_LanguageDetect | |||
882 | * Returns the distribution of unicode blocks in a given utf8 string | 881 | * Returns the distribution of unicode blocks in a given utf8 string |
883 | * | 882 | * |
884 | * For the block name of a single char, use unicodeBlockName() | 883 | * For the block name of a single char, use unicodeBlockName() |
885 | * | 884 | * |
886 | * @access public | 885 | * @param string $str input string. Must be ascii or utf8 |
887 | * @param string $str input string. Must be ascii or utf8 | 886 | * @param bool $skip_symbols if true, skip ascii digits, symbols and |
888 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | 887 | * non-printing characters. Includes spaces, |
889 | * non-printing characters. Includes spaces, | 888 | * newlines and common punctutation characters. |
890 | * newlines and common punctutation characters. | 889 | * |
891 | * @return array | 890 | * @return array |
892 | * @throws PEAR_Error | 891 | * @throws Text_LanguageDetect_Exception |
893 | */ | 892 | */ |
894 | function detectUnicodeBlocks($str, $skip_symbols) | 893 | public function detectUnicodeBlocks($str, $skip_symbols) |
895 | { | 894 | { |
896 | // input check | 895 | $skip_symbols = (bool)$skip_symbols; |
897 | if (!is_bool($skip_symbols)) { | 896 | $str = (string)$str; |
898 | throw new Exception('Second parameter must be boolean'); | ||
899 | } | ||
900 | |||
901 | if (!is_string($str)) { | ||
902 | throw new Exception('First parameter was not a string'); | ||
903 | } | ||
904 | 897 | ||
905 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 898 | $sample_obj = new Text_LanguageDetect_Parser($str); |
906 | $sample_obj->prepareUnicode(); | 899 | $sample_obj->prepareUnicode(); |
907 | $sample_obj->prepareTrigram(false); | 900 | $sample_obj->prepareTrigram(false); |
908 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); | 901 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); |
909 | $sample_obj->analyze(); | 902 | $sample_obj->analyze(); |
910 | $blocks =& $sample_obj->getUnicodeBlocks(); | 903 | $blocks = $sample_obj->getUnicodeBlocks(); |
911 | unset($sample_obj); | 904 | unset($sample_obj); |
912 | return $blocks; | 905 | return $blocks; |
913 | } | 906 | } |
@@ -915,38 +908,37 @@ class Text_LanguageDetect | |||
915 | /** | 908 | /** |
916 | * Returns the block name for a given unicode value | 909 | * Returns the block name for a given unicode value |
917 | * | 910 | * |
918 | * If passed a string, will assume it is being passed a UTF8-formatted | 911 | * If passed a string, will assume it is being passed a UTF8-formatted |
919 | * character and will automatically convert. Otherwise it will assume it | 912 | * character and will automatically convert. Otherwise it will assume it |
920 | * is being passed a numeric unicode value. | 913 | * is being passed a numeric unicode value. |
921 | * | 914 | * |
922 | * Make sure input is of the correct type! | 915 | * Make sure input is of the correct type! |
923 | * | 916 | * |
924 | * @access public | ||
925 | * @param mixed $unicode unicode value or utf8 char | 917 | * @param mixed $unicode unicode value or utf8 char |
918 | * | ||
926 | * @return mixed the block name string or false if not found | 919 | * @return mixed the block name string or false if not found |
927 | * @throws PEAR_Error | 920 | * @throws Text_LanguageDetect_Exception |
928 | */ | 921 | */ |
929 | function unicodeBlockName($unicode) { | 922 | public function unicodeBlockName($unicode) |
923 | { | ||
930 | if (is_string($unicode)) { | 924 | if (is_string($unicode)) { |
931 | // assume it is being passed a utf8 char, so convert it | 925 | // assume it is being passed a utf8 char, so convert it |
932 | 926 | if (self::utf8strlen($unicode) > 1) { | |
933 | // input check | 927 | throw new Text_LanguageDetect_Exception( |
934 | if ($this->utf8strlen($unicode) > 1) { | 928 | 'Pass a single char only to this method', |
935 | throw new Exception('Pass this function only a single char'); | 929 | Text_LanguageDetect_Exception::PARAM_TYPE |
930 | ); | ||
936 | } | 931 | } |
937 | |||
938 | $unicode = $this->_utf8char2unicode($unicode); | 932 | $unicode = $this->_utf8char2unicode($unicode); |
939 | 933 | ||
940 | if ($unicode == -1) { | ||
941 | throw new Exception('Malformatted char'); | ||
942 | } | ||
943 | |||
944 | // input check | ||
945 | } elseif (!is_int($unicode)) { | 934 | } elseif (!is_int($unicode)) { |
946 | throw new Exception('Input must be of type string or int.'); | 935 | throw new Text_LanguageDetect_Exception( |
936 | 'Input must be of type string or int.', | ||
937 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
938 | ); | ||
947 | } | 939 | } |
948 | 940 | ||
949 | $blocks =& $this->_read_unicode_block_db(); | 941 | $blocks = $this->_read_unicode_block_db(); |
950 | 942 | ||
951 | $result = $this->_unicode_block_name($unicode, $blocks); | 943 | $result = $this->_unicode_block_name($unicode, $blocks); |
952 | 944 | ||
@@ -964,14 +956,17 @@ class Text_LanguageDetect | |||
964 | * the public interface for this function, which does input checks which | 956 | * the public interface for this function, which does input checks which |
965 | * this function omits for speed. | 957 | * this function omits for speed. |
966 | * | 958 | * |
967 | * @access protected | 959 | * @param int $unicode the unicode value |
968 | * @param int $unicode the unicode value | 960 | * @param array $blocks the block database |
969 | * @param array &$blocks the block database | 961 | * @param int $block_count the number of defined blocks in the database |
970 | * @param int $block_count the number of defined blocks in the database | 962 | * |
971 | * @see unicodeBlockName() | 963 | * @return mixed Block name, -1 if it failed |
964 | * @see unicodeBlockName() | ||
965 | * @access protected | ||
972 | */ | 966 | */ |
973 | function _unicode_block_name($unicode, &$blocks, $block_count = -1) { | 967 | function _unicode_block_name($unicode, $blocks, $block_count = -1) |
974 | // for a reference, see | 968 | { |
969 | // for a reference, see | ||
975 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt | 970 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt |
976 | 971 | ||
977 | // assume that ascii characters are the most common | 972 | // assume that ascii characters are the most common |
@@ -994,35 +989,36 @@ class Text_LanguageDetect | |||
994 | while ($low <= $high) { | 989 | while ($low <= $high) { |
995 | $mid = floor(($low + $high) / 2); | 990 | $mid = floor(($low + $high) / 2); |
996 | 991 | ||
997 | // if it's lower than the lower bound | ||
998 | if ($unicode < $blocks[$mid][0]) { | 992 | if ($unicode < $blocks[$mid][0]) { |
993 | // if it's lower than the lower bound | ||
999 | $high = $mid - 1; | 994 | $high = $mid - 1; |
1000 | 995 | ||
1001 | // if it's higher than the upper bound | ||
1002 | } elseif ($unicode > $blocks[$mid][1]) { | 996 | } elseif ($unicode > $blocks[$mid][1]) { |
997 | // if it's higher than the upper bound | ||
1003 | $low = $mid + 1; | 998 | $low = $mid + 1; |
1004 | 999 | ||
1005 | // found it | ||
1006 | } else { | 1000 | } else { |
1001 | // found it | ||
1007 | return $blocks[$mid]; | 1002 | return $blocks[$mid]; |
1008 | } | 1003 | } |
1009 | } | 1004 | } |
1010 | 1005 | ||
1011 | // failed to find the block | 1006 | // failed to find the block |
1012 | return -1; | 1007 | return -1; |
1013 | 1008 | ||
1014 | // todo: differentiate when it's out of range or when it falls | 1009 | // todo: differentiate when it's out of range or when it falls |
1015 | // into an unassigned range? | 1010 | // into an unassigned range? |
1016 | } | 1011 | } |
1017 | 1012 | ||
1018 | /** | 1013 | /** |
1019 | * Brings up the unicode block database | 1014 | * Brings up the unicode block database |
1020 | * | 1015 | * |
1021 | * @access protected | ||
1022 | * @return array the database of unicode block definitions | 1016 | * @return array the database of unicode block definitions |
1023 | * @throws PEAR_Error | 1017 | * @throws Text_LanguageDetect_Exception |
1018 | * @access protected | ||
1024 | */ | 1019 | */ |
1025 | function &_read_unicode_block_db() { | 1020 | function _read_unicode_block_db() |
1021 | { | ||
1026 | // since the unicode definitions are always going to be the same, | 1022 | // since the unicode definitions are always going to be the same, |
1027 | // might as well share the memory for the db with all other instances | 1023 | // might as well share the memory for the db with all other instances |
1028 | // of this class | 1024 | // of this class |
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect | |||
1037 | 1033 | ||
1038 | /** | 1034 | /** |
1039 | * Calculate the similarities between the language models | 1035 | * Calculate the similarities between the language models |
1040 | * | 1036 | * |
1041 | * Use this function to see how similar languages are to each other. | 1037 | * Use this function to see how similar languages are to each other. |
1042 | * | 1038 | * |
1043 | * If passed 2 language names, will return just those languages compared. | 1039 | * If passed 2 language names, will return just those languages compared. |
1044 | * If passed 1 language name, will return that language compared to | 1040 | * If passed 1 language name, will return that language compared to |
1045 | * all others. | 1041 | * all others. |
1046 | * If passed none, will return an array of every language model compared | 1042 | * If passed none, will return an array of every language model compared |
1047 | * to every other one. | 1043 | * to every other one. |
1048 | * | 1044 | * |
1049 | * @access public | 1045 | * @param string $lang1 the name of the first language to be compared |
1050 | * @param string $lang1 the name of the first language to be compared | 1046 | * @param string $lang2 the name of the second language to be compared |
1051 | * @param string $lang2 the name of the second language to be compared | 1047 | * |
1052 | * @return array scores of every language compared | 1048 | * @return array scores of every language compared |
1053 | * or the score of just the provided languages | 1049 | * or the score of just the provided languages |
1054 | * or null if one of the supplied languages does not exist | 1050 | * or null if one of the supplied languages does not exist |
1055 | * @throws PEAR_Error | 1051 | * @throws Text_LanguageDetect_Exception |
1056 | */ | 1052 | */ |
1057 | function languageSimilarity($lang1 = null, $lang2 = null) | 1053 | public function languageSimilarity($lang1 = null, $lang2 = null) |
1058 | { | 1054 | { |
1059 | if (!$this->_setup_ok($err)) { | 1055 | $lang1 = $this->_convertFromNameMode($lang1); |
1060 | return $err; | 1056 | $lang2 = $this->_convertFromNameMode($lang2); |
1061 | } | ||
1062 | |||
1063 | if ($lang1 != null) { | 1057 | if ($lang1 != null) { |
1064 | $lang1 = strtolower($lang1); | 1058 | $lang1 = strtolower($lang1); |
1065 | 1059 | ||
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect | |||
1069 | } | 1063 | } |
1070 | 1064 | ||
1071 | if ($lang2 != null) { | 1065 | if ($lang2 != null) { |
1072 | 1066 | if (!isset($this->_lang_db[$lang2])) { | |
1073 | // can't only set the second param | 1067 | // check if language model exists |
1074 | if ($lang1 == null) { | ||
1075 | return null; | ||
1076 | // check if language model exists | ||
1077 | } elseif (!isset($this->_lang_db[$lang2])) { | ||
1078 | return null; | 1068 | return null; |
1079 | } | 1069 | } |
1080 | 1070 | ||
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect | |||
1088 | ) | 1078 | ) |
1089 | ); | 1079 | ); |
1090 | 1080 | ||
1091 | |||
1092 | // compare just $lang1 to all languages | ||
1093 | } else { | 1081 | } else { |
1082 | // compare just $lang1 to all languages | ||
1094 | $return_arr = array(); | 1083 | $return_arr = array(); |
1095 | foreach ($this->_lang_db as $key => $value) { | 1084 | foreach ($this->_lang_db as $key => $value) { |
1096 | if ($key != $lang1) { // don't compare a language to itself | 1085 | if ($key != $lang1) { |
1086 | // don't compare a language to itself | ||
1097 | $return_arr[$key] = $this->_normalize_score( | 1087 | $return_arr[$key] = $this->_normalize_score( |
1098 | $this->_distance($this->_lang_db[$lang1], $value)); | 1088 | $this->_distance($this->_lang_db[$lang1], $value) |
1089 | ); | ||
1099 | } | 1090 | } |
1100 | } | 1091 | } |
1101 | asort($return_arr); | 1092 | asort($return_arr); |
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect | |||
1104 | } | 1095 | } |
1105 | 1096 | ||
1106 | 1097 | ||
1107 | // compare all languages to each other | ||
1108 | } else { | 1098 | } else { |
1099 | // compare all languages to each other | ||
1109 | $return_arr = array(); | 1100 | $return_arr = array(); |
1110 | foreach (array_keys($this->_lang_db) as $lang1) { | 1101 | foreach (array_keys($this->_lang_db) as $lang1) { |
1111 | foreach (array_keys($this->_lang_db) as $lang2) { | 1102 | foreach (array_keys($this->_lang_db) as $lang2) { |
1112 | |||
1113 | // skip comparing languages to themselves | 1103 | // skip comparing languages to themselves |
1114 | if ($lang1 != $lang2) { | 1104 | if ($lang1 != $lang2) { |
1115 | |||
1116 | // don't re-calculate what's already been done | ||
1117 | if (isset($return_arr[$lang2][$lang1])) { | ||
1118 | 1105 | ||
1119 | $return_arr[$lang1][$lang2] = | 1106 | if (isset($return_arr[$lang2][$lang1])) { |
1120 | $return_arr[$lang2][$lang1]; | 1107 | // don't re-calculate what's already been done |
1108 | $return_arr[$lang1][$lang2] | ||
1109 | = $return_arr[$lang2][$lang1]; | ||
1121 | 1110 | ||
1122 | // calculate | ||
1123 | } else { | 1111 | } else { |
1124 | 1112 | // calculate | |
1125 | $return_arr[$lang1][$lang2] = | 1113 | $return_arr[$lang1][$lang2] |
1126 | $this->_normalize_score( | 1114 | = $this->_normalize_score( |
1127 | $this->_distance( | 1115 | $this->_distance( |
1128 | $this->_lang_db[$lang1], | 1116 | $this->_lang_db[$lang1], |
1129 | $this->_lang_db[$lang2] | 1117 | $this->_lang_db[$lang2] |
1130 | ) | 1118 | ) |
1131 | ); | 1119 | ); |
1132 | 1120 | ||
1133 | } | 1121 | } |
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect | |||
1150 | * | 1138 | * |
1151 | * @access public | 1139 | * @access public |
1152 | * @return array language cluster data | 1140 | * @return array language cluster data |
1153 | * @throws PEAR_Error | 1141 | * @throws Text_LanguageDetect_Exception |
1154 | * @see languageSimilarity() | 1142 | * @see languageSimilarity() |
1155 | * @deprecated this function will eventually be removed and placed into | 1143 | * @deprecated this function will eventually be removed and placed into |
1156 | * the model generation class | 1144 | * the model generation class |
1157 | */ | 1145 | */ |
1158 | function clusterLanguages() | 1146 | function clusterLanguages() |
1159 | { | 1147 | { |
1160 | // todo: set the maximum number of clusters | 1148 | // todo: set the maximum number of clusters |
1161 | |||
1162 | // setup check | ||
1163 | if (!$this->_setup_ok($err)) { | ||
1164 | return $err; | ||
1165 | } | ||
1166 | |||
1167 | // return cached result, if any | 1149 | // return cached result, if any |
1168 | if (isset($this->_clusters)) { | 1150 | if (isset($this->_clusters)) { |
1169 | return $this->_clusters; | 1151 | return $this->_clusters; |
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect | |||
1177 | 1159 | ||
1178 | foreach ($langs as $lang) { | 1160 | foreach ($langs as $lang) { |
1179 | if (!isset($this->_lang_db[$lang])) { | 1161 | if (!isset($this->_lang_db[$lang])) { |
1180 | throw new Exception("missing $lang!\n"); | 1162 | throw new Text_LanguageDetect_Exception( |
1163 | "missing $lang!", | ||
1164 | Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE | ||
1165 | ); | ||
1181 | } | 1166 | } |
1182 | } | 1167 | } |
1183 | 1168 | ||
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect | |||
1186 | $langs[$lang1] = $lang1; | 1171 | $langs[$lang1] = $lang1; |
1187 | unset($langs[$old_key]); | 1172 | unset($langs[$old_key]); |
1188 | } | 1173 | } |
1189 | 1174 | ||
1175 | $result_data = $really_map = array(); | ||
1176 | |||
1190 | $i = 0; | 1177 | $i = 0; |
1191 | while (count($langs) > 2 && $i++ < 200) { | 1178 | while (count($langs) > 2 && $i++ < 200) { |
1192 | $highest_score = -1; | 1179 | $highest_score = -1; |
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect | |||
1194 | $highest_key2 = ''; | 1181 | $highest_key2 = ''; |
1195 | foreach ($langs as $lang1) { | 1182 | foreach ($langs as $lang1) { |
1196 | foreach ($langs as $lang2) { | 1183 | foreach ($langs as $lang2) { |
1197 | if ( $lang1 != $lang2 | 1184 | if ($lang1 != $lang2 |
1198 | && $arr[$lang1][$lang2] > $highest_score) { | 1185 | && $arr[$lang1][$lang2] > $highest_score |
1186 | ) { | ||
1199 | $highest_score = $arr[$lang1][$lang2]; | 1187 | $highest_score = $arr[$lang1][$lang2]; |
1200 | $highest_key1 = $lang1; | 1188 | $highest_key1 = $lang1; |
1201 | $highest_key2 = $lang2; | 1189 | $highest_key2 = $lang2; |
1202 | } | 1190 | } |
1203 | } | 1191 | } |
1204 | } | 1192 | } |
1205 | 1193 | ||
1206 | if (!$highest_key1) { | 1194 | if (!$highest_key1) { |
1207 | // should not ever happen | 1195 | // should not ever happen |
1208 | throw new Exception("no highest key? (step: $i)"); | 1196 | throw new Text_LanguageDetect_Exception( |
1197 | "no highest key? (step: $i)", | ||
1198 | Text_LanguageDetect_Exception::NO_HIGHEST_KEY | ||
1199 | ); | ||
1209 | } | 1200 | } |
1210 | 1201 | ||
1211 | if ($highest_score == 0) { | 1202 | if ($highest_score == 0) { |
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect | |||
1217 | $sum1 = array_sum($arr[$highest_key1]); | 1208 | $sum1 = array_sum($arr[$highest_key1]); |
1218 | $sum2 = array_sum($arr[$highest_key2]); | 1209 | $sum2 = array_sum($arr[$highest_key2]); |
1219 | 1210 | ||
1220 | // use the score for the one that is most similar to the rest of | 1211 | // use the score for the one that is most similar to the rest of |
1221 | // the field as the score for the group | 1212 | // the field as the score for the group |
1222 | // todo: could try averaging or "centroid" method instead | 1213 | // todo: could try averaging or "centroid" method instead |
1223 | // seems like that might make more sense | 1214 | // seems like that might make more sense |
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect | |||
1248 | $really_lang = $replaceme; | 1239 | $really_lang = $replaceme; |
1249 | while (isset($really_map[$really_lang])) { | 1240 | while (isset($really_map[$really_lang])) { |
1250 | $really_lang = $really_map[$really_lang]; | 1241 | $really_lang = $really_map[$really_lang]; |
1251 | } | 1242 | } |
1252 | $really_map[$newkey] = $really_lang; | 1243 | $really_map[$newkey] = $really_lang; |
1253 | 1244 | ||
1254 | 1245 | ||
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect | |||
1259 | $arr[$key1][$newkey] = $arr[$key1][$key2]; | 1250 | $arr[$key1][$newkey] = $arr[$key1][$key2]; |
1260 | unset($arr[$key1][$key2]); | 1251 | unset($arr[$key1][$key2]); |
1261 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] | 1252 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] |
1262 | } | 1253 | } |
1263 | 1254 | ||
1264 | if ($key1 == $replaceme) { | 1255 | if ($key1 == $replaceme) { |
1265 | $arr[$newkey][$key2] = $arr[$key1][$key2]; | 1256 | $arr[$newkey][$key2] = $arr[$key1][$key2]; |
1266 | unset($arr[$key1][$key2]); | 1257 | unset($arr[$key1][$key2]); |
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect | |||
1273 | } | 1264 | } |
1274 | } | 1265 | } |
1275 | } | 1266 | } |
1276 | 1267 | ||
1277 | 1268 | ||
1278 | unset($langs[$highest_key1]); | 1269 | unset($langs[$highest_key1]); |
1279 | unset($langs[$highest_key2]); | 1270 | unset($langs[$highest_key2]); |
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect | |||
1293 | } | 1284 | } |
1294 | 1285 | ||
1295 | $return_val = array( | 1286 | $return_val = array( |
1296 | 'open_forks' => $langs, | 1287 | 'open_forks' => $langs, |
1297 | // the top level of clusters | 1288 | // the top level of clusters |
1298 | // clusters that are mutually exclusive | 1289 | // clusters that are mutually exclusive |
1299 | // or specified by a specific maximum | 1290 | // or specified by a specific maximum |
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect | |||
1323 | * use, and it may disappear or its functionality may change in future | 1314 | * use, and it may disappear or its functionality may change in future |
1324 | * releases without notice. | 1315 | * releases without notice. |
1325 | * | 1316 | * |
1326 | * This compares the sample text to top the top level of clusters. If the | 1317 | * This compares the sample text to top the top level of clusters. If the |
1327 | * sample is similar to the cluster it will drop down and compare it to the | 1318 | * sample is similar to the cluster it will drop down and compare it to the |
1328 | * languages in the cluster, and so on until it hits a leaf node. | 1319 | * languages in the cluster, and so on until it hits a leaf node. |
1329 | * | 1320 | * |
1330 | * this should find the language in considerably fewer compares | 1321 | * this should find the language in considerably fewer compares |
1331 | * (the equivalent of a binary search), however clusterLanguages() is costly | 1322 | * (the equivalent of a binary search), however clusterLanguages() is costly |
1332 | * and the loss of accuracy from this technique is significant. | 1323 | * and the loss of accuracy from this technique is significant. |
1333 | * | 1324 | * |
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect | |||
1337 | * was very large, however in such cases some method of Bayesian inference | 1328 | * was very large, however in such cases some method of Bayesian inference |
1338 | * might be more helpful. | 1329 | * might be more helpful. |
1339 | * | 1330 | * |
1340 | * @see clusterLanguages() | 1331 | * @param string $str input string |
1341 | * @access public | 1332 | * |
1342 | * @param string $str input string | 1333 | * @return array language scores (only those compared) |
1343 | * @return array language scores (only those compared) | 1334 | * @throws Text_LanguageDetect_Exception |
1344 | * @throws PEAR_Error | 1335 | * @see clusterLanguages() |
1345 | */ | 1336 | */ |
1346 | function clusteredSearch($str) | 1337 | public function clusteredSearch($str) |
1347 | { | 1338 | { |
1348 | |||
1349 | // input check | 1339 | // input check |
1350 | if (!Text_LanguageDetect_Parser::validateString($str)) { | 1340 | if (!Text_LanguageDetect_Parser::validateString($str)) { |
1351 | return array(); | 1341 | return array(); |
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect | |||
1359 | $dendogram_data = $result['fork_data']; | 1349 | $dendogram_data = $result['fork_data']; |
1360 | $dendogram_alias = $result['name_map']; | 1350 | $dendogram_alias = $result['name_map']; |
1361 | 1351 | ||
1362 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 1352 | $sample_obj = new Text_LanguageDetect_Parser($str); |
1363 | $sample_obj->prepareTrigram(); | 1353 | $sample_obj->prepareTrigram(); |
1364 | $sample_obj->setPadStart(!$this->_perl_compatible); | 1354 | $sample_obj->setPadStart(!$this->_perl_compatible); |
1365 | $sample_obj->analyze(); | 1355 | $sample_obj->analyze(); |
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect | |||
1372 | } | 1362 | } |
1373 | 1363 | ||
1374 | $i = 0; // counts the number of steps | 1364 | $i = 0; // counts the number of steps |
1375 | 1365 | ||
1376 | foreach ($dendogram_start as $lang) { | 1366 | foreach ($dendogram_start as $lang) { |
1377 | if (isset($dendogram_alias[$lang])) { | 1367 | if (isset($dendogram_alias[$lang])) { |
1378 | $lang_key = $dendogram_alias[$lang]; | 1368 | $lang_key = $dendogram_alias[$lang]; |
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect | |||
1382 | 1372 | ||
1383 | $scores[$lang] = $this->_normalize_score( | 1373 | $scores[$lang] = $this->_normalize_score( |
1384 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1374 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1385 | $sample_count); | 1375 | $sample_count |
1376 | ); | ||
1386 | 1377 | ||
1387 | $i++; | 1378 | $i++; |
1388 | } | 1379 | } |
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect | |||
1411 | 1402 | ||
1412 | $scores[$lang] = $this->_normalize_score( | 1403 | $scores[$lang] = $this->_normalize_score( |
1413 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1404 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1414 | $sample_count); | 1405 | $sample_count |
1406 | ); | ||
1415 | 1407 | ||
1416 | //todo: does not need to do same comparison again | 1408 | //todo: does not need to do same comparison again |
1417 | } | 1409 | } |
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect | |||
1428 | 1420 | ||
1429 | $diff = $scores[$cur_key] - $scores[$loser_key]; | 1421 | $diff = $scores[$cur_key] - $scores[$loser_key]; |
1430 | 1422 | ||
1431 | // $cur_key ({$dendogram_alias[$cur_key]}) wins | 1423 | // $cur_key ({$dendogram_alias[$cur_key]}) wins |
1432 | // over $loser_key ({$dendogram_alias[$loser_key]}) | 1424 | // over $loser_key ({$dendogram_alias[$loser_key]}) |
1433 | // with a difference of $diff | 1425 | // with a difference of $diff |
1434 | } | 1426 | } |
1435 | 1427 | ||
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect | |||
1439 | // which paths the algorithm decided to take along the tree | 1431 | // which paths the algorithm decided to take along the tree |
1440 | 1432 | ||
1441 | // but sometimes the last item is only the second highest | 1433 | // but sometimes the last item is only the second highest |
1442 | if ( ($this->_perl_compatible && (end($scores) > prev($scores))) | 1434 | if (($this->_perl_compatible && (end($scores) > prev($scores))) |
1443 | || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { | 1435 | || (!$this->_perl_compatible && (end($scores) < prev($scores))) |
1444 | 1436 | ) { | |
1445 | $real_last_score = current($scores); | 1437 | $real_last_score = current($scores); |
1446 | $real_last_key = key($scores); | 1438 | $real_last_key = key($scores); |
1447 | 1439 | ||
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect | |||
1449 | unset($scores[$real_last_key]); | 1441 | unset($scores[$real_last_key]); |
1450 | $scores[$real_last_key] = $real_last_score; | 1442 | $scores[$real_last_key] = $real_last_score; |
1451 | } | 1443 | } |
1452 | 1444 | ||
1453 | 1445 | ||
1454 | if (!$this->_perl_compatible) { | 1446 | if (!$this->_perl_compatible) { |
1455 | $scores = array_reverse($scores, true); | 1447 | $scores = array_reverse($scores, true); |
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect | |||
1464 | * | 1456 | * |
1465 | * Returns the numbers of characters (not bytes) in a utf8 string | 1457 | * Returns the numbers of characters (not bytes) in a utf8 string |
1466 | * | 1458 | * |
1467 | * @static | 1459 | * @param string $str string to get the length of |
1468 | * @access public | 1460 | * |
1469 | * @param string $str string to get the length of | 1461 | * @return int number of chars |
1470 | * @return int number of chars | ||
1471 | */ | 1462 | */ |
1472 | function utf8strlen($str) | 1463 | public static function utf8strlen($str) |
1473 | { | 1464 | { |
1474 | // utf8_decode() will convert unknown chars to '?', which is actually | 1465 | // utf8_decode() will convert unknown chars to '?', which is actually |
1475 | // ideal for counting. | 1466 | // ideal for counting. |
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect | |||
1482 | /** | 1473 | /** |
1483 | * Returns the unicode value of a utf8 char | 1474 | * Returns the unicode value of a utf8 char |
1484 | * | 1475 | * |
1485 | * @access protected | 1476 | * @param string $char a utf8 (possibly multi-byte) char |
1486 | * @param string $char a utf8 (possibly multi-byte) char | 1477 | * |
1487 | * @return int unicode value or -1 if malformatted | 1478 | * @return int unicode value |
1479 | * @access protected | ||
1480 | * @link http://en.wikipedia.org/wiki/UTF-8 | ||
1488 | */ | 1481 | */ |
function _utf8char2unicode($char)
{
    // Decodes a single UTF-8 encoded character (1 to 4 bytes) into its
    // Unicode code point.
    //
    // strlen() is byte-based, so here it yields the length of the encoded
    // sequence. A string of any other byte length (including the empty
    // string) matches no case; the function then falls off the end and
    // returns null.
    //
    // Bytes are read with [] offsets: the {} string-offset syntax is
    // deprecated since PHP 7.4 and is a parse error in PHP 8.
    switch (strlen($char)) {
    case 1:
        // normal ASCII-7 byte
        // 0xxxxxxx --> 0xxxxxxx
        return ord($char[0]);

    case 2:
        // 2 byte unicode
        // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
        $z = (ord($char[0]) & 0x000001F) << 6;
        $x = (ord($char[1]) & 0x0000003F);
        return ($z | $x);

    case 3:
        // 3 byte unicode
        // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
        $z  = (ord($char[0]) & 0x0000000F) << 12;
        $x1 = (ord($char[1]) & 0x0000003F) << 6;
        $x2 = (ord($char[2]) & 0x0000003F);
        return ($z | $x1 | $x2);

    case 4:
        // 4 byte unicode
        // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
        // 000zzzzz xxxxxxxx xxxxxxxx
        $z1 = (ord($char[0]) & 0x00000007) << 18;
        $z2 = (ord($char[1]) & 0x0000003F) << 12;
        $x1 = (ord($char[2]) & 0x0000003F) << 6;
        $x2 = (ord($char[3]) & 0x0000003F);
        return ($z1 | $z2 | $x1 | $x2);
    }
}
1534 | 1517 | ||
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect | |||
1536 | * utf8-safe fast character iterator | 1519 | * utf8-safe fast character iterator |
1537 | * | 1520 | * |
1538 | * Will get the next character starting from $counter, which will then be | 1521 | * Will get the next character starting from $counter, which will then be |
1539 | * incremented. If a multi-byte char the bytes will be concatenated and | 1522 | * incremented. If a multi-byte char the bytes will be concatenated and |
1540 | * $counter will be incremented by the number of bytes in the char. | 1523 | * $counter will be incremented by the number of bytes in the char. |
1541 | * | 1524 | * |
1542 | * @access private | 1525 | * @param string $str the string being iterated over |
1543 | * @param string &$str the string being iterated over | 1526 | * @param int &$counter the iterator, will increment by reference |
1544 | * @param int &$counter the iterator, will increment by reference | 1527 | * @param bool $special_convert whether to do special conversions |
1545 | * @param bool $special_convert whether to do special conversions | 1528 | * |
1546 | * @return char the next (possibly multi-byte) char from $counter | 1529 | * @return char the next (possibly multi-byte) char from $counter |
1530 | * @access private | ||
1547 | */ | 1531 | */ |
1548 | function _next_char(&$str, &$counter, $special_convert = false) | 1532 | static function _next_char($str, &$counter, $special_convert = false) |
1549 | { | 1533 | { |
1550 | |||
1551 | $char = $str{$counter++}; | 1534 | $char = $str{$counter++}; |
1552 | $ord = ord($char); | 1535 | $ord = ord($char); |
1553 | 1536 | ||
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect | |||
1556 | 1539 | ||
1557 | // normal ascii one byte char | 1540 | // normal ascii one byte char |
1558 | if ($ord <= 127) { | 1541 | if ($ord <= 127) { |
1559 | |||
1560 | // special conversions needed for this package | 1542 | // special conversions needed for this package |
1561 | // (that only apply to regular ascii characters) | 1543 | // (that only apply to regular ascii characters) |
1562 | // lower case, and convert all non-alphanumeric characters | 1544 | // lower case, and convert all non-alphanumeric characters |
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect | |||
1571 | 1553 | ||
1572 | return $char; | 1554 | return $char; |
1573 | 1555 | ||
1574 | // multi-byte chars | ||
1575 | } elseif ($ord >> 5 == 6) { // two-byte char | 1556 | } elseif ($ord >> 5 == 6) { // two-byte char |
1557 | // multi-byte chars | ||
1576 | $nextchar = $str{$counter++}; // get next byte | 1558 | $nextchar = $str{$counter++}; // get next byte |
1577 | 1559 | ||
1578 | // lower-casing of non-ascii characters is still incomplete | 1560 | // lower-casing of non-ascii characters is still incomplete |
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect | |||
1582 | if ($ord == 195) { | 1564 | if ($ord == 195) { |
1583 | $nextord = ord($nextchar); | 1565 | $nextord = ord($nextchar); |
1584 | $nextord_adj = $nextord + 64; | 1566 | $nextord_adj = $nextord + 64; |
1585 | // for a reference, see | 1567 | // for a reference, see |
1586 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html | 1568 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html |
1587 | 1569 | ||
1588 | // À - Þ but not × | 1570 | // À - Þ but not × |
1589 | if ( $nextord_adj >= 192 | 1571 | if ($nextord_adj >= 192 |
1590 | && $nextord_adj <= 222 | 1572 | && $nextord_adj <= 222 |
1591 | && $nextord_adj != 215) { | 1573 | && $nextord_adj != 215 |
1592 | 1574 | ) { | |
1593 | $nextchar = chr($nextord + 32); | 1575 | $nextchar = chr($nextord + 32); |
1594 | } | 1576 | } |
1595 | 1577 | ||
1596 | // lower case cyrillic alphabet | ||
1597 | } elseif ($ord == 208) { | 1578 | } elseif ($ord == 208) { |
1579 | // lower case cyrillic alphabet | ||
1598 | $nextord = ord($nextchar); | 1580 | $nextord = ord($nextchar); |
1599 | // if A - Pe | 1581 | // if A - Pe |
1600 | if ($nextord >= 144 && $nextord <= 159) { | 1582 | if ($nextord >= 144 && $nextord <= 159) { |
1601 | // lower case | 1583 | // lower case |
1602 | $nextchar = chr($nextord + 32); | 1584 | $nextchar = chr($nextord + 32); |
1603 | 1585 | ||
1604 | // if Er - Ya | ||
1605 | } elseif ($nextord >= 160 && $nextord <= 175) { | 1586 | } elseif ($nextord >= 160 && $nextord <= 175) { |
1587 | // if Er - Ya | ||
1606 | // lower case | 1588 | // lower case |
1607 | $char = chr(209); // == $ord++ | 1589 | $char = chr(209); // == $ord++ |
1608 | $nextchar = chr($nextord - 32); | 1590 | $nextchar = chr($nextord - 32); |
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect | |||
1611 | } | 1593 | } |
1612 | 1594 | ||
1613 | // tag on next byte | 1595 | // tag on next byte |
1614 | return $char . $nextchar; | 1596 | return $char . $nextchar; |
1615 | |||
1616 | } elseif ($ord >> 4 == 14) { // three-byte char | 1597 | } elseif ($ord >> 4 == 14) { // three-byte char |
1617 | 1598 | ||
1618 | // tag on next 2 bytes | 1599 | // tag on next 2 bytes |
1619 | return $char . $str{$counter++} . $str{$counter++}; | 1600 | return $char . $str{$counter++} . $str{$counter++}; |
1620 | 1601 | ||
1621 | } elseif ($ord >> 3 == 30) { // four-byte char | 1602 | } elseif ($ord >> 3 == 30) { // four-byte char |
1622 | 1603 | ||
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect | |||
1628 | } | 1609 | } |
1629 | } | 1610 | } |
1630 | 1611 | ||
1631 | } | 1612 | /** |
1613 | * Converts an $language input parameter from the configured mode | ||
1614 | * to the language name that is used internally. | ||
1615 | * | ||
1616 | * Works for strings and arrays. | ||
1617 | * | ||
1618 | * @param string|array $lang A language description ("english"/"en"/"eng") | ||
1619 | * @param boolean $convertKey If $lang is an array, setting $convertKey | ||
1620 | * converts the keys to the language name. | ||
1621 | * | ||
1622 | * @return string|array Language name | ||
1623 | */ | ||
function _convertFromNameMode($lang, $convertKey = false)
{
    // Name mode 0: input is returned exactly as given.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Mode 2 selects the two-letter lookup; every other non-zero mode
    // selects the three-letter lookup.
    $lookup = ($this->_name_mode == 2) ? 'code2ToName' : 'code3ToName';

    // Scalar case: translate the single code. A failed lookup (null)
    // is cast to the empty string.
    if (is_string($lang)) {
        return (string)Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // Array case: translate either the keys or the values, leaving the
    // other half of each pair untouched.
    $converted = array();
    foreach ($lang as $key => $val) {
        if (!$convertKey) {
            $converted[$key] = (string)Text_LanguageDetect_ISO639::$lookup($val);
            continue;
        }

        $nameKey = (string)Text_LanguageDetect_ISO639::$lookup($key);
        $converted[$nameKey] = $val;
    }

    return $converted;
}
1632 | 1651 | ||
1633 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | 1652 | /** |
1653 | * Converts an $language output parameter from the language name that is | ||
1654 | * used internally to the configured mode. | ||
1655 | * | ||
1656 | * Works for strings and arrays. | ||
1657 | * | ||
1658 | * @param string|array $lang Internal language name(s), e.g. "english" | ||
1659 | * @param boolean $convertKey If $lang is an array, setting $convertKey | ||
1660 | * converts the keys instead of the values. | ||
1661 | * | ||
1662 | * @return string|array Language name or code in the configured mode | ||
1663 | */ | ||
function _convertToNameMode($lang, $convertKey = false)
{
    // Name mode 0: output is returned exactly as given.
    if ($this->_name_mode == 0) {
        return $lang;
    }

    // Mode 2 selects the two-letter lookup; every other non-zero mode
    // selects the three-letter lookup.
    $lookup = ($this->_name_mode == 2) ? 'nameToCode2' : 'nameToCode3';

    // Scalar case: translate the single name. The lookup result is
    // returned as-is, so it may be null.
    if (is_string($lang)) {
        return Text_LanguageDetect_ISO639::$lookup($lang);
    }

    // Array case: translate either the keys or the values, leaving the
    // other half of each pair untouched.
    $converted = array();
    foreach ($lang as $key => $val) {
        if (!$convertKey) {
            $converted[$key] = Text_LanguageDetect_ISO639::$lookup($val);
            continue;
        }

        $codeKey = Text_LanguageDetect_ISO639::$lookup($key);
        $converted[$codeKey] = $val;
    }

    return $converted;
}
1691 | } | ||
1634 | 1692 | ||
1635 | ?> | 1693 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file |
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php new file mode 100644 index 00000000..196d994f --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php | |||
@@ -0,0 +1,57 @@ | |||
1 | <?php | ||
class Text_LanguageDetect_Exception extends Exception
{
    /** The database file does not exist. */
    const DB_NOT_FOUND = 10;

    /** The database file exists but cannot be read. */
    const DB_NOT_READABLE = 11;

    /** The database file has no content. */
    const DB_EMPTY = 12;

    /** The database content is not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are switched on. */
    const MAGIC_QUOTES = 14;

    /** A method received a parameter of the wrong type. */
    const PARAM_TYPE = 20;

    /** A parameter contains an invalid character. */
    const INVALID_CHAR = 21;

    /** The requested language is missing from the database. */
    const UNKNOWN_LANGUAGE = 30;

    /** Block detection failed. */
    const BLOCK_DETECTION = 40;

    /** Language clustering failed. */
    const NO_HIGHEST_KEY = 50;
}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php new file mode 100644 index 00000000..05b0590d --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php | |||
@@ -0,0 +1,339 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Part of Text_LanguageDetect | ||
4 | * | ||
5 | * PHP version 5 | ||
6 | * | ||
7 | * @category Text | ||
8 | * @package Text_LanguageDetect | ||
9 | * @author Christian Weiske <cweiske@php.net> | ||
10 | * @copyright 2011 Christian Weiske <cweiske@php.net> | ||
11 | * @license http://www.debian.org/misc/bsd.license BSD | ||
12 | * @version SVN: $Id$ | ||
13 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
14 | */ | ||
15 | |||
16 | /** | ||
17 | * Provides a mapping between the languages from lang.dat and the | ||
18 | * ISO 639-1 and ISO-639-2 codes. | ||
19 | * | ||
20 | * Note that this class contains only languages that exist in lang.dat. | ||
21 | * | ||
22 | * @category Text | ||
23 | * @package Text_LanguageDetect | ||
24 | * @author Christian Weiske <cweiske@php.net> | ||
25 | * @copyright 2011 Christian Weiske <cweiske@php.net> | ||
26 | * @license http://www.debian.org/misc/bsd.license BSD | ||
27 | * @link http://www.loc.gov/standards/iso639-2/php/code_list.php | ||
28 | */ | ||
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Must stay the exact inverse of $nameToCode3 so that codes produced
     * by nameToCode3() can be fed back into code3ToName().
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // 'ron' matches $nameToCode3; the previous key 'rom' is the
        // ISO 639-2 code for Romany, not Romanian.
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *                     (matched case-insensitively)
     *
     * @return string|null Two-letter language code (e.g. "sv"), or NULL if
     *                     the name is unknown or has no 2-letter code
     */
    public static function nameToCode2($lang)
    {
        return self::lookup(self::$nameToCode2, $lang);
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *                     (matched case-insensitively)
     *
     * @return string|null Three-letter language code (e.g. "swe"), or NULL
     *                     if not found
     */
    public static function nameToCode3($lang)
    {
        return self::lookup(self::$nameToCode3, $lang);
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * @param string $code Two-letter language code (e.g. "sv"),
     *                     matched case-insensitively
     *
     * @return string|null English language name like "swedish", or NULL
     *                     if not found
     */
    public static function code2ToName($code)
    {
        return self::lookup(self::$code2ToName, $code);
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * @param string $code Three-letter language code (e.g. "swe"),
     *                     matched case-insensitively
     *
     * @return string|null English language name like "swedish", or NULL
     *                     if not found
     */
    public static function code3ToName($code)
    {
        return self::lookup(self::$code3ToName, $code);
    }

    /**
     * Case-insensitive lookup shared by the four public converters.
     *
     * The key is lower-cased and that lower-cased value is what gets
     * looked up (the code-to-name methods previously lower-cased into an
     * unused variable and then searched with the raw input).
     *
     * @param array  $map One of the static mapping tables of this class
     * @param string $key Language name or code to look up
     *
     * @return string|null Mapped value, or NULL when the key is missing or
     *                     is mapped to NULL
     */
    private static function lookup(array $map, $key)
    {
        $key = strtolower($key);

        // isset() is false for entries whose value is NULL, so languages
        // without a 2-letter code also yield NULL here.
        return isset($map[$key]) ? $map[$key] : null;
    }
}
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php index 7f15fa98..fb0e1e20 100644 --- a/inc/3rdparty/libraries/language-detect/Parser.php +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php | |||
@@ -8,7 +8,7 @@ | |||
8 | * @author Nicholas Pisarro | 8 | * @author Nicholas Pisarro |
9 | * @copyright 2006 | 9 | * @copyright 2006 |
10 | * @license BSD | 10 | * @license BSD |
11 | * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ | 11 | * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $ |
12 | * @link http://pear.php.net/package/Text_LanguageDetect/ | 12 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
13 | * @link http://langdetect.blogspot.com/ | 13 | * @link http://langdetect.blogspot.com/ |
14 | */ | 14 | */ |
@@ -28,7 +28,7 @@ | |||
28 | * @author Nicholas Pisarro | 28 | * @author Nicholas Pisarro |
29 | * @copyright 2006 | 29 | * @copyright 2006 |
30 | * @license BSD | 30 | * @license BSD |
31 | * @version release: 0.2.3 | 31 | * @version release: 0.3.0 |
32 | */ | 32 | */ |
33 | class Text_LanguageDetect_Parser extends Text_LanguageDetect | 33 | class Text_LanguageDetect_Parser extends Text_LanguageDetect |
34 | { | 34 | { |
@@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect | |||
102 | * @access private | 102 | * @access private |
103 | * @param string $string string to be parsed | 103 | * @param string $string string to be parsed |
104 | */ | 104 | */ |
function __construct($string)
{
    // Declared as __construct rather than as a method named after the
    // class: PHP4-style constructors are deprecated since PHP 7.0 and are
    // treated as ordinary methods in PHP 8, in which case this body would
    // never run on `new Text_LanguageDetect_Parser($str)` and the string
    // to parse would stay unset.
    $this->_string = $string;
}
110 | 108 | ||
111 | /** | 109 | /** |
112 | * Returns true if a string is suitable for parsing | 110 | * Returns true if a string is suitable for parsing |
113 | * | 111 | * |
114 | * @static | ||
115 | * @access public | ||
116 | * @param string $str input string to test | 112 | * @param string $str input string to test |
117 | * @return bool true if acceptable, false if not | 113 | * @return bool true if acceptable, false if not |
118 | */ | 114 | */ |
119 | function validateString($str) { | 115 | public static function validateString($str) { |
120 | if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { | 116 | if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { |
121 | return true; | 117 | return true; |
122 | } else { | 118 | } else { |
@@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect | |||
222 | 218 | ||
223 | // unicode startup | 219 | // unicode startup |
224 | if ($this->_compile_unicode) { | 220 | if ($this->_compile_unicode) { |
225 | $blocks =& $this->_read_unicode_block_db(); | 221 | $blocks = $this->_read_unicode_block_db(); |
226 | |||
227 | $block_count = count($blocks); | 222 | $block_count = count($blocks); |
228 | 223 | ||
229 | $skipped_count = 0; | 224 | $skipped_count = 0; |
@@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect | |||
349 | } | 344 | } |
350 | } | 345 | } |
351 | 346 | ||
352 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | 347 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file |
353 | |||
354 | ?> | ||
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php index 2e8991cc..d0f09d74 100644 --- a/inc/3rdparty/libraries/readability/Readability.php +++ b/inc/3rdparty/libraries/readability/Readability.php | |||
@@ -1,1138 +1,1138 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Arc90's Readability ported to PHP for FiveFilters.org | 3 | * Arc90's Readability ported to PHP for FiveFilters.org |
4 | * Based on readability.js version 1.7.1 (without multi-page support) | 4 | * Based on readability.js version 1.7.1 (without multi-page support) |
5 | * Updated to allow HTML5 parsing with html5lib | 5 | * Updated to allow HTML5 parsing with html5lib |
6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds | 6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds |
7 | * ------------------------------------------------------ | 7 | * ------------------------------------------------------ |
8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js | 8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js |
9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ | 9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ |
10 | * JS Source: http://code.google.com/p/arc90labs-readability | 10 | * JS Source: http://code.google.com/p/arc90labs-readability |
11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net | 11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net |
12 | * More information: http://fivefilters.org/content-only/ | 12 | * More information: http://fivefilters.org/content-only/ |
13 | * License: Apache License, Version 2.0 | 13 | * License: Apache License, Version 2.0 |
14 | * Requires: PHP5 | 14 | * Requires: PHP5 |
15 | * Date: 2012-09-19 | 15 | * Date: 2012-09-19 |
16 | * | 16 | * |
17 | * Differences between the PHP port and the original | 17 | * Differences between the PHP port and the original |
18 | * ------------------------------------------------------ | 18 | * ------------------------------------------------------ |
19 | * Arc90's Readability is designed to run in the browser. It works on the DOM | 19 | * Arc90's Readability is designed to run in the browser. It works on the DOM |
20 | * tree (the parsed HTML) after the page's CSS styles have been applied and | 20 | * tree (the parsed HTML) after the page's CSS styles have been applied and |
21 | * Javascript code executed. This PHP port does not run inside a browser. | 21 | * Javascript code executed. This PHP port does not run inside a browser. |
22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot | 22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot |
23 | * rely on CSS or Javascript support. As such, the results will not always | 23 | * rely on CSS or Javascript support. As such, the results will not always |
24 | * match Arc90's Readability. (For example, if a web page contains CSS style | 24 | * match Arc90's Readability. (For example, if a web page contains CSS style |
25 | * rules or Javascript code which hide certain HTML elements from display, | 25 | * rules or Javascript code which hide certain HTML elements from display, |
26 | * Arc90's Readability will dismiss those from consideration but our PHP port, | 26 | * Arc90's Readability will dismiss those from consideration but our PHP port, |
27 | * unable to understand CSS or Javascript, will not know any better.) | 27 | * unable to understand CSS or Javascript, will not know any better.) |
28 | * | 28 | * |
29 | * Another significant difference is that the aim of Arc90's Readability is | 29 | * Another significant difference is that the aim of Arc90's Readability is |
30 | * to re-present the main content block of a given web page so users can | 30 | * to re-present the main content block of a given web page so users can |
31 | * read it more easily in their browsers. Correct identification, clean up, | 31 | * read it more easily in their browsers. Correct identification, clean up, |
32 | * and separation of the content block is only a part of this process. | 32 | * and separation of the content block is only a part of this process. |
33 | * This PHP port is only concerned with this part, it does not include code | 33 | * This PHP port is only concerned with this part, it does not include code |
34 | * that relates to presentation in the browser - Arc90 already do | 34 | * that relates to presentation in the browser - Arc90 already do |
35 | * that extremely well, and for PDF output there's FiveFilters.org's | 35 | * that extremely well, and for PDF output there's FiveFilters.org's |
36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. | 36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. |
37 | * | 37 | * |
38 | * Finally, this class contains methods that might be useful for developers | 38 | * Finally, this class contains methods that might be useful for developers |
39 | * working on HTML document fragments. So without deviating too much from | 39 | * working on HTML document fragments. So without deviating too much from |
40 | * the original code (which I don't want to do because it makes debugging | 40 | * the original code (which I don't want to do because it makes debugging |
41 | * and updating more difficult), I've tried to make it a little more | 41 | * and updating more difficult), I've tried to make it a little more |
42 | * developer friendly. You should be able to use the methods here on | 42 | * developer friendly. You should be able to use the methods here on |
43 | * existing DOMElement objects without passing an entire HTML document to | 43 | * existing DOMElement objects without passing an entire HTML document to |
44 | * be parsed. | 44 | * be parsed. |
45 | */ | 45 | */ |
46 | 46 | ||
47 | // This class allows us to do JavaScript like assignments to innerHTML | 47 | // This class allows us to do JavaScript like assignments to innerHTML |
48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); | 48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); |
49 | 49 | ||
50 | // Alternative usage (for testing only!) | 50 | // Alternative usage (for testing only!) |
51 | // uncomment the lines below and call Readability.php in your browser | 51 | // uncomment the lines below and call Readability.php in your browser |
52 | // passing it the URL of the page you'd like content from, e.g.: | 52 | // passing it the URL of the page you'd like content from, e.g.: |
53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php | 53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php |
54 | 54 | ||
55 | /* | 55 | /* |
56 | if (!isset($_GET['url']) || $_GET['url'] == '') { | 56 | if (!isset($_GET['url']) || $_GET['url'] == '') { |
57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); | 57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); |
58 | } | 58 | } |
59 | $url = $_GET['url']; | 59 | $url = $_GET['url']; |
60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; | 60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; |
61 | $html = file_get_contents($url); | 61 | $html = file_get_contents($url); |
62 | $r = new Readability($html, $url); | 62 | $r = new Readability($html, $url); |
63 | $r->init(); | 63 | $r->init(); |
64 | echo $r->articleContent->innerHTML; | 64 | echo $r->articleContent->innerHTML; |
65 | */ | 65 | */ |
66 | 66 | ||
67 | class Readability | 67 | class Readability |
68 | { | 68 | { |
69 | public $version = '1.7.1-without-multi-page'; | 69 | public $version = '1.7.1-without-multi-page'; |
70 | public $convertLinksToFootnotes = false; | 70 | public $convertLinksToFootnotes = false; |
71 | public $revertForcedParagraphElements = true; | 71 | public $revertForcedParagraphElements = true; |
72 | public $articleTitle; | 72 | public $articleTitle; |
73 | public $articleContent; | 73 | public $articleContent; |
74 | public $dom; | 74 | public $dom; |
75 | public $url = null; // optional - URL where HTML was retrieved | 75 | public $url = null; // optional - URL where HTML was retrieved |
76 | public $debug = false; | 76 | public $debug = false; |
77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 | 77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 |
78 | protected $body = null; // | 78 | protected $body = null; // |
79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later | 79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later |
80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. | 80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. |
81 | protected $success = false; // indicates whether we were able to extract or not | 81 | protected $success = false; // indicates whether we were able to extract or not |
82 | 82 | ||
83 | /** | 83 | /** |
84 | * All of the regular expressions in use within readability. | 84 | * All of the regular expressions in use within readability. |
85 | * Defined up here so we don't instantiate them repeatedly in loops. | 85 | * Defined up here so we don't instantiate them repeatedly in loops. |
86 | **/ | 86 | **/ |
87 | public $regexps = array( | 87 | public $regexps = array( |
88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', | 88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', |
89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', | 89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', | 90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', |
91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', | 91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', |
92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', | 92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', |
93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', | 93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', |
94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', | 94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', |
95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() | 95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() |
96 | 'normalize' => '/\s{2,}/', | 96 | 'normalize' => '/\s{2,}/', |
97 | 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', | 97 | 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', |
98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', | 98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', |
99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' | 99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' |
100 | ); | 100 | ); |
101 | 101 | ||
102 | /* constants */ | 102 | /* constants */ |
103 | const FLAG_STRIP_UNLIKELYS = 1; | 103 | const FLAG_STRIP_UNLIKELYS = 1; |
104 | const FLAG_WEIGHT_CLASSES = 2; | 104 | const FLAG_WEIGHT_CLASSES = 2; |
105 | const FLAG_CLEAN_CONDITIONALLY = 4; | 105 | const FLAG_CLEAN_CONDITIONALLY = 4; |
106 | 106 | ||
107 | /** | 107 | /** |
108 | * Create instance of Readability | 108 | * Create instance of Readability |
109 | * @param string UTF-8 encoded string | 109 | * @param string UTF-8 encoded string |
110 | * @param string (optional) URL associated with HTML (used for footnotes) | 110 | * @param string (optional) URL associated with HTML (used for footnotes) |
111 | * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') | 111 | * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') |
112 | */ | 112 | */ |
113 | function __construct($html, $url=null, $parser='libxml') | 113 | function __construct($html, $url=null, $parser='libxml') |
114 | { | 114 | { |
115 | $this->url = $url; | 115 | $this->url = $url; |
116 | /* Turn all double br's into p's */ | 116 | /* Turn all double br's into p's */ |
117 | $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); | 117 | $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); |
118 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); | 118 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); |
119 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); | 119 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); |
120 | if (trim($html) == '') $html = '<html></html>'; | 120 | if (trim($html) == '') $html = '<html></html>'; |
121 | if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { | 121 | if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { |
122 | // all good | 122 | // all good |
123 | } else { | 123 | } else { |
124 | $this->dom = new DOMDocument(); | 124 | $this->dom = new DOMDocument(); |
125 | $this->dom->preserveWhiteSpace = false; | 125 | $this->dom->preserveWhiteSpace = false; |
126 | @$this->dom->loadHTML($html); | 126 | @$this->dom->loadHTML($html); |
127 | } | 127 | } |
128 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); | 128 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); |
129 | } | 129 | } |
130 | 130 | ||
131 | /** | 131 | /** |
132 | * Get article title element | 132 | * Get article title element |
133 | * @return DOMElement | 133 | * @return DOMElement |
134 | */ | 134 | */ |
135 | public function getTitle() { | 135 | public function getTitle() { |
136 | return $this->articleTitle; | 136 | return $this->articleTitle; |
137 | } | 137 | } |
138 | 138 | ||
139 | /** | 139 | /** |
140 | * Get article content element | 140 | * Get article content element |
141 | * @return DOMElement | 141 | * @return DOMElement |
142 | */ | 142 | */ |
143 | public function getContent() { | 143 | public function getContent() { |
144 | return $this->articleContent; | 144 | return $this->articleContent; |
145 | } | 145 | } |
146 | 146 | ||
147 | /** | 147 | /** |
148 | * Runs readability. | 148 | * Runs readability. |
149 | * | 149 | * |
150 | * Workflow: | 150 | * Workflow: |
151 | * 1. Prep the document by removing script tags, css, etc. | 151 | * 1. Prep the document by removing script tags, css, etc. |
152 | * 2. Build readability's DOM tree. | 152 | * 2. Build readability's DOM tree. |
153 | * 3. Grab the article content from the current dom tree. | 153 | * 3. Grab the article content from the current dom tree. |
154 | * 4. Replace the current DOM tree with the new one. | 154 | * 4. Replace the current DOM tree with the new one. |
155 | * 5. Read peacefully. | 155 | * 5. Read peacefully. |
156 | * | 156 | * |
157 | * @return boolean true if we found content, false otherwise | 157 | * @return boolean true if we found content, false otherwise |
158 | **/ | 158 | **/ |
159 | public function init() | 159 | public function init() |
160 | { | 160 | { |
161 | if (!isset($this->dom->documentElement)) return false; | 161 | if (!isset($this->dom->documentElement)) return false; |
162 | $this->removeScripts($this->dom); | 162 | $this->removeScripts($this->dom); |
163 | //die($this->getInnerHTML($this->dom->documentElement)); | 163 | //die($this->getInnerHTML($this->dom->documentElement)); |
164 | 164 | ||
165 | // Assume successful outcome | 165 | // Assume successful outcome |
166 | $this->success = true; | 166 | $this->success = true; |
167 | 167 | ||
168 | $bodyElems = $this->dom->getElementsByTagName('body'); | 168 | $bodyElems = $this->dom->getElementsByTagName('body'); |
169 | if ($bodyElems->length > 0) { | 169 | if ($bodyElems->length > 0) { |
170 | if ($this->bodyCache == null) { | 170 | if ($this->bodyCache == null) { |
171 | $this->bodyCache = $bodyElems->item(0)->innerHTML; | 171 | $this->bodyCache = $bodyElems->item(0)->innerHTML; |
172 | } | 172 | } |
173 | if ($this->body == null) { | 173 | if ($this->body == null) { |
174 | $this->body = $bodyElems->item(0); | 174 | $this->body = $bodyElems->item(0); |
175 | } | 175 | } |
176 | } | 176 | } |
177 | 177 | ||
178 | $this->prepDocument(); | 178 | $this->prepDocument(); |
179 | 179 | ||
180 | //die($this->dom->documentElement->parentNode->nodeType); | 180 | //die($this->dom->documentElement->parentNode->nodeType); |
181 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); | 181 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); |
182 | //die($this->getInnerHTML($this->dom->documentElement)); | 182 | //die($this->getInnerHTML($this->dom->documentElement)); |
183 | 183 | ||
184 | /* Build readability's DOM tree */ | 184 | /* Build readability's DOM tree */ |
185 | $overlay = $this->dom->createElement('div'); | 185 | $overlay = $this->dom->createElement('div'); |
186 | $innerDiv = $this->dom->createElement('div'); | 186 | $innerDiv = $this->dom->createElement('div'); |
187 | $articleTitle = $this->getArticleTitle(); | 187 | $articleTitle = $this->getArticleTitle(); |
188 | $articleContent = $this->grabArticle(); | 188 | $articleContent = $this->grabArticle(); |
189 | 189 | ||
190 | if (!$articleContent) { | 190 | if (!$articleContent) { |
191 | $this->success = false; | 191 | $this->success = false; |
192 | $articleContent = $this->dom->createElement('div'); | 192 | $articleContent = $this->dom->createElement('div'); |
193 | $articleContent->setAttribute('id', 'readability-content'); | 193 | $articleContent->setAttribute('id', 'readability-content'); |
194 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; | 194 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; |
195 | } | 195 | } |
196 | 196 | ||
197 | $overlay->setAttribute('id', 'readOverlay'); | 197 | $overlay->setAttribute('id', 'readOverlay'); |
198 | $innerDiv->setAttribute('id', 'readInner'); | 198 | $innerDiv->setAttribute('id', 'readInner'); |
199 | 199 | ||
200 | /* Glue the structure of our document together. */ | 200 | /* Glue the structure of our document together. */ |
201 | $innerDiv->appendChild($articleTitle); | 201 | $innerDiv->appendChild($articleTitle); |
202 | $innerDiv->appendChild($articleContent); | 202 | $innerDiv->appendChild($articleContent); |
203 | $overlay->appendChild($innerDiv); | 203 | $overlay->appendChild($innerDiv); |
204 | 204 | ||
205 | /* Clear the old HTML, insert the new content. */ | 205 | /* Clear the old HTML, insert the new content. */ |
206 | $this->body->innerHTML = ''; | 206 | $this->body->innerHTML = ''; |
207 | $this->body->appendChild($overlay); | 207 | $this->body->appendChild($overlay); |
208 | //document.body.insertBefore(overlay, document.body.firstChild); | 208 | //document.body.insertBefore(overlay, document.body.firstChild); |
209 | $this->body->removeAttribute('style'); | 209 | $this->body->removeAttribute('style'); |
210 | 210 | ||
211 | $this->postProcessContent($articleContent); | 211 | $this->postProcessContent($articleContent); |
212 | 212 | ||
213 | // Set title and content instance variables | 213 | // Set title and content instance variables |
214 | $this->articleTitle = $articleTitle; | 214 | $this->articleTitle = $articleTitle; |
215 | $this->articleContent = $articleContent; | 215 | $this->articleContent = $articleContent; |
216 | 216 | ||
217 | return $this->success; | 217 | return $this->success; |
218 | } | 218 | } |
219 | 219 | ||
220 | /** | 220 | /** |
221 | * Debug | 221 | * Debug |
222 | */ | 222 | */ |
223 | protected function dbg($msg) { | 223 | protected function dbg($msg) { |
224 | if ($this->debug) echo '* ',$msg, "\n"; | 224 | if ($this->debug) echo '* ',$msg, "\n"; |
225 | } | 225 | } |
226 | 226 | ||
227 | /** | 227 | /** |
228 | * Run any post-process modifications to article content as necessary. | 228 | * Run any post-process modifications to article content as necessary. |
229 | * | 229 | * |
230 | * @param DOMElement | 230 | * @param DOMElement |
231 | * @return void | 231 | * @return void |
232 | */ | 232 | */ |
233 | public function postProcessContent($articleContent) { | 233 | public function postProcessContent($articleContent) { |
234 | if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { | 234 | if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { |
235 | $this->addFootnotes($articleContent); | 235 | $this->addFootnotes($articleContent); |
236 | } | 236 | } |
237 | } | 237 | } |
238 | 238 | ||
239 | /** | 239 | /** |
240 | * Get the article title as an H1. | 240 | * Get the article title as an H1. |
241 | * | 241 | * |
242 | * @return DOMElement | 242 | * @return DOMElement |
243 | */ | 243 | */ |
244 | protected function getArticleTitle() { | 244 | protected function getArticleTitle() { |
245 | $curTitle = ''; | 245 | $curTitle = ''; |
246 | $origTitle = ''; | 246 | $origTitle = ''; |
247 | 247 | ||
248 | try { | 248 | try { |
249 | $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); | 249 | $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); |
250 | } catch(Exception $e) {} | 250 | } catch(Exception $e) {} |
251 | 251 | ||
252 | if (preg_match('/ [\|\-] /', $curTitle)) | 252 | if (preg_match('/ [\|\-] /', $curTitle)) |
253 | { | 253 | { |
254 | $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); | 254 | $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); |
255 | 255 | ||
256 | if (count(explode(' ', $curTitle)) < 3) { | 256 | if (count(explode(' ', $curTitle)) < 3) { |
257 | $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); | 257 | $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); |
258 | } | 258 | } |
259 | } | 259 | } |
260 | else if (strpos($curTitle, ': ') !== false) | 260 | else if (strpos($curTitle, ': ') !== false) |
261 | { | 261 | { |
262 | $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); | 262 | $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); |
263 | 263 | ||
264 | if (count(explode(' ', $curTitle)) < 3) { | 264 | if (count(explode(' ', $curTitle)) < 3) { |
265 | $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); | 265 | $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); |
266 | } | 266 | } |
267 | } | 267 | } |
268 | else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) | 268 | else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) |
269 | { | 269 | { |
270 | $hOnes = $this->dom->getElementsByTagName('h1'); | 270 | $hOnes = $this->dom->getElementsByTagName('h1'); |
271 | if($hOnes->length == 1) | 271 | if($hOnes->length == 1) |
272 | { | 272 | { |
273 | $curTitle = $this->getInnerText($hOnes->item(0)); | 273 | $curTitle = $this->getInnerText($hOnes->item(0)); |
274 | } | 274 | } |
275 | } | 275 | } |
276 | 276 | ||
277 | $curTitle = trim($curTitle); | 277 | $curTitle = trim($curTitle); |
278 | 278 | ||
279 | if (count(explode(' ', $curTitle)) <= 4) { | 279 | if (count(explode(' ', $curTitle)) <= 4) { |
280 | $curTitle = $origTitle; | 280 | $curTitle = $origTitle; |
281 | } | 281 | } |
282 | 282 | ||
283 | $articleTitle = $this->dom->createElement('h1'); | 283 | $articleTitle = $this->dom->createElement('h1'); |
284 | $articleTitle->innerHTML = $curTitle; | 284 | $articleTitle->innerHTML = $curTitle; |
285 | 285 | ||
286 | return $articleTitle; | 286 | return $articleTitle; |
287 | } | 287 | } |
288 | 288 | ||
289 | /** | 289 | /** |
290 | * Prepare the HTML document for readability to scrape it. | 290 | * Prepare the HTML document for readability to scrape it. |
291 | * This includes things like stripping javascript, CSS, and handling terrible markup. | 291 | * This includes things like stripping javascript, CSS, and handling terrible markup. |
292 | * | 292 | * |
293 | * @return void | 293 | * @return void |
294 | **/ | 294 | **/ |
295 | protected function prepDocument() { | 295 | protected function prepDocument() { |
296 | /** | 296 | /** |
297 | * In some cases a body element can't be found (if the HTML is totally hosed for example) | 297 | * In some cases a body element can't be found (if the HTML is totally hosed for example) |
298 | * so we create a new body node and append it to the document. | 298 | * so we create a new body node and append it to the document. |
299 | */ | 299 | */ |
300 | if ($this->body == null) | 300 | if ($this->body == null) |
301 | { | 301 | { |
302 | $this->body = $this->dom->createElement('body'); | 302 | $this->body = $this->dom->createElement('body'); |
303 | $this->dom->documentElement->appendChild($this->body); | 303 | $this->dom->documentElement->appendChild($this->body); |
304 | } | 304 | } |
305 | $this->body->setAttribute('id', 'readabilityBody'); | 305 | $this->body->setAttribute('id', 'readabilityBody'); |
306 | 306 | ||
307 | /* Remove all style tags in head */ | 307 | /* Remove all style tags in head */ |
308 | $styleTags = $this->dom->getElementsByTagName('style'); | 308 | $styleTags = $this->dom->getElementsByTagName('style'); |
309 | for ($i = $styleTags->length-1; $i >= 0; $i--) | 309 | for ($i = $styleTags->length-1; $i >= 0; $i--) |
310 | { | 310 | { |
311 | $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); | 311 | $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); |
312 | } | 312 | } |
313 | 313 | ||
314 | /* Turn all double br's into p's */ | 314 | /* Turn all double br's into p's */ |
315 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ | 315 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ |
316 | //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); | 316 | //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); |
317 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. | 317 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. |
318 | // Manipulating innerHTML as it's done in JS is not possible in PHP. | 318 | // Manipulating innerHTML as it's done in JS is not possible in PHP. |
319 | } | 319 | } |
320 | 320 | ||
321 | /** | 321 | /** |
322 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. | 322 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. |
323 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php | 323 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php |
324 | * | 324 | * |
325 | * @return void | 325 | * @return void |
326 | **/ | 326 | **/ |
327 | public function addFootnotes($articleContent) { | 327 | public function addFootnotes($articleContent) { |
328 | $footnotesWrapper = $this->dom->createElement('div'); | 328 | $footnotesWrapper = $this->dom->createElement('div'); |
329 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); | 329 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); |
330 | $footnotesWrapper->innerHTML = '<h3>References</h3>'; | 330 | $footnotesWrapper->innerHTML = '<h3>References</h3>'; |
331 | 331 | ||
332 | $articleFootnotes = $this->dom->createElement('ol'); | 332 | $articleFootnotes = $this->dom->createElement('ol'); |
333 | $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); | 333 | $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); |
334 | $footnotesWrapper->appendChild($articleFootnotes); | 334 | $footnotesWrapper->appendChild($articleFootnotes); |
335 | 335 | ||
336 | $articleLinks = $articleContent->getElementsByTagName('a'); | 336 | $articleLinks = $articleContent->getElementsByTagName('a'); |
337 | 337 | ||
338 | $linkCount = 0; | 338 | $linkCount = 0; |
339 | for ($i = 0; $i < $articleLinks->length; $i++) | 339 | for ($i = 0; $i < $articleLinks->length; $i++) |
340 | { | 340 | { |
341 | $articleLink = $articleLinks->item($i); | 341 | $articleLink = $articleLinks->item($i); |
342 | $footnoteLink = $articleLink->cloneNode(true); | 342 | $footnoteLink = $articleLink->cloneNode(true); |
343 | $refLink = $this->dom->createElement('a'); | 343 | $refLink = $this->dom->createElement('a'); |
344 | $footnote = $this->dom->createElement('li'); | 344 | $footnote = $this->dom->createElement('li'); |
345 | $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); | 345 | $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); |
346 | if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); | 346 | if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); |
347 | //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, | 347 | //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, |
348 | $linkText = $this->getInnerText($articleLink); | 348 | $linkText = $this->getInnerText($articleLink); |
349 | 349 | ||
350 | if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { | 350 | if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { |
351 | continue; | 351 | continue; |
352 | } | 352 | } |
353 | 353 | ||
354 | $linkCount++; | 354 | $linkCount++; |
355 | 355 | ||
356 | /** Add a superscript reference after the article link */ | 356 | /** Add a superscript reference after the article link */ |
357 | $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); | 357 | $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); |
358 | $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; | 358 | $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; |
359 | $refLink->setAttribute('class', 'readability-DoNotFootnote'); | 359 | $refLink->setAttribute('class', 'readability-DoNotFootnote'); |
360 | $refLink->setAttribute('style', 'color: inherit;'); | 360 | $refLink->setAttribute('style', 'color: inherit;'); |
361 | 361 | ||
362 | //TODO: does this work or should we use DOMNode.isSameNode()? | 362 | //TODO: does this work or should we use DOMNode.isSameNode()? |
363 | if ($articleLink->parentNode->lastChild == $articleLink) { | 363 | if ($articleLink->parentNode->lastChild == $articleLink) { |
364 | $articleLink->parentNode->appendChild($refLink); | 364 | $articleLink->parentNode->appendChild($refLink); |
365 | } else { | 365 | } else { |
366 | $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); | 366 | $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); |
367 | } | 367 | } |
368 | 368 | ||
369 | $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); | 369 | $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); |
370 | $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); | 370 | $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); |
371 | 371 | ||
372 | $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; | 372 | $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; |
373 | 373 | ||
374 | $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); | 374 | $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); |
375 | $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); | 375 | $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); |
376 | 376 | ||
377 | $footnote->appendChild($footnoteLink); | 377 | $footnote->appendChild($footnoteLink); |
378 | if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; | 378 | if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; |
379 | 379 | ||
380 | $articleFootnotes->appendChild($footnote); | 380 | $articleFootnotes->appendChild($footnote); |
381 | } | 381 | } |
382 | 382 | ||
383 | if ($linkCount > 0) { | 383 | if ($linkCount > 0) { |
384 | $articleContent->appendChild($footnotesWrapper); | 384 | $articleContent->appendChild($footnotesWrapper); |
385 | } | 385 | } |
386 | } | 386 | } |
387 | 387 | ||
/**
 * Turn the temporary "readability-styled" paragraphs back into text nodes.
 *
 * Every <p> element below $articleContent whose class attribute is exactly
 * "readability-styled" is replaced, in place, by a plain text node holding
 * that paragraph's text content.
 *
 * @param DOMElement $articleContent subtree to process (modified in place)
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Walk the matches from the last one to the first one.
    $index = $styled->length;
    while ($index-- > 0) {
        $paragraph = $styled->item($index);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407 | 407 | ||
/**
 * Prepare the article node for display.
 *
 * Runs the cleaning helpers of this class over the article (styles, breaks,
 * forms, objects, h1, a lone h2, iframes, headers, tables, lists, divs),
 * then removes paragraphs that contain no text and no img/embed/object/iframe
 * element, and finally drops <br> tags that sit directly before a <p>.
 *
 * @param DOMElement $articleContent article subtree, modified in place
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: no media children and no inner text. */
    $mediaTags = array('img', 'embed', 'object', 'iframe');
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    // The node list is live, so iterate backwards while removing entries.
    for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
        $paragraph = $articleParagraphs->item($i);

        $hasMedia = false;
        foreach ($mediaTags as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length > 0) {
                $hasMedia = true;
                break;
            }
        }

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        $withoutBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        if ($withoutBreaks === null) {
            // preg_replace() signals a PCRE failure by returning null rather than
            // throwing; assigning that null would erase the whole article.
            $this->dbg('Cleaning innerHTML of breaks failed (PCRE error code ' . preg_last_error() . '). Leaving content unchanged.');
        } else {
            $articleContent->innerHTML = $withoutBreaks;
        }
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466 | 466 | ||
/**
 * Give a node its "readability" attribute, which stores the content score.
 *
 * The attribute starts at 0, is adjusted by a fixed amount that depends on
 * the node's tag name, and then has the result of getClassWeight() for the
 * node added to it.
 *
 * @param DOMElement $node element to initialise (modified in place)
 * @return void
 **/
protected function initializeNode($node) {
    // Score adjustment per upper-cased tag name; tags not listed get none.
    $tagAdjustments = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = 0; // this is our contentScore
    $node->setAttributeNode($scoreAttr);

    // Upper-case the tag name so the lookup does not depend on its casing.
    $tag = strtoupper($node->tagName);
    if (isset($tagAdjustments[$tag])) {
        $scoreAttr->value += $tagAdjustments[$tag];
    }

    $scoreAttr->value += $this->getClassWeight($node);
}
513 | 513 | ||
514 | /*** | 514 | /*** |
515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | 515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | 516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
517 | * | 517 | * |
518 | * @return DOMElement | 518 | * @return DOMElement |
519 | **/ | 519 | **/ |
520 | protected function grabArticle($page=null) { | 520 | protected function grabArticle($page=null) { |
521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); | 521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); |
522 | if (!$page) $page = $this->dom; | 522 | if (!$page) $page = $this->dom; |
523 | $allElements = $page->getElementsByTagName('*'); | 523 | $allElements = $page->getElementsByTagName('*'); |
524 | /** | 524 | /** |
525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | 525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs |
526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | 526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) |
527 | * | 527 | * |
528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | 528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
529 | * TODO: Shouldn't this be a reverse traversal? | 529 | * TODO: Shouldn't this be a reverse traversal? |
530 | **/ | 530 | **/ |
531 | $node = null; | 531 | $node = null; |
532 | $nodesToScore = array(); | 532 | $nodesToScore = array(); |
533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { | 533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { |
534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { | 534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { |
535 | //$node = $targetList->item($nodeIndex); | 535 | //$node = $targetList->item($nodeIndex); |
536 | $tagName = strtoupper($node->tagName); | 536 | $tagName = strtoupper($node->tagName); |
537 | /* Remove unlikely candidates */ | 537 | /* Remove unlikely candidates */ |
538 | if ($stripUnlikelyCandidates) { | 538 | if ($stripUnlikelyCandidates) { |
539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); | 539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); |
540 | if ( | 540 | if ( |
541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && | 541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && |
542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && | 542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && |
543 | $tagName != 'BODY' | 543 | $tagName != 'BODY' |
544 | ) | 544 | ) |
545 | { | 545 | { |
546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); | 546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); |
547 | //$nodesToRemove[] = $node; | 547 | //$nodesToRemove[] = $node; |
548 | $node->parentNode->removeChild($node); | 548 | $node->parentNode->removeChild($node); |
549 | $nodeIndex--; | 549 | $nodeIndex--; |
550 | continue; | 550 | continue; |
551 | } | 551 | } |
552 | } | 552 | } |
553 | 553 | ||
554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { | 554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { |
555 | $nodesToScore[] = $node; | 555 | $nodesToScore[] = $node; |
556 | } | 556 | } |
557 | 557 | ||
558 | /* Turn all divs that don't have children block level elements into p's */ | 558 | /* Turn all divs that don't have children block level elements into p's */ |
559 | if ($tagName == 'DIV') { | 559 | if ($tagName == 'DIV') { |
560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { | 560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { |
561 | //$this->dbg('Altering div to p'); | 561 | //$this->dbg('Altering div to p'); |
562 | $newNode = $this->dom->createElement('p'); | 562 | $newNode = $this->dom->createElement('p'); |
563 | try { | 563 | try { |
564 | $newNode->innerHTML = $node->innerHTML; | 564 | $newNode->innerHTML = $node->innerHTML; |
565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); | 565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); |
566 | $node->parentNode->replaceChild($newNode, $node); | 566 | $node->parentNode->replaceChild($newNode, $node); |
567 | $nodeIndex--; | 567 | $nodeIndex--; |
568 | $nodesToScore[] = $node; // or $newNode? | 568 | $nodesToScore[] = $node; // or $newNode? |
569 | } | 569 | } |
570 | catch(Exception $e) { | 570 | catch(Exception $e) { |
571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); | 571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); |
572 | } | 572 | } |
573 | } | 573 | } |
574 | else | 574 | else |
575 | { | 575 | { |
576 | /* EXPERIMENTAL */ | 576 | /* EXPERIMENTAL */ |
577 | // TODO: change these p elements back to text nodes after processing | 577 | // TODO: change these p elements back to text nodes after processing |
578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { | 578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { |
579 | $childNode = $node->childNodes->item($i); | 579 | $childNode = $node->childNodes->item($i); |
580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE | 580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE |
581 | //$this->dbg('replacing text node with a p tag with the same content.'); | 581 | //$this->dbg('replacing text node with a p tag with the same content.'); |
582 | $p = $this->dom->createElement('p'); | 582 | $p = $this->dom->createElement('p'); |
583 | $p->innerHTML = $childNode->nodeValue; | 583 | $p->innerHTML = $childNode->nodeValue; |
584 | $p->setAttribute('style', 'display: inline;'); | 584 | $p->setAttribute('style', 'display: inline;'); |
585 | $p->setAttribute('class', 'readability-styled'); | 585 | $p->setAttribute('class', 'readability-styled'); |
586 | $childNode->parentNode->replaceChild($p, $childNode); | 586 | $childNode->parentNode->replaceChild($p, $childNode); |
587 | } | 587 | } |
588 | } | 588 | } |
589 | } | 589 | } |
590 | } | 590 | } |
591 | } | 591 | } |
592 | 592 | ||
593 | /** | 593 | /** |
594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. | 594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
595 | * Then add their score to their parent node. | 595 | * Then add their score to their parent node. |
596 | * | 596 | * |
597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. | 597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. |
598 | **/ | 598 | **/ |
599 | $candidates = array(); | 599 | $candidates = array(); |
600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { | 600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { |
601 | $parentNode = $nodesToScore[$pt]->parentNode; | 601 | $parentNode = $nodesToScore[$pt]->parentNode; |
602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; | 602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; |
603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); | 603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); |
604 | $innerText = $this->getInnerText($nodesToScore[$pt]); | 604 | $innerText = $this->getInnerText($nodesToScore[$pt]); |
605 | 605 | ||
606 | if (!$parentNode || !isset($parentNode->tagName)) { | 606 | if (!$parentNode || !isset($parentNode->tagName)) { |
607 | continue; | 607 | continue; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* If this paragraph is less than 25 characters, don't even count it. */ | 610 | /* If this paragraph is less than 25 characters, don't even count it. */ |
611 | if(strlen($innerText) < 25) { | 611 | if(strlen($innerText) < 25) { |
612 | continue; | 612 | continue; |
613 | } | 613 | } |
614 | 614 | ||
615 | /* Initialize readability data for the parent. */ | 615 | /* Initialize readability data for the parent. */ |
616 | if (!$parentNode->hasAttribute('readability')) | 616 | if (!$parentNode->hasAttribute('readability')) |
617 | { | 617 | { |
618 | $this->initializeNode($parentNode); | 618 | $this->initializeNode($parentNode); |
619 | $candidates[] = $parentNode; | 619 | $candidates[] = $parentNode; |
620 | } | 620 | } |
621 | 621 | ||
622 | /* Initialize readability data for the grandparent. */ | 622 | /* Initialize readability data for the grandparent. */ |
623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) | 623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) |
624 | { | 624 | { |
625 | $this->initializeNode($grandParentNode); | 625 | $this->initializeNode($grandParentNode); |
626 | $candidates[] = $grandParentNode; | 626 | $candidates[] = $grandParentNode; |
627 | } | 627 | } |
628 | 628 | ||
629 | $contentScore = 0; | 629 | $contentScore = 0; |
630 | 630 | ||
631 | /* Add a point for the paragraph itself as a base. */ | 631 | /* Add a point for the paragraph itself as a base. */ |
632 | $contentScore++; | 632 | $contentScore++; |
633 | 633 | ||
634 | /* Add points for any commas within this paragraph */ | 634 | /* Add points for any commas within this paragraph */ |
635 | $contentScore += count(explode(',', $innerText)); | 635 | $contentScore += count(explode(',', $innerText)); |
636 | 636 | ||
637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | 637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
638 | $contentScore += min(floor(strlen($innerText) / 100), 3); | 638 | $contentScore += min(floor(strlen($innerText) / 100), 3); |
639 | 639 | ||
640 | /* Add the score to the parent. The grandparent gets half. */ | 640 | /* Add the score to the parent. The grandparent gets half. */ |
641 | $parentNode->getAttributeNode('readability')->value += $contentScore; | 641 | $parentNode->getAttributeNode('readability')->value += $contentScore; |
642 | 642 | ||
643 | if ($grandParentNode) { | 643 | if ($grandParentNode) { |
644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; | 644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; |
645 | } | 645 | } |
646 | } | 646 | } |
647 | 647 | ||
648 | /** | 648 | /** |
649 | * After we've calculated scores, loop through all of the possible candidate nodes we found | 649 | * After we've calculated scores, loop through all of the possible candidate nodes we found |
650 | * and find the one with the highest score. | 650 | * and find the one with the highest score. |
651 | **/ | 651 | **/ |
652 | $topCandidate = null; | 652 | $topCandidate = null; |
653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) | 653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) |
654 | { | 654 | { |
655 | /** | 655 | /** |
656 | * Scale the final candidates score based on link density. Good content should have a | 656 | * Scale the final candidates score based on link density. Good content should have a |
657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. | 657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. |
658 | **/ | 658 | **/ |
659 | $readability = $candidates[$c]->getAttributeNode('readability'); | 659 | $readability = $candidates[$c]->getAttributeNode('readability'); |
660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); | 660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); |
661 | 661 | ||
662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); | 662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); |
663 | 663 | ||
664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { | 664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { |
665 | $topCandidate = $candidates[$c]; | 665 | $topCandidate = $candidates[$c]; |
666 | } | 666 | } |
667 | } | 667 | } |
668 | 668 | ||
669 | /** | 669 | /** |
670 | * If we still have no top candidate, just use the body as a last resort. | 670 | * If we still have no top candidate, just use the body as a last resort. |
671 | * We also have to copy the body node so it is something we can modify. | 671 | * We also have to copy the body node so it is something we can modify. |
672 | **/ | 672 | **/ |
673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') | 673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') |
674 | { | 674 | { |
675 | $topCandidate = $this->dom->createElement('div'); | 675 | $topCandidate = $this->dom->createElement('div'); |
676 | if ($page instanceof DOMDocument) { | 676 | if ($page instanceof DOMDocument) { |
677 | if (!isset($page->documentElement)) { | 677 | if (!isset($page->documentElement)) { |
678 | // we don't have a body either? what a mess! :) | 678 | // we don't have a body either? what a mess! :) |
679 | } else { | 679 | } else { |
680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; | 680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; |
681 | $page->documentElement->innerHTML = ''; | 681 | $page->documentElement->innerHTML = ''; |
682 | $page->documentElement->appendChild($topCandidate); | 682 | $page->documentElement->appendChild($topCandidate); |
683 | } | 683 | } |
684 | } else { | 684 | } else { |
685 | $topCandidate->innerHTML = $page->innerHTML; | 685 | $topCandidate->innerHTML = $page->innerHTML; |
686 | $page->innerHTML = ''; | 686 | $page->innerHTML = ''; |
687 | $page->appendChild($topCandidate); | 687 | $page->appendChild($topCandidate); |
688 | } | 688 | } |
689 | $this->initializeNode($topCandidate); | 689 | $this->initializeNode($topCandidate); |
690 | } | 690 | } |
691 | 691 | ||
692 | /** | 692 | /** |
693 | * Now that we have the top candidate, look through its siblings for content that might also be related. | 693 | * Now that we have the top candidate, look through its siblings for content that might also be related. |
694 | * Things like preambles, content split by ads that we removed, etc. | 694 | * Things like preambles, content split by ads that we removed, etc. |
695 | **/ | 695 | **/ |
696 | $articleContent = $this->dom->createElement('div'); | 696 | $articleContent = $this->dom->createElement('div'); |
697 | $articleContent->setAttribute('id', 'readability-content'); | 697 | $articleContent->setAttribute('id', 'readability-content'); |
698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); | 698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); |
699 | $siblingNodes = $topCandidate->parentNode->childNodes; | 699 | $siblingNodes = $topCandidate->parentNode->childNodes; |
700 | if (!isset($siblingNodes)) { | 700 | if (!isset($siblingNodes)) { |
701 | $siblingNodes = new stdClass; | 701 | $siblingNodes = new stdClass; |
702 | $siblingNodes->length = 0; | 702 | $siblingNodes->length = 0; |
703 | } | 703 | } |
704 | 704 | ||
705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) | 705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) |
706 | { | 706 | { |
707 | $siblingNode = $siblingNodes->item($s); | 707 | $siblingNode = $siblingNodes->item($s); |
708 | $append = false; | 708 | $append = false; |
709 | 709 | ||
710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); | 710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); |
711 | 711 | ||
712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); | 712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); |
713 | 713 | ||
714 | if ($siblingNode === $topCandidate) | 714 | if ($siblingNode === $topCandidate) |
715 | // or if ($siblingNode->isSameNode($topCandidate)) | 715 | // or if ($siblingNode->isSameNode($topCandidate)) |
716 | { | 716 | { |
717 | $append = true; | 717 | $append = true; |
718 | } | 718 | } |
719 | 719 | ||
720 | $contentBonus = 0; | 720 | $contentBonus = 0; |
721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ | 721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ |
722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { | 722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { |
723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; | 723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; |
724 | } | 724 | } |
725 | 725 | ||
726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) | 726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) |
727 | { | 727 | { |
728 | $append = true; | 728 | $append = true; |
729 | } | 729 | } |
730 | 730 | ||
731 | if (strtoupper($siblingNode->nodeName) == 'P') { | 731 | if (strtoupper($siblingNode->nodeName) == 'P') { |
732 | $linkDensity = $this->getLinkDensity($siblingNode); | 732 | $linkDensity = $this->getLinkDensity($siblingNode); |
733 | $nodeContent = $this->getInnerText($siblingNode); | 733 | $nodeContent = $this->getInnerText($siblingNode); |
734 | $nodeLength = strlen($nodeContent); | 734 | $nodeLength = strlen($nodeContent); |
735 | 735 | ||
736 | if ($nodeLength > 80 && $linkDensity < 0.25) | 736 | if ($nodeLength > 80 && $linkDensity < 0.25) |
737 | { | 737 | { |
738 | $append = true; | 738 | $append = true; |
739 | } | 739 | } |
740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) | 740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) |
741 | { | 741 | { |
742 | $append = true; | 742 | $append = true; |
743 | } | 743 | } |
744 | } | 744 | } |
745 | 745 | ||
746 | if ($append) | 746 | if ($append) |
747 | { | 747 | { |
748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); | 748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); |
749 | 749 | ||
750 | $nodeToAppend = null; | 750 | $nodeToAppend = null; |
751 | $sibNodeName = strtoupper($siblingNode->nodeName); | 751 | $sibNodeName = strtoupper($siblingNode->nodeName); |
752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { | 752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { |
753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | 753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
754 | 754 | ||
755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); | 755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); |
756 | $nodeToAppend = $this->dom->createElement('div'); | 756 | $nodeToAppend = $this->dom->createElement('div'); |
757 | try { | 757 | try { |
758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); | 758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); |
759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; | 759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; |
760 | } | 760 | } |
761 | catch(Exception $e) | 761 | catch(Exception $e) |
762 | { | 762 | { |
763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); | 763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); |
764 | $nodeToAppend = $siblingNode; | 764 | $nodeToAppend = $siblingNode; |
765 | $s--; | 765 | $s--; |
766 | $sl--; | 766 | $sl--; |
767 | } | 767 | } |
768 | } else { | 768 | } else { |
769 | $nodeToAppend = $siblingNode; | 769 | $nodeToAppend = $siblingNode; |
770 | $s--; | 770 | $s--; |
771 | $sl--; | 771 | $sl--; |
772 | } | 772 | } |
773 | 773 | ||
774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ | 774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ |
775 | $nodeToAppend->removeAttribute('class'); | 775 | $nodeToAppend->removeAttribute('class'); |
776 | 776 | ||
777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ | 777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ |
778 | $articleContent->appendChild($nodeToAppend); | 778 | $articleContent->appendChild($nodeToAppend); |
779 | } | 779 | } |
780 | } | 780 | } |
781 | 781 | ||
782 | /** | 782 | /** |
783 | * So we have all of the content that we need. Now we clean it up for presentation. | 783 | * So we have all of the content that we need. Now we clean it up for presentation. |
784 | **/ | 784 | **/ |
785 | $this->prepArticle($articleContent); | 785 | $this->prepArticle($articleContent); |
786 | 786 | ||
787 | /** | 787 | /** |
788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. | 788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. |
789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | 789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | 790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of |
791 | * finding the -right- content. | 791 | * finding the -right- content. |
792 | **/ | 792 | **/ |
793 | if (strlen($this->getInnerText($articleContent, false)) < 250) | 793 | if (strlen($this->getInnerText($articleContent, false)) < 250) |
794 | { | 794 | { |
795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 | 795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 |
796 | // in the meantime, we check and create an empty element if it's not there. | 796 | // in the meantime, we check and create an empty element if it's not there. |
797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); | 797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); |
798 | $this->body->innerHTML = $this->bodyCache; | 798 | $this->body->innerHTML = $this->bodyCache; |
799 | 799 | ||
800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { | 800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { |
801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); | 801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); |
802 | return $this->grabArticle($this->body); | 802 | return $this->grabArticle($this->body); |
803 | } | 803 | } |
804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | 804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { |
805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); | 805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); |
806 | return $this->grabArticle($this->body); | 806 | return $this->grabArticle($this->body); |
807 | } | 807 | } |
808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | 808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { |
809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); | 809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); |
810 | return $this->grabArticle($this->body); | 810 | return $this->grabArticle($this->body); |
811 | } | 811 | } |
812 | else { | 812 | else { |
813 | return false; | 813 | return false; |
814 | } | 814 | } |
815 | } | 815 | } |
816 | return $articleContent; | 816 | return $articleContent; |
817 | } | 817 | } |
818 | 818 | ||
819 | /** | 819 | /** |
820 | * Remove script tags from document | 820 | * Remove script tags from document |
821 | * | 821 | * |
822 | * @param DOMElement | 822 | * @param DOMElement |
823 | * @return void | 823 | * @return void |
824 | */ | 824 | */ |
825 | public function removeScripts($doc) { | 825 | public function removeScripts($doc) { |
826 | $scripts = $doc->getElementsByTagName('script'); | 826 | $scripts = $doc->getElementsByTagName('script'); |
827 | for($i = $scripts->length-1; $i >= 0; $i--) | 827 | for($i = $scripts->length-1; $i >= 0; $i--) |
828 | { | 828 | { |
829 | $scripts->item($i)->parentNode->removeChild($scripts->item($i)); | 829 | $scripts->item($i)->parentNode->removeChild($scripts->item($i)); |
830 | } | 830 | } |
831 | } | 831 | } |
832 | 832 | ||
833 | /** | 833 | /** |
834 | * Get the inner text of a node. | 834 | * Get the inner text of a node. |
835 | * This also strips out any excess whitespace to be found. | 835 | * This also strips out any excess whitespace to be found. |
836 | * | 836 | * |
837 | * @param DOMElement $ | 837 | * @param DOMElement $ |
838 | * @param boolean $normalizeSpaces (default: true) | 838 | * @param boolean $normalizeSpaces (default: true) |
839 | * @return string | 839 | * @return string |
840 | **/ | 840 | **/ |
841 | public function getInnerText($e, $normalizeSpaces=true) { | 841 | public function getInnerText($e, $normalizeSpaces=true) { |
842 | $textContent = ''; | 842 | $textContent = ''; |
843 | 843 | ||
844 | if (!isset($e->textContent) || $e->textContent == '') { | 844 | if (!isset($e->textContent) || $e->textContent == '') { |
845 | return ''; | 845 | return ''; |
846 | } | 846 | } |
847 | 847 | ||
848 | $textContent = trim($e->textContent); | 848 | $textContent = trim($e->textContent); |
849 | 849 | ||
850 | if ($normalizeSpaces) { | 850 | if ($normalizeSpaces) { |
851 | return preg_replace($this->regexps['normalize'], ' ', $textContent); | 851 | return preg_replace($this->regexps['normalize'], ' ', $textContent); |
852 | } else { | 852 | } else { |
853 | return $textContent; | 853 | return $textContent; |
854 | } | 854 | } |
855 | } | 855 | } |
856 | 856 | ||
857 | /** | 857 | /** |
858 | * Get the number of times a string $s appears in the node $e. | 858 | * Get the number of times a string $s appears in the node $e. |
859 | * | 859 | * |
860 | * @param DOMElement $e | 860 | * @param DOMElement $e |
861 | * @param string - what to count. Default is "," | 861 | * @param string - what to count. Default is "," |
862 | * @return number (integer) | 862 | * @return number (integer) |
863 | **/ | 863 | **/ |
864 | public function getCharCount($e, $s=',') { | 864 | public function getCharCount($e, $s=',') { |
865 | return substr_count($this->getInnerText($e), $s); | 865 | return substr_count($this->getInnerText($e), $s); |
866 | } | 866 | } |
867 | 867 | ||
868 | /** | 868 | /** |
869 | * Remove the style attribute on every $e and under. | 869 | * Remove the style attribute on every $e and under. |
870 | * | 870 | * |
871 | * @param DOMElement $e | 871 | * @param DOMElement $e |
872 | * @return void | 872 | * @return void |
873 | */ | 873 | */ |
874 | public function cleanStyles($e) { | 874 | public function cleanStyles($e) { |
875 | if (!is_object($e)) return; | 875 | if (!is_object($e)) return; |
876 | $elems = $e->getElementsByTagName('*'); | 876 | $elems = $e->getElementsByTagName('*'); |
877 | foreach ($elems as $elem) { | 877 | foreach ($elems as $elem) { |
878 | $elem->removeAttribute('style'); | 878 | $elem->removeAttribute('style'); |
879 | } | 879 | } |
880 | } | 880 | } |
881 | 881 | ||
882 | /** | 882 | /** |
883 | * Get the density of links as a percentage of the content | 883 | * Get the density of links as a percentage of the content |
884 | * This is the amount of text that is inside a link divided by the total text in the node. | 884 | * This is the amount of text that is inside a link divided by the total text in the node. |
885 | * | 885 | * |
886 | * @param DOMElement $e | 886 | * @param DOMElement $e |
887 | * @return number (float) | 887 | * @return number (float) |
888 | */ | 888 | */ |
889 | public function getLinkDensity($e) { | 889 | public function getLinkDensity($e) { |
890 | $links = $e->getElementsByTagName('a'); | 890 | $links = $e->getElementsByTagName('a'); |
891 | $textLength = strlen($this->getInnerText($e)); | 891 | $textLength = strlen($this->getInnerText($e)); |
892 | $linkLength = 0; | 892 | $linkLength = 0; |
893 | for ($i=0, $il=$links->length; $i < $il; $i++) | 893 | for ($i=0, $il=$links->length; $i < $il; $i++) |
894 | { | 894 | { |
895 | $linkLength += strlen($this->getInnerText($links->item($i))); | 895 | $linkLength += strlen($this->getInnerText($links->item($i))); |
896 | } | 896 | } |
897 | if ($textLength > 0) { | 897 | if ($textLength > 0) { |
898 | return $linkLength / $textLength; | 898 | return $linkLength / $textLength; |
899 | } else { | 899 | } else { |
900 | return 0; | 900 | return 0; |
901 | } | 901 | } |
902 | } | 902 | } |
903 | 903 | ||
904 | /** | 904 | /** |
905 | * Get an elements class/id weight. Uses regular expressions to tell if this | 905 | * Get an elements class/id weight. Uses regular expressions to tell if this |
906 | * element looks good or bad. | 906 | * element looks good or bad. |
907 | * | 907 | * |
908 | * @param DOMElement $e | 908 | * @param DOMElement $e |
909 | * @return number (Integer) | 909 | * @return number (Integer) |
910 | */ | 910 | */ |
911 | public function getClassWeight($e) { | 911 | public function getClassWeight($e) { |
912 | if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | 912 | if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { |
913 | return 0; | 913 | return 0; |
914 | } | 914 | } |
915 | 915 | ||
916 | $weight = 0; | 916 | $weight = 0; |
917 | 917 | ||
918 | /* Look for a special classname */ | 918 | /* Look for a special classname */ |
919 | if ($e->hasAttribute('class') && $e->getAttribute('class') != '') | 919 | if ($e->hasAttribute('class') && $e->getAttribute('class') != '') |
920 | { | 920 | { |
921 | if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { | 921 | if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { |
922 | $weight -= 25; | 922 | $weight -= 25; |
923 | } | 923 | } |
924 | if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { | 924 | if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { |
925 | $weight += 25; | 925 | $weight += 25; |
926 | } | 926 | } |
927 | } | 927 | } |
928 | 928 | ||
929 | /* Look for a special ID */ | 929 | /* Look for a special ID */ |
930 | if ($e->hasAttribute('id') && $e->getAttribute('id') != '') | 930 | if ($e->hasAttribute('id') && $e->getAttribute('id') != '') |
931 | { | 931 | { |
932 | if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { | 932 | if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { |
933 | $weight -= 25; | 933 | $weight -= 25; |
934 | } | 934 | } |
935 | if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { | 935 | if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { |
936 | $weight += 25; | 936 | $weight += 25; |
937 | } | 937 | } |
938 | } | 938 | } |
939 | return $weight; | 939 | return $weight; |
940 | } | 940 | } |
941 | 941 | ||
942 | /** | 942 | /** |
943 | * Remove extraneous break tags from a node. | 943 | * Remove extraneous break tags from a node. |
944 | * | 944 | * |
945 | * @param DOMElement $node | 945 | * @param DOMElement $node |
946 | * @return void | 946 | * @return void |
947 | */ | 947 | */ |
948 | public function killBreaks($node) { | 948 | public function killBreaks($node) { |
949 | $html = $node->innerHTML; | 949 | $html = $node->innerHTML; |
950 | $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); | 950 | $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); |
951 | $node->innerHTML = $html; | 951 | $node->innerHTML = $html; |
952 | } | 952 | } |
953 | 953 | ||
954 | /** | 954 | /** |
955 | * Clean a node of all elements of type "tag". | 955 | * Clean a node of all elements of type "tag". |
956 | * (Unless it's a youtube/vimeo video. People love movies.) | 956 | * (Unless it's a youtube/vimeo video. People love movies.) |
957 | * | 957 | * |
958 | * Updated 2012-09-18 to preserve youtube/vimeo iframes | 958 | * Updated 2012-09-18 to preserve youtube/vimeo iframes |
959 | * | 959 | * |
960 | * @param DOMElement $e | 960 | * @param DOMElement $e |
961 | * @param string $tag | 961 | * @param string $tag |
962 | * @return void | 962 | * @return void |
963 | */ | 963 | */ |
964 | public function clean($e, $tag) { | 964 | public function clean($e, $tag) { |
965 | $targetList = $e->getElementsByTagName($tag); | 965 | $targetList = $e->getElementsByTagName($tag); |
966 | $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); | 966 | $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); |
967 | 967 | ||
968 | for ($y=$targetList->length-1; $y >= 0; $y--) { | 968 | for ($y=$targetList->length-1; $y >= 0; $y--) { |
969 | /* Allow youtube and vimeo videos through as people usually want to see those. */ | 969 | /* Allow youtube and vimeo videos through as people usually want to see those. */ |
970 | if ($isEmbed) { | 970 | if ($isEmbed) { |
971 | $attributeValues = ''; | 971 | $attributeValues = ''; |
972 | for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { | 972 | for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { |
973 | $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) | 973 | $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) |
974 | } | 974 | } |
975 | 975 | ||
976 | /* First, check the elements attributes to see if any of them contain youtube or vimeo */ | 976 | /* First, check the elements attributes to see if any of them contain youtube or vimeo */ |
977 | if (preg_match($this->regexps['video'], $attributeValues)) { | 977 | if (preg_match($this->regexps['video'], $attributeValues)) { |
978 | continue; | 978 | continue; |
979 | } | 979 | } |
980 | 980 | ||
981 | /* Then check the elements inside this element for the same. */ | 981 | /* Then check the elements inside this element for the same. */ |
982 | if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { | 982 | if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { |
983 | continue; | 983 | continue; |
984 | } | 984 | } |
985 | } | 985 | } |
986 | $targetList->item($y)->parentNode->removeChild($targetList->item($y)); | 986 | $targetList->item($y)->parentNode->removeChild($targetList->item($y)); |
987 | } | 987 | } |
988 | } | 988 | } |
989 | 989 | ||
990 | /** | 990 | /** |
991 | * Clean an element of all tags of type "tag" if they look fishy. | 991 | * Clean an element of all tags of type "tag" if they look fishy. |
992 | * "Fishy" is an algorithm based on content length, classnames, | 992 | * "Fishy" is an algorithm based on content length, classnames, |
993 | * link density, number of images & embeds, etc. | 993 | * link density, number of images & embeds, etc. |
994 | * | 994 | * |
995 | * @param DOMElement $e | 995 | * @param DOMElement $e |
996 | * @param string $tag | 996 | * @param string $tag |
997 | * @return void | 997 | * @return void |
998 | */ | 998 | */ |
999 | public function cleanConditionally($e, $tag) { | 999 | public function cleanConditionally($e, $tag) { |
1000 | if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | 1000 | if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { |
1001 | return; | 1001 | return; |
1002 | } | 1002 | } |
1003 | 1003 | ||
1004 | $tagsList = $e->getElementsByTagName($tag); | 1004 | $tagsList = $e->getElementsByTagName($tag); |
1005 | $curTagsLength = $tagsList->length; | 1005 | $curTagsLength = $tagsList->length; |
1006 | 1006 | ||
1007 | /** | 1007 | /** |
1008 | * Gather counts for other typical elements embedded within. | 1008 | * Gather counts for other typical elements embedded within. |
1009 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal. | 1009 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal. |
1010 | * | 1010 | * |
1011 | * TODO: Consider taking into account original contentScore here. | 1011 | * TODO: Consider taking into account original contentScore here. |
1012 | */ | 1012 | */ |
1013 | for ($i=$curTagsLength-1; $i >= 0; $i--) { | 1013 | for ($i=$curTagsLength-1; $i >= 0; $i--) { |
1014 | $weight = $this->getClassWeight($tagsList->item($i)); | 1014 | $weight = $this->getClassWeight($tagsList->item($i)); |
1015 | $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; | 1015 | $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; |
1016 | 1016 | ||
1017 | $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); | 1017 | $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); |
1018 | 1018 | ||
1019 | if ($weight + $contentScore < 0) { | 1019 | if ($weight + $contentScore < 0) { |
1020 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); | 1020 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); |
1021 | } | 1021 | } |
1022 | else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { | 1022 | else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { |
1023 | /** | 1023 | /** |
1024 | * If there are not very many commas, and the number of | 1024 | * If there are not very many commas, and the number of |
1025 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. | 1025 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. |
1026 | **/ | 1026 | **/ |
1027 | $p = $tagsList->item($i)->getElementsByTagName('p')->length; | 1027 | $p = $tagsList->item($i)->getElementsByTagName('p')->length; |
1028 | $img = $tagsList->item($i)->getElementsByTagName('img')->length; | 1028 | $img = $tagsList->item($i)->getElementsByTagName('img')->length; |
1029 | $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; | 1029 | $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; |
1030 | $input = $tagsList->item($i)->getElementsByTagName('input')->length; | 1030 | $input = $tagsList->item($i)->getElementsByTagName('input')->length; |
1031 | $a = $tagsList->item($i)->getElementsByTagName('a')->length; | 1031 | $a = $tagsList->item($i)->getElementsByTagName('a')->length; |
1032 | 1032 | ||
1033 | $embedCount = 0; | 1033 | $embedCount = 0; |
1034 | $embeds = $tagsList->item($i)->getElementsByTagName('embed'); | 1034 | $embeds = $tagsList->item($i)->getElementsByTagName('embed'); |
1035 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { | 1035 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { |
1036 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { | 1036 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { |
1037 | $embedCount++; | 1037 | $embedCount++; |
1038 | } | 1038 | } |
1039 | } | 1039 | } |
1040 | $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); | 1040 | $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); |
1041 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { | 1041 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { |
1042 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { | 1042 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { |
1043 | $embedCount++; | 1043 | $embedCount++; |
1044 | } | 1044 | } |
1045 | } | 1045 | } |
1046 | 1046 | ||
1047 | $linkDensity = $this->getLinkDensity($tagsList->item($i)); | 1047 | $linkDensity = $this->getLinkDensity($tagsList->item($i)); |
1048 | $contentLength = strlen($this->getInnerText($tagsList->item($i))); | 1048 | $contentLength = strlen($this->getInnerText($tagsList->item($i))); |
1049 | $toRemove = false; | 1049 | $toRemove = false; |
1050 | 1050 | ||
1051 | if ($this->lightClean) { | 1051 | if ($this->lightClean) { |
1052 | $this->dbg('Light clean...'); | 1052 | $this->dbg('Light clean...'); |
1053 | if ( ($img > $p) && ($img > 4) ) { | 1053 | if ( ($img > $p) && ($img > 4) ) { |
1054 | $this->dbg(' more than 4 images and more image elements than paragraph elements'); | 1054 | $this->dbg(' more than 4 images and more image elements than paragraph elements'); |
1055 | $toRemove = true; | 1055 | $toRemove = true; |
1056 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { | 1056 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { |
1057 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); | 1057 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); |
1058 | $toRemove = true; | 1058 | $toRemove = true; |
1059 | } else if ( $input > floor($p/3) ) { | 1059 | } else if ( $input > floor($p/3) ) { |
1060 | $this->dbg(' too many <input> elements'); | 1060 | $this->dbg(' too many <input> elements'); |
1061 | $toRemove = true; | 1061 | $toRemove = true; |
1062 | } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { | 1062 | } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) { |
1063 | $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); | 1063 | $this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images'); |
1064 | $toRemove = true; | 1064 | $toRemove = true; |
1065 | } else if($weight < 25 && $linkDensity > 0.2) { | 1065 | } else if($weight < 25 && $linkDensity > 0.2) { |
1066 | $this->dbg(' weight smaller than 25 and link density above 0.2'); | 1066 | $this->dbg(' weight smaller than 25 and link density above 0.2'); |
1067 | $toRemove = true; | 1067 | $toRemove = true; |
1068 | } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { | 1068 | } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { |
1069 | $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); | 1069 | $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); |
1070 | $toRemove = true; | 1070 | $toRemove = true; |
1071 | } else if($embedCount > 3) { | 1071 | } else if($embedCount > 3) { |
1072 | $this->dbg(' more than 3 embeds'); | 1072 | $this->dbg(' more than 3 embeds'); |
1073 | $toRemove = true; | 1073 | $toRemove = true; |
1074 | } | 1074 | } |
1075 | } else { | 1075 | } else { |
1076 | $this->dbg('Standard clean...'); | 1076 | $this->dbg('Standard clean...'); |
1077 | if ( $img > $p ) { | 1077 | if ( $img > $p ) { |
1078 | $this->dbg(' more image elements than paragraph elements'); | 1078 | $this->dbg(' more image elements than paragraph elements'); |
1079 | $toRemove = true; | 1079 | $toRemove = true; |
1080 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { | 1080 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { |
1081 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); | 1081 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); |
1082 | $toRemove = true; | 1082 | $toRemove = true; |
1083 | } else if ( $input > floor($p/3) ) { | 1083 | } else if ( $input > floor($p/3) ) { |
1084 | $this->dbg(' too many <input> elements'); | 1084 | $this->dbg(' too many <input> elements'); |
1085 | $toRemove = true; | 1085 | $toRemove = true; |
1086 | } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { | 1086 | } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { |
1087 | $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); | 1087 | $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); |
1088 | $toRemove = true; | 1088 | $toRemove = true; |
1089 | } else if($weight < 25 && $linkDensity > 0.2) { | 1089 | } else if($weight < 25 && $linkDensity > 0.2) { |
1090 | $this->dbg(' weight smaller than 25 and link density above 0.2'); | 1090 | $this->dbg(' weight smaller than 25 and link density above 0.2'); |
1091 | $toRemove = true; | 1091 | $toRemove = true; |
1092 | } else if($weight >= 25 && $linkDensity > 0.5) { | 1092 | } else if($weight >= 25 && $linkDensity > 0.5) { |
1093 | $this->dbg(' weight above 25 but link density greater than 0.5'); | 1093 | $this->dbg(' weight above 25 but link density greater than 0.5'); |
1094 | $toRemove = true; | 1094 | $toRemove = true; |
1095 | } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { | 1095 | } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { |
1096 | $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); | 1096 | $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); |
1097 | $toRemove = true; | 1097 | $toRemove = true; |
1098 | } | 1098 | } |
1099 | } | 1099 | } |
1100 | 1100 | ||
1101 | if ($toRemove) { | 1101 | if ($toRemove) { |
1102 | //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); | 1102 | //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); |
1103 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); | 1103 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); |
1104 | } | 1104 | } |
1105 | } | 1105 | } |
1106 | } | 1106 | } |
1107 | } | 1107 | } |
1108 | 1108 | ||
1109 | /** | 1109 | /** |
1110 | * Clean out spurious headers from an Element. Checks things like classnames and link density. | 1110 | * Clean out spurious headers from an Element. Checks things like classnames and link density. |
1111 | * | 1111 | * |
1112 | * @param DOMElement $e | 1112 | * @param DOMElement $e |
1113 | * @return void | 1113 | * @return void |
1114 | */ | 1114 | */ |
1115 | public function cleanHeaders($e) { | 1115 | public function cleanHeaders($e) { |
1116 | for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { | 1116 | for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { |
1117 | $headers = $e->getElementsByTagName('h' . $headerIndex); | 1117 | $headers = $e->getElementsByTagName('h' . $headerIndex); |
1118 | for ($i=$headers->length-1; $i >=0; $i--) { | 1118 | for ($i=$headers->length-1; $i >=0; $i--) { |
1119 | if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { | 1119 | if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { |
1120 | $headers->item($i)->parentNode->removeChild($headers->item($i)); | 1120 | $headers->item($i)->parentNode->removeChild($headers->item($i)); |
1121 | } | 1121 | } |
1122 | } | 1122 | } |
1123 | } | 1123 | } |
1124 | } | 1124 | } |
1125 | 1125 | ||
1126 | public function flagIsActive($flag) { | 1126 | public function flagIsActive($flag) { |
1127 | return ($this->flags & $flag) > 0; | 1127 | return ($this->flags & $flag) > 0; |
1128 | } | 1128 | } |
1129 | 1129 | ||
1130 | public function addFlag($flag) { | 1130 | public function addFlag($flag) { |
1131 | $this->flags = $this->flags | $flag; | 1131 | $this->flags = $this->flags | $flag; |
1132 | } | 1132 | } |
1133 | 1133 | ||
1134 | public function removeFlag($flag) { | 1134 | public function removeFlag($flag) { |
1135 | $this->flags = $this->flags & ~$flag; | 1135 | $this->flags = $this->flags & ~$flag; |
1136 | } | 1136 | } |
1137 | } | 1137 | } |
1138 | ?> \ No newline at end of file | 1138 | ?> \ No newline at end of file |
diff --git a/inc/3rdparty/makefulltextfeed.php b/inc/3rdparty/makefulltextfeed.php index 135964f1..7a56be8c 100755 --- a/inc/3rdparty/makefulltextfeed.php +++ b/inc/3rdparty/makefulltextfeed.php | |||
@@ -3,8 +3,8 @@ | |||
3 | // Author: Keyvan Minoukadeh | 3 | // Author: Keyvan Minoukadeh |
4 | // Copyright (c) 2013 Keyvan Minoukadeh | 4 | // Copyright (c) 2013 Keyvan Minoukadeh |
5 | // License: AGPLv3 | 5 | // License: AGPLv3 |
6 | // Version: 3.1 | 6 | // Version: 3.2 |
7 | // Date: 2013-03-05 | 7 | // Date: 2013-05-13 |
8 | // More info: http://fivefilters.org/content-only/ | 8 | // More info: http://fivefilters.org/content-only/ |
9 | // Help: http://help.fivefilters.org | 9 | // Help: http://help.fivefilters.org |
10 | 10 | ||
@@ -25,12 +25,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. | |||
25 | 25 | ||
26 | // Usage | 26 | // Usage |
27 | // ----- | 27 | // ----- |
28 | // Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org | 28 | // Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article |
29 | // The following options can be passed in the querystring: | 29 | // For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage |
30 | // * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url)) | ||
31 | // * URL points to HTML (not feed): html=true (optional, by default it's automatically detected) | ||
32 | // * API key: key=[api key] (optional, refer to config.php) | ||
33 | // * Max entries to process: max=[max number of items] (optional) | ||
34 | 30 | ||
35 | error_reporting(E_ALL ^ E_NOTICE); | 31 | error_reporting(E_ALL ^ E_NOTICE); |
36 | ini_set("display_errors", 1); | 32 | ini_set("display_errors", 1); |
@@ -76,8 +72,8 @@ header('X-Robots-Tag: noindex, nofollow'); | |||
76 | //////////////////////////////// | 72 | //////////////////////////////// |
77 | // Check if service is enabled | 73 | // Check if service is enabled |
78 | //////////////////////////////// | 74 | //////////////////////////////// |
79 | if (!$options->enabled) { | 75 | if (!$options->enabled) { |
80 | die('The full-text RSS service is currently disabled'); | 76 | die('The full-text RSS service is currently disabled'); |
81 | } | 77 | } |
82 | 78 | ||
83 | //////////////////////////////// | 79 | //////////////////////////////// |
@@ -121,8 +117,8 @@ $options->smart_cache = $options->smart_cache && function_exists('apc_inc'); | |||
121 | //////////////////////////////// | 117 | //////////////////////////////// |
122 | // Check for feed URL | 118 | // Check for feed URL |
123 | //////////////////////////////// | 119 | //////////////////////////////// |
124 | if (!isset($_GET['url'])) { | 120 | if (!isset($_GET['url'])) { |
125 | die('No URL supplied'); | 121 | die('No URL supplied'); |
126 | } | 122 | } |
127 | $url = trim($_GET['url']); | 123 | $url = trim($_GET['url']); |
128 | if (strtolower(substr($url, 0, 7)) == 'feed://') { | 124 | if (strtolower(substr($url, 0, 7)) == 'feed://') { |
@@ -161,10 +157,12 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap | |||
161 | if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']); | 157 | if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']); |
162 | if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']); | 158 | if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']); |
163 | if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); | 159 | if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); |
164 | if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']); | 160 | if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']); |
165 | if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); | 161 | if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); |
166 | if (isset($_GET['xss'])) $redirect .= '&xss'; | 162 | if (isset($_GET['xss'])) $redirect .= '&xss'; |
167 | if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; | 163 | if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; |
164 | if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']); | ||
165 | if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']); | ||
168 | if (isset($_GET['debug'])) $redirect .= '&debug'; | 166 | if (isset($_GET['debug'])) $redirect .= '&debug'; |
169 | if ($debug_mode) { | 167 | if ($debug_mode) { |
170 | debug('Redirecting to hide access key, follow URL below to continue'); | 168 | debug('Redirecting to hide access key, follow URL below to continue'); |
@@ -177,7 +175,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap | |||
177 | 175 | ||
178 | /////////////////////////////////////////////// | 176 | /////////////////////////////////////////////// |
179 | // Set timezone. | 177 | // Set timezone. |
180 | // Prevents warnings, but needs more testing - | 178 | // Prevents warnings, but needs more testing - |
181 | // perhaps if timezone is set in php.ini we | 179 | // perhaps if timezone is set in php.ini we |
182 | // don't need to set it at all... | 180 | // don't need to set it at all... |
183 | /////////////////////////////////////////////// | 181 | /////////////////////////////////////////////// |
@@ -199,7 +197,7 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int | |||
199 | } | 197 | } |
200 | $key_index = ($valid_key) ? (int)$_GET['key'] : 0; | 198 | $key_index = ($valid_key) ? (int)$_GET['key'] : 0; |
201 | if (!$valid_key && $options->key_required) { | 199 | if (!$valid_key && $options->key_required) { |
202 | die('A valid key must be supplied'); | 200 | die('A valid key must be supplied'); |
203 | } | 201 | } |
204 | if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') { | 202 | if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') { |
205 | die('The entered key is invalid'); | 203 | die('The entered key is invalid'); |
@@ -251,6 +249,28 @@ if ($options->favour_feed_titles == 'user') { | |||
251 | } | 249 | } |
252 | 250 | ||
253 | /////////////////////////////////////////////// | 251 | /////////////////////////////////////////////// |
252 | // Include full content in output? | ||
253 | /////////////////////////////////////////////// | ||
254 | if ($options->content === 'user') { | ||
255 | if (isset($_GET['content']) && $_GET['content'] === '0') { | ||
256 | $options->content = false; | ||
257 | } else { | ||
258 | $options->content = true; | ||
259 | } | ||
260 | } | ||
261 | |||
262 | /////////////////////////////////////////////// | ||
263 | // Include summaries in output? | ||
264 | /////////////////////////////////////////////// | ||
265 | if ($options->summary === 'user') { | ||
266 | if (isset($_GET['summary']) && $_GET['summary'] === '1') { | ||
267 | $options->summary = true; | ||
268 | } else { | ||
269 | $options->summary = false; | ||
270 | } | ||
271 | } | ||
272 | |||
273 | /////////////////////////////////////////////// | ||
254 | // Exclude items if extraction fails | 274 | // Exclude items if extraction fails |
255 | /////////////////////////////////////////////// | 275 | /////////////////////////////////////////////// |
256 | if ($options->exclude_items_on_fail === 'user') { | 276 | if ($options->exclude_items_on_fail === 'user') { |
@@ -272,15 +292,6 @@ if ($options->detect_language === 'user') { | |||
272 | $detect_language = $options->detect_language; | 292 | $detect_language = $options->detect_language; |
273 | } | 293 | } |
274 | 294 | ||
275 | if ($detect_language >= 2) { | ||
276 | $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg', | ||
277 | 'cebuano' => 'ceb', // ISO 639-2 | ||
278 | 'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha', | ||
279 | 'hawaiian' => 'haw', // ISO 639-2 | ||
280 | 'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps', | ||
281 | 'pidgin' => 'cpe', // ISO 639-2 | ||
282 | 'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy'); | ||
283 | } | ||
284 | $use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); | 295 | $use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); |
285 | 296 | ||
286 | ///////////////////////////////////// | 297 | ///////////////////////////////////// |
@@ -330,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *'); | |||
330 | ////////////////////////////////// | 341 | ////////////////////////////////// |
331 | if ($options->caching) { | 342 | if ($options->caching) { |
332 | debug('Caching is enabled...'); | 343 | debug('Caching is enabled...'); |
333 | $cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); | 344 | $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); |
334 | $check_cache = true; | 345 | $check_cache = true; |
335 | if ($options->apc && $options->smart_cache) { | 346 | if ($options->apc && $options->smart_cache) { |
336 | apc_add("cache.$cache_id", 0, 10*60); | 347 | apc_add("cache.$cache_id", 0, 10*60); |
@@ -468,7 +479,7 @@ if ($img_url = $feed->get_image_url()) { | |||
468 | //////////////////////////////////////////// | 479 | //////////////////////////////////////////// |
469 | // Loop through feed items | 480 | // Loop through feed items |
470 | //////////////////////////////////////////// | 481 | //////////////////////////////////////////// |
471 | $items = $feed->get_items(0, $max); | 482 | $items = $feed->get_items(0, $max); |
472 | // Request all feed items in parallel (if supported) | 483 | // Request all feed items in parallel (if supported) |
473 | $urls_sanitized = array(); | 484 | $urls_sanitized = array(); |
474 | $urls = array(); | 485 | $urls = array(); |
@@ -550,24 +561,43 @@ foreach ($items as $key => $item) { | |||
550 | $is_single_page = false; | 561 | $is_single_page = false; |
551 | if ($single_page_response = getSinglePage($item, $html, $effective_url)) { | 562 | if ($single_page_response = getSinglePage($item, $html, $effective_url)) { |
552 | $is_single_page = true; | 563 | $is_single_page = true; |
553 | $html = $single_page_response['body']; | ||
554 | // remove strange things | ||
555 | $html = str_replace('</[>', '', $html); | ||
556 | $html = convert_to_utf8($html, $single_page_response['headers']); | ||
557 | $effective_url = $single_page_response['effective_url']; | 564 | $effective_url = $single_page_response['effective_url']; |
558 | debug("Retrieved single-page view from $effective_url"); | 565 | // check if action defined for returned Content-Type |
566 | $mime_info = get_mime_action_info($single_page_response['headers']); | ||
567 | if (isset($mime_info['action'])) { | ||
568 | if ($mime_info['action'] == 'exclude') { | ||
569 | continue; // skip this feed item entry | ||
570 | } elseif ($mime_info['action'] == 'link') { | ||
571 | if ($mime_info['type'] == 'image') { | ||
572 | $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>"; | ||
573 | } else { | ||
574 | $html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>"; | ||
575 | } | ||
576 | $extracted_title = $mime_info['name']; | ||
577 | $do_content_extraction = false; | ||
578 | } | ||
579 | } | ||
580 | if ($do_content_extraction) { | ||
581 | $html = $single_page_response['body']; | ||
582 | // remove strange things | ||
583 | $html = str_replace('</[>', '', $html); | ||
584 | $html = convert_to_utf8($html, $single_page_response['headers']); | ||
585 | debug("Retrieved single-page view from $effective_url"); | ||
586 | } | ||
559 | unset($single_page_response); | 587 | unset($single_page_response); |
560 | } | 588 | } |
589 | } | ||
590 | if ($do_content_extraction) { | ||
561 | debug('--------'); | 591 | debug('--------'); |
562 | debug('Attempting to extract content'); | 592 | debug('Attempting to extract content'); |
563 | $extract_result = $extractor->process($html, $effective_url); | 593 | $extract_result = $extractor->process($html, $effective_url); |
564 | $readability = $extractor->readability; | 594 | $readability = $extractor->readability; |
565 | $content_block = ($extract_result) ? $extractor->getContent() : null; | 595 | $content_block = ($extract_result) ? $extractor->getContent() : null; |
566 | $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; | 596 | $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; |
567 | // Deal with multi-page articles | 597 | // Deal with multi-page articles |
568 | //die('Next: '.$extractor->getNextPageUrl()); | 598 | //die('Next: '.$extractor->getNextPageUrl()); |
569 | $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); | 599 | $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); |
570 | if ($options->multipage && $is_multi_page) { | 600 | if ($options->multipage && $is_multi_page && $options->content) { |
571 | debug('--------'); | 601 | debug('--------'); |
572 | debug('Attempting to process multi-page article'); | 602 | debug('Attempting to process multi-page article'); |
573 | $multi_page_urls = array(); | 603 | $multi_page_urls = array(); |
@@ -580,7 +610,7 @@ foreach ($items as $key => $item) { | |||
580 | // check it's not what we have already! | 610 | // check it's not what we have already! |
581 | if (!in_array($next_page_url, $multi_page_urls)) { | 611 | if (!in_array($next_page_url, $multi_page_urls)) { |
582 | // it's not, so let's attempt to fetch it | 612 | // it's not, so let's attempt to fetch it |
583 | $multi_page_urls[] = $next_page_url; | 613 | $multi_page_urls[] = $next_page_url; |
584 | $_prev_ref = $http->referer; | 614 | $_prev_ref = $http->referer; |
585 | if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) { | 615 | if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) { |
586 | // make sure mime type is not something with a different action associated | 616 | // make sure mime type is not something with a different action associated |
@@ -605,13 +635,15 @@ foreach ($items as $key => $item) { | |||
605 | // did we successfully deal with this multi-page article? | 635 | // did we successfully deal with this multi-page article? |
606 | if (empty($multi_page_content)) { | 636 | if (empty($multi_page_content)) { |
607 | debug('Failed to extract all parts of multi-page article, so not going to include them'); | 637 | debug('Failed to extract all parts of multi-page article, so not going to include them'); |
608 | $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>'; | 638 | $_page = $readability->dom->createElement('p'); |
639 | $_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>'; | ||
640 | $multi_page_content[] = $_page; | ||
609 | } | 641 | } |
610 | foreach ($multi_page_content as $_page) { | 642 | foreach ($multi_page_content as $_page) { |
611 | $_page = $content_block->ownerDocument->importNode($_page, true); | 643 | $_page = $content_block->ownerDocument->importNode($_page, true); |
612 | $content_block->appendChild($_page); | 644 | $content_block->appendChild($_page); |
613 | } | 645 | } |
614 | unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url); | 646 | unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page); |
615 | } | 647 | } |
616 | } | 648 | } |
617 | // use extracted title for both feed and item title if we're using single-item dummy feed | 649 | // use extracted title for both feed and item title if we're using single-item dummy feed |
@@ -658,7 +690,7 @@ foreach ($items as $key => $item) { | |||
658 | } else { | 690 | } else { |
659 | $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML | 691 | $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML |
660 | } | 692 | } |
661 | unset($content_block); | 693 | //unset($content_block); |
662 | // post-processing cleanup | 694 | // post-processing cleanup |
663 | $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html); | 695 | $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html); |
664 | if ($links == 'remove') { | 696 | if ($links == 'remove') { |
@@ -671,130 +703,155 @@ foreach ($items as $key => $item) { | |||
671 | } | 703 | } |
672 | } | 704 | } |
673 | 705 | ||
674 | if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment | 706 | if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment |
675 | $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); | 707 | $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); |
708 | } else { | ||
709 | $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); | ||
710 | } | ||
711 | // filter xss? | ||
712 | if ($xss_filter) { | ||
713 | debug('Filtering HTML to remove XSS'); | ||
714 | $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); | ||
715 | } | ||
716 | |||
717 | // add content | ||
718 | if ($options->summary === true) { | ||
719 | // get summary | ||
720 | $summary = ''; | ||
721 | if (!$do_content_extraction) { | ||
722 | $summary = $html; | ||
676 | } else { | 723 | } else { |
677 | $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); | 724 | // Try to get first few paragraphs |
678 | } | 725 | if (isset($content_block) && ($content_block instanceof DOMElement)) { |
679 | // filter xss? | 726 | $_paras = $content_block->getElementsByTagName('p'); |
680 | if ($xss_filter) { | 727 | foreach ($_paras as $_para) { |
681 | debug('Filtering HTML to remove XSS'); | 728 | $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' '; |
682 | $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); | 729 | if (strlen($summary) > 200) break; |
683 | } | ||
684 | $newitem->setDescription($html); | ||
685 | |||
686 | // set date | ||
687 | if ((int)$item->get_date('U') > 0) { | ||
688 | $newitem->setDate((int)$item->get_date('U')); | ||
689 | } elseif ($extractor->getDate()) { | ||
690 | $newitem->setDate($extractor->getDate()); | ||
691 | } | ||
692 | |||
693 | // add authors | ||
694 | if ($authors = $item->get_authors()) { | ||
695 | foreach ($authors as $author) { | ||
696 | // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel | ||
697 | if ($author->get_name() !== null) { | ||
698 | $newitem->addElement('dc:creator', $author->get_name()); | ||
699 | } elseif ($author->get_email() !== null) { | ||
700 | $newitem->addElement('dc:creator', $author->get_email()); | ||
701 | } | 730 | } |
731 | } else { | ||
732 | $summary = $html; | ||
702 | } | 733 | } |
703 | } elseif ($authors = $extractor->getAuthors()) { | 734 | } |
704 | //TODO: make sure the list size is reasonable | 735 | unset($_paras, $_para); |
705 | foreach ($authors as $author) { | 736 | $summary = get_excerpt($summary); |
706 | // TODO: xpath often selects authors from other articles linked from the page. | 737 | $newitem->setDescription($summary); |
707 | // for now choose first item | 738 | if ($options->content) $newitem->setElement('content:encoded', $html); |
708 | $newitem->addElement('dc:creator', $author); | 739 | } else { |
709 | break; | 740 | if ($options->content) $newitem->setDescription($html); |
741 | } | ||
742 | |||
743 | // set date | ||
744 | if ((int)$item->get_date('U') > 0) { | ||
745 | $newitem->setDate((int)$item->get_date('U')); | ||
746 | } elseif ($extractor->getDate()) { | ||
747 | $newitem->setDate($extractor->getDate()); | ||
748 | } | ||
749 | |||
750 | // add authors | ||
751 | if ($authors = $item->get_authors()) { | ||
752 | foreach ($authors as $author) { | ||
753 | // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel | ||
754 | if ($author->get_name() !== null) { | ||
755 | $newitem->addElement('dc:creator', $author->get_name()); | ||
756 | } elseif ($author->get_email() !== null) { | ||
757 | $newitem->addElement('dc:creator', $author->get_email()); | ||
710 | } | 758 | } |
711 | } | 759 | } |
712 | 760 | } elseif ($authors = $extractor->getAuthors()) { | |
713 | // add language | 761 | //TODO: make sure the list size is reasonable |
714 | if ($detect_language) { | 762 | foreach ($authors as $author) { |
715 | $language = $extractor->getLanguage(); | 763 | // TODO: xpath often selects authors from other articles linked from the page. |
716 | if (!$language) $language = $feed->get_language(); | 764 | // for now choose first item |
717 | if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { | 765 | $newitem->addElement('dc:creator', $author); |
718 | try { | 766 | break; |
719 | if ($use_cld) { | 767 | } |
720 | // Use PHP-CLD extension | 768 | } |
721 | $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error | 769 | |
722 | $res = $php_cld($text_sample); | 770 | // add language |
723 | if (is_array($res) && count($res) > 0) { | 771 | if ($detect_language) { |
724 | $language = $res[0]['code']; | 772 | $language = $extractor->getLanguage(); |
725 | } | 773 | if (!$language) $language = $feed->get_language(); |
726 | } else { | 774 | if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { |
727 | //die('what'); | 775 | try { |
728 | // Use PEAR's Text_LanguageDetect | 776 | if ($use_cld) { |
729 | if (!isset($l)) { | 777 | // Use PHP-CLD extension |
730 | $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat'); | 778 | $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error |
731 | } | 779 | $res = $php_cld($text_sample); |
732 | $l_result = $l->detect($text_sample, 1); | 780 | if (is_array($res) && count($res) > 0) { |
733 | if (count($l_result) > 0) { | 781 | $language = $res[0]['code']; |
734 | $language = $language_codes[key($l_result)]; | 782 | } |
735 | } | 783 | } else { |
784 | //die('what'); | ||
785 | // Use PEAR's Text_LanguageDetect | ||
786 | if (!isset($l)) { | ||
787 | $l = new Text_LanguageDetect(); | ||
788 | $l->setNameMode(2); // return ISO 639-1 codes (e.g. "en") | ||
789 | } | ||
790 | $l_result = $l->detect($text_sample, 1); | ||
791 | if (count($l_result) > 0) { | ||
792 | $language = key($l_result); | ||
736 | } | 793 | } |
737 | } catch (Exception $e) { | ||
738 | //die('error: '.$e); | ||
739 | // do nothing | ||
740 | } | 794 | } |
741 | } | 795 | } catch (Exception $e) { |
742 | if ($language && (strlen($language) < 7)) { | 796 | //die('error: '.$e); |
743 | $newitem->addElement('dc:language', $language); | 797 | // do nothing |
744 | } | 798 | } |
745 | } | 799 | } |
746 | 800 | if ($language && (strlen($language) < 7)) { | |
747 | // add MIME type (if it appeared in our exclusions lists) | 801 | $newitem->addElement('dc:language', $language); |
748 | if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']); | ||
749 | // add effective URL (URL after redirects) | ||
750 | if (isset($effective_url)) { | ||
751 | //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. | ||
752 | //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi) | ||
753 | //temporary measure: use utf8_encode() | ||
754 | $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); | ||
755 | } else { | ||
756 | $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); | ||
757 | } | 802 | } |
758 | 803 | } | |
759 | // add categories | 804 | |
760 | if ($categories = $item->get_categories()) { | 805 | // add MIME type (if it appeared in our exclusions lists) |
761 | foreach ($categories as $category) { | 806 | if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']); |
762 | if ($category->get_label() !== null) { | 807 | // add effective URL (URL after redirects) |
763 | $newitem->addElement('category', $category->get_label()); | 808 | if (isset($effective_url)) { |
764 | } | 809 | //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. |
810 | //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi) | ||
811 | //temporary measure: use utf8_encode() | ||
812 | $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); | ||
813 | } else { | ||
814 | $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); | ||
815 | } | ||
816 | |||
817 | // add categories | ||
818 | if ($categories = $item->get_categories()) { | ||
819 | foreach ($categories as $category) { | ||
820 | if ($category->get_label() !== null) { | ||
821 | $newitem->addElement('category', $category->get_label()); | ||
765 | } | 822 | } |
766 | } | 823 | } |
767 | 824 | } | |
768 | // check for enclosures | 825 | |
769 | if ($options->keep_enclosures) { | 826 | // check for enclosures |
770 | if ($enclosures = $item->get_enclosures()) { | 827 | if ($options->keep_enclosures) { |
771 | foreach ($enclosures as $enclosure) { | 828 | if ($enclosures = $item->get_enclosures()) { |
772 | // thumbnails | 829 | foreach ($enclosures as $enclosure) { |
773 | foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { | 830 | // thumbnails |
774 | $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); | 831 | foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { |
775 | } | 832 | $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); |
776 | if (!$enclosure->get_link()) continue; | ||
777 | $enc = array(); | ||
778 | // Media RSS spec ($enc): http://search.yahoo.com/mrss | ||
779 | // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4 | ||
780 | $enc['url'] = $enclosure->get_link(); | ||
781 | if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length(); | ||
782 | if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type(); | ||
783 | if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium(); | ||
784 | if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression(); | ||
785 | if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate(); | ||
786 | if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate(); | ||
787 | if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate(); | ||
788 | if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels(); | ||
789 | if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration(); | ||
790 | if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height(); | ||
791 | if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width(); | ||
792 | if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language(); | ||
793 | $newitem->addElement('media:content', '', $enc); | ||
794 | } | 833 | } |
834 | if (!$enclosure->get_link()) continue; | ||
835 | $enc = array(); | ||
836 | // Media RSS spec ($enc): http://search.yahoo.com/mrss | ||
837 | // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4 | ||
838 | $enc['url'] = $enclosure->get_link(); | ||
839 | if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length(); | ||
840 | if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type(); | ||
841 | if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium(); | ||
842 | if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression(); | ||
843 | if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate(); | ||
844 | if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate(); | ||
845 | if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate(); | ||
846 | if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels(); | ||
847 | if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration(); | ||
848 | if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height(); | ||
849 | if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width(); | ||
850 | if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language(); | ||
851 | $newitem->addElement('media:content', '', $enc); | ||
795 | } | 852 | } |
796 | } | 853 | } |
797 | /* } */ | 854 | } |
798 | $output->addItem($newitem); | 855 | $output->addItem($newitem); |
799 | unset($html); | 856 | unset($html); |
800 | $item_count++; | 857 | $item_count++; |
@@ -831,7 +888,7 @@ if (!$debug_mode) { | |||
831 | } | 888 | } |
832 | if ($add_to_cache) { | 889 | if ($add_to_cache) { |
833 | ob_start(); | 890 | ob_start(); |
834 | $output->genarateFeed(); | 891 | $output->genarateFeed(false); |
835 | $output = ob_get_contents(); | 892 | $output = ob_get_contents(); |
836 | ob_end_clean(); | 893 | ob_end_clean(); |
837 | if ($html_only && $item_count == 0) { | 894 | if ($html_only && $item_count == 0) { |
@@ -842,7 +899,7 @@ if (!$debug_mode) { | |||
842 | } | 899 | } |
843 | echo $output; | 900 | echo $output; |
844 | } else { | 901 | } else { |
845 | $output->genarateFeed(); | 902 | $output->genarateFeed(false); |
846 | } | 903 | } |
847 | if ($callback) echo ');'; | 904 | if ($callback) echo ');'; |
848 | } | 905 | } |
diff --git a/inc/3rdparty/makefulltextfeedHelpers.php b/inc/3rdparty/makefulltextfeedHelpers.php index 1c11b8f6..4e985372 100755 --- a/inc/3rdparty/makefulltextfeedHelpers.php +++ b/inc/3rdparty/makefulltextfeedHelpers.php | |||
@@ -66,6 +66,38 @@ class DummySingleItem { | |||
66 | // HELPER FUNCTIONS | 66 | // HELPER FUNCTIONS |
67 | /////////////////////////////// | 67 | /////////////////////////////// |
68 | 68 | ||
69 | // Adapted from WordPress | ||
70 | // http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173 | ||
71 | function get_excerpt($text, $num_words=55, $more=null) { | ||
72 | if (null === $more) $more = '…'; | ||
73 | $text = strip_tags($text); | ||
74 | //TODO: Check if word count is based on single characters (East Asian characters) | ||
75 | /* | ||
76 | if (1==2) { | ||
77 | $text = trim(preg_replace("/[\n\r\t ]+/", ' ', $text), ' '); | ||
78 | preg_match_all('/./u', $text, $words_array); | ||
79 | $words_array = array_slice($words_array[0], 0, $num_words + 1); | ||
80 | $sep = ''; | ||
81 | } else { | ||
82 | $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY); | ||
83 | $sep = ' '; | ||
84 | } | ||
85 | */ | ||
86 | $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY); | ||
87 | $sep = ' '; | ||
88 | if (count($words_array) > $num_words) { | ||
89 | array_pop($words_array); | ||
90 | $text = implode($sep, $words_array); | ||
91 | $text = $text.$more; | ||
92 | } else { | ||
93 | $text = implode($sep, $words_array); | ||
94 | } | ||
95 | // trim whitespace at beginning or end of string | ||
96 | // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2 | ||
97 | $text = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text); | ||
98 | return $text; | ||
99 | } | ||
100 | |||
69 | function url_allowed($url) { | 101 | function url_allowed($url) { |
70 | global $options; | 102 | global $options; |
71 | if (!empty($options->allowed_urls)) { | 103 | if (!empty($options->allowed_urls)) { |
@@ -165,14 +197,6 @@ function convert_to_utf8($html, $header=null) | |||
165 | if (strtolower($encoding) != 'utf-8') { | 197 | if (strtolower($encoding) != 'utf-8') { |
166 | debug('Converting to UTF-8'); | 198 | debug('Converting to UTF-8'); |
167 | $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); | 199 | $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); |
168 | /* | ||
169 | if (function_exists('iconv')) { | ||
170 | // iconv appears to handle certain character encodings better than mb_convert_encoding | ||
171 | $html = iconv($encoding, 'utf-8', $html); | ||
172 | } else { | ||
173 | $html = mb_convert_encoding($html, 'utf-8', $encoding); | ||
174 | } | ||
175 | */ | ||
176 | } | 200 | } |
177 | } | 201 | } |
178 | } | 202 | } |
@@ -196,7 +220,7 @@ function makeAbsolute($base, $elem) { | |||
196 | } | 220 | } |
197 | function makeAbsoluteAttr($base, $e, $attr) { | 221 | function makeAbsoluteAttr($base, $e, $attr) { |
198 | if ($e->hasAttribute($attr)) { | 222 | if ($e->hasAttribute($attr)) { |
199 | // Trim leading and trailing white space. I don't really like this but | 223 | // Trim leading and trailing white space. I don't really like this but |
200 | // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" /> | 224 | // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" /> |
201 | $url = trim(str_replace('%20', ' ', $e->getAttribute($attr))); | 225 | $url = trim(str_replace('%20', ' ', $e->getAttribute($attr))); |
202 | $url = str_replace(' ', '%20', $url); | 226 | $url = str_replace(' ', '%20', $url); |
diff --git a/inc/3rdparty/site_config/custom/dailymotion.com.txt b/inc/3rdparty/site_config/custom/dailymotion.com.txt new file mode 100755 index 00000000..0cad808f --- /dev/null +++ b/inc/3rdparty/site_config/custom/dailymotion.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //title | ||
2 | body: //iframe | ||
3 | |||
4 | replace_string(<![CDATA[): _ | ||
5 | replace_string(]]>): _ | ||
6 | |||
7 | single_page_link: //link[@type='application/xml+oembed'] | ||
8 | |||
9 | prune: no | ||
10 | tidy: no | ||
11 | |||
12 | http://www.dailymotion.com/video/x1vk5oh_before-they-were-on-game-of-thrones_people | ||
diff --git a/inc/3rdparty/site_config/custom/index.php b/inc/3rdparty/site_config/custom/index.php new file mode 100644 index 00000000..a3d5f739 --- /dev/null +++ b/inc/3rdparty/site_config/custom/index.php | |||
@@ -0,0 +1,3 @@ | |||
1 | <?php | ||
2 | // this is here to prevent directory listing over the web | ||
3 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/custom/mobile.lemondeinformatique.fr.txt b/inc/3rdparty/site_config/custom/mobile.lemondeinformatique.fr.txt new file mode 100644 index 00000000..24aec5c3 --- /dev/null +++ b/inc/3rdparty/site_config/custom/mobile.lemondeinformatique.fr.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2 | ||
2 | body: div[@id='illustration'] | //p | ||
3 | prune: no | ||
4 | tidy: no | ||
5 | |||
6 | test_url: http://mobile.lemondeinformatique.fr/actualites/lire-les-datacenters-d-apple-google-et-facebook-eco-responsables-selon-greenpeace-le-monde-informatique-57122.html | ||
diff --git a/inc/3rdparty/site_config/custom/ted.com.txt b/inc/3rdparty/site_config/custom/ted.com.txt new file mode 100755 index 00000000..4940d2bc --- /dev/null +++ b/inc/3rdparty/site_config/custom/ted.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //title | ||
2 | body: //div[@class='talk-article__body talk-transcript__body'] | //div[@class='media__image media__image--thumb talk-link__image'] | ||
3 | |||
4 | strip_id_or_class: talk-transcript__para__time | ||
5 | |||
6 | single_page_link: //a[@id='hero-transcript-link'] | ||
7 | |||
8 | #prune: no | ||
9 | tidy: no | ||
10 | |||
11 | test_url: http://www.ted.com/talks/andrew_solomon_how_the_worst_moments_in_our_lives_make_us_who_we_are | ||
diff --git a/inc/3rdparty/site_config/index.php b/inc/3rdparty/site_config/index.php index a1b767fd..76ca8b3c 100644 --- a/inc/3rdparty/site_config/index.php +++ b/inc/3rdparty/site_config/index.php | |||
@@ -1,3 +1,2 @@ | |||
1 | <?php | 1 | <?php |
2 | // this is here to prevent directory listing over the web | 2 | // this is here to prevent directory listing over the web \ No newline at end of file |
3 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/version.txt b/inc/3rdparty/site_config/standard/version.txt index bf0d87ab..eaf01ebd 100644 --- a/inc/3rdparty/site_config/standard/version.txt +++ b/inc/3rdparty/site_config/standard/version.txt | |||
@@ -1 +1 @@ | |||
4 \ No newline at end of file | 2013-05-12T22:53:07Z \ No newline at end of file | ||
diff --git a/inc/poche/Database.class.php b/inc/poche/Database.class.php index 036c9d1b..9e901974 100755 --- a/inc/poche/Database.class.php +++ b/inc/poche/Database.class.php | |||
@@ -33,6 +33,8 @@ class Database { | |||
33 | $db_path = 'pgsql:host=' . STORAGE_SERVER . ';dbname=' . STORAGE_DB; | 33 | $db_path = 'pgsql:host=' . STORAGE_SERVER . ';dbname=' . STORAGE_DB; |
34 | $this->handle = new PDO($db_path, STORAGE_USER, STORAGE_PASSWORD); | 34 | $this->handle = new PDO($db_path, STORAGE_USER, STORAGE_PASSWORD); |
35 | break; | 35 | break; |
36 | default: | ||
37 | die(STORAGE . ' is not a recognised database system !'); | ||
36 | } | 38 | } |
37 | 39 | ||
38 | $this->handle->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); | 40 | $this->handle->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); |
@@ -229,12 +231,49 @@ class Database { | |||
229 | return FALSE; | 231 | return FALSE; |
230 | } | 232 | } |
231 | } | 233 | } |
234 | |||
/**
 * Count the rows of the users table.
 *
 * @param string|null $username when truthy, only rows whose username
 *                              equals this value are counted
 * @return mixed first column of the count(*) row, as handed back by the driver
 */
public function listUsers($username=null) {
    if ($username) {
        $sql = 'SELECT count(*) FROM users WHERE username=?';
        $params = array($username);
    } else {
        $sql = 'SELECT count(*) FROM users';
        $params = array();
    }

    $statement = $this->executeQuery($sql, $params);
    list($total) = $statement->fetch();

    return $total;
}
241 | |||
/**
 * Fetch the stored password value of one user.
 *
 * Only the password column of a single row is read; the previous
 * implementation loaded every column of every matching row.
 *
 * @param mixed $userID value matched against users.id
 * @return mixed the stored password, or null when no row matches
 */
public function getUserPassword($userID) {
    $sql = "SELECT password FROM users WHERE id=?";
    $query = $this->executeQuery($sql, array($userID));
    $row = $query->fetch();

    // fetch() yields false when there is no row; isset() covers that case too
    return isset($row['password']) ? $row['password'] : null;
}
248 | |||
/**
 * Remove every users_config row that belongs to the given user.
 *
 * @param mixed $userID value matched against users_config.user_id
 * @return mixed whatever executeQuery() returns for the DELETE statement
 */
public function deleteUserConfig($userID) {
    return $this->executeQuery(
        'DELETE from users_config WHERE user_id=?',
        array($userID)
    );
}
255 | |||
/**
 * Delete all entries of a user, detaching each entry's tags first.
 *
 * For every entry returned by retrieveAll(), each tag link is removed with
 * removeTagForEntry() and then the entry itself is removed with deleteById().
 *
 * @param mixed $userID id of the user whose entries are removed
 */
public function deleteTagsEntriesAndEntries($userID) {
    $entries = $this->retrieveAll($userID);
    foreach ($entries as $entry) {
        // retrieveAll() results are consumed as rows keyed by 'id' elsewhere
        // in this code base (createEpub), so pass the id, not the whole row
        $entryId = $entry['id'];

        foreach ($this->retrieveTagsByEntry($entryId) as $tag) {
            // detach this single tag; the previous code passed the whole tag list
            $this->removeTagForEntry($entryId, $tag['id']);
        }

        $this->deleteById($entryId, $userID);
    }
}
266 | |||
/**
 * Delete the users row with the given id.
 *
 * @param mixed $userID value matched against users.id
 * @return mixed whatever executeQuery() returns for the DELETE statement
 *               (returned for parity with deleteUserConfig())
 */
public function deleteUser($userID) {
    $sql_action = 'DELETE from users WHERE id=?';
    $params_action = array($userID);

    return $this->executeQuery($sql_action, $params_action);
}
232 | 272 | ||
233 | public function updateContentAndTitle($id, $title, $body, $user_id) { | 273 | public function updateContentAndTitle($id, $title, $body, $user_id) { |
234 | $sql_action = 'UPDATE entries SET content = ?, title = ? WHERE id=? AND user_id=?'; | 274 | $sql_action = 'UPDATE entries SET content = ?, title = ? WHERE id=? AND user_id=?'; |
235 | $params_action = array($body, $title, $id, $user_id); | 275 | $params_action = array($body, $title, $id, $user_id); |
236 | $query = $this->executeQuery($sql_action, $params_action); | 276 | $query = $this->executeQuery($sql_action, $params_action); |
237 | |||
238 | return $query; | 277 | return $query; |
239 | } | 278 | } |
240 | 279 | ||
@@ -472,6 +511,25 @@ class Database { | |||
472 | $query = $this->executeQuery($sql_action, $params_action); | 511 | $query = $this->executeQuery($sql_action, $params_action); |
473 | return $query; | 512 | return $query; |
474 | } | 513 | } |
514 | |||
/**
 * Delete a tag when no entry references it any more.
 *
 * @param mixed $tag_id id of the tag to check
 * @return bool true when the tag existed, was unreferenced and has been
 *              deleted; false when it is still in use or does not exist
 */
public function cleanUnusedTag($tag_id) {
    // Anti-join: a row comes back only if the tag exists and has no
    // matching tags_entries row.
    $sql_action = "SELECT tags.id FROM tags LEFT JOIN tags_entries ON tags_entries.tag_id=tags.id WHERE tags.id=? AND tags_entries.tag_id IS NULL";
    $query = $this->executeQuery($sql_action, array($tag_id));

    if (!$query->fetch()) {
        return false;
    }

    $this->executeQuery("DELETE FROM tags WHERE id=?", array($tag_id));

    return true;
}
475 | 533 | ||
476 | public function retrieveTagByValue($value) { | 534 | public function retrieveTagByValue($value) { |
477 | $tag = NULL; | 535 | $tag = NULL; |
diff --git a/inc/poche/Poche.class.php b/inc/poche/Poche.class.php index 811895dc..37cf66a3 100755 --- a/inc/poche/Poche.class.php +++ b/inc/poche/Poche.class.php | |||
@@ -72,7 +72,7 @@ class Poche | |||
72 | 72 | ||
73 | # l10n | 73 | # l10n |
74 | $language = $this->user->getConfigValue('language'); | 74 | $language = $this->user->getConfigValue('language'); |
75 | putenv('LC_ALL=' . $language); | 75 | @putenv('LC_ALL=' . $language); |
76 | setlocale(LC_ALL, $language); | 76 | setlocale(LC_ALL, $language); |
77 | bindtextdomain($language, LOCALE); | 77 | bindtextdomain($language, LOCALE); |
78 | textdomain($language); | 78 | textdomain($language); |
@@ -101,7 +101,7 @@ class Poche | |||
101 | 101 | ||
102 | public function configFileIsAvailable() { | 102 | public function configFileIsAvailable() { |
103 | if (! self::$configFileAvailable) { | 103 | if (! self::$configFileAvailable) { |
104 | $this->notInstalledMessage[] = 'You have to rename inc/poche/config.inc.php.new to inc/poche/config.inc.php.'; | 104 | $this->notInstalledMessage[] = 'You have to copy (don\'t just rename!) inc/poche/config.inc.default.php to inc/poche/config.inc.php.'; |
105 | 105 | ||
106 | return false; | 106 | return false; |
107 | } | 107 | } |
@@ -242,6 +242,58 @@ class Poche | |||
242 | $this->tpl->addFilter($filter); | 242 | $this->tpl->addFilter($filter); |
243 | } | 243 | } |
244 | 244 | ||
/**
 * Create an additional user account from the submitted form.
 *
 * Runs only when $_GET['newuser'] is set and both $_POST['newusername'] and
 * $_POST['password4newuser'] are non-empty strings. The outcome is logged,
 * reported through $this->messages where a message exists, and followed by
 * Tools::redirect().
 */
public function createNewUser() {
    if (!isset($_GET['newuser'])) {
        return;
    }

    // Both fields must be present and be plain strings; a missing key used to
    // raise an undefined-index notice and an array payload went to filter_var().
    if (!isset($_POST['newusername'], $_POST['password4newuser'])
        || !is_string($_POST['newusername'])
        || !is_string($_POST['password4newuser'])
        || $_POST['newusername'] == ""
        || $_POST['password4newuser'] == ""
    ) {
        return;
    }

    $newusername = filter_var($_POST['newusername'], FILTER_SANITIZE_STRING);

    if ($this->store->userExists($newusername)) {
        $this->messages->add('e', sprintf(_('Error : An user with the name %s already exists !'),$newusername));
        Tools::logm('An user with the name '.$newusername.' already exists !');
        Tools::redirect();
        return;
    }

    // The stored password is derived from the password concatenated with the username.
    if ($this->store->install($newusername, Tools::encodeString($_POST['password4newuser'] . $newusername))) {
        Tools::logm('The new user '.$newusername.' has been installed');
        $this->messages->add('s', sprintf(_('The new user %s has been installed. Do you want to <a href="?logout">logout ?</a>'),$newusername));
    }
    else {
        Tools::logm('error during adding new user');
    }
    Tools::redirect();
}
268 | |||
/**
 * Delete the account of the currently logged-in user.
 *
 * Runs only when $_GET['deluser'] is set. Refuses when listUsers() reports
 * at most one user, or when the password submitted in
 * $_POST['password4deletinguser'] does not match the stored one. Otherwise
 * removes the user's configuration, entries (with their tag links) and the
 * user row, logs out and redirects.
 */
public function deleteUser(){
    if (!isset($_GET['deluser'])) {
        return;
    }

    if ($this->store->listUsers() <= 1) {
        Tools::logm('Only user !');
        $this->messages->add('e', _('Error : You are the only user, you cannot delete your account !'));
        return;
    }

    $username = $this->user->getUsername();
    $userId = $this->user->getId();

    // A missing or non-string field is treated as an empty password.
    $submitted = (isset($_POST['password4deletinguser']) && is_string($_POST['password4deletinguser']))
        ? $_POST['password4deletinguser']
        : '';

    // Strict comparison: loose == treats two "0e…"-style digests as equal numbers.
    // NOTE(review): assumes encodeString() and getUserPassword() both yield strings — confirm.
    if (Tools::encodeString($submitted . $username) !== $this->store->getUserPassword($userId)) {
        Tools::logm('Bad password !');
        $this->messages->add('e', _('Error : The password is wrong !'));
        return;
    }

    $this->store->deleteUserConfig($userId);
    Tools::logm('The configuration for user '. $username .' has been deleted !');
    $this->store->deleteTagsEntriesAndEntries($userId);
    Tools::logm('The entries for user '. $username .' has been deleted !');
    $this->store->deleteUser($userId);
    Tools::logm('User '. $username .' has been completely deleted !');
    Session::logout();
    Tools::logm('logout');
    // Queue the message before redirecting, with the deleted user's name
    // (the previous code referenced an undefined $newusername after the redirect).
    $this->messages->add('s', sprintf(_('User %s has been successfully deleted !'),$username));
    Tools::redirect();
}
296 | |||
245 | private function install() | 297 | private function install() |
246 | { | 298 | { |
247 | Tools::logm('poche still not installed'); | 299 | Tools::logm('poche still not installed'); |
@@ -434,12 +486,24 @@ class Poche | |||
434 | case 'toggle_fav' : | 486 | case 'toggle_fav' : |
435 | $this->store->favoriteById($id, $this->user->getId()); | 487 | $this->store->favoriteById($id, $this->user->getId()); |
436 | Tools::logm('mark as favorite link #' . $id); | 488 | Tools::logm('mark as favorite link #' . $id); |
437 | Tools::redirect(); | 489 | if ( Tools::isAjaxRequest() ) { |
490 | echo 1; | ||
491 | exit; | ||
492 | } | ||
493 | else { | ||
494 | Tools::redirect(); | ||
495 | } | ||
438 | break; | 496 | break; |
439 | case 'toggle_archive' : | 497 | case 'toggle_archive' : |
440 | $this->store->archiveById($id, $this->user->getId()); | 498 | $this->store->archiveById($id, $this->user->getId()); |
441 | Tools::logm('archive link #' . $id); | 499 | Tools::logm('archive link #' . $id); |
442 | Tools::redirect(); | 500 | if ( Tools::isAjaxRequest() ) { |
501 | echo 1; | ||
502 | exit; | ||
503 | } | ||
504 | else { | ||
505 | Tools::redirect(); | ||
506 | } | ||
443 | break; | 507 | break; |
444 | case 'archive_all' : | 508 | case 'archive_all' : |
445 | $this->store->archiveAll($this->user->getId()); | 509 | $this->store->archiveAll($this->user->getId()); |
@@ -447,42 +511,55 @@ class Poche | |||
447 | Tools::redirect(); | 511 | Tools::redirect(); |
448 | break; | 512 | break; |
449 | case 'add_tag' : | 513 | case 'add_tag' : |
450 | $tags = explode(',', $_POST['value']); | 514 | if (isset($_GET['search'])) { |
451 | $entry_id = $_POST['entry_id']; | 515 | //when we want to apply a tag to a search |
452 | $entry = $this->store->retrieveOneById($entry_id, $this->user->getId()); | 516 | $tags = array($_GET['search']); |
453 | if (!$entry) { | 517 | $allentry_ids = $this->store->search($tags[0], $this->user->getId()); |
454 | $this->messages->add('e', _('Article not found!')); | 518 | $entry_ids = array(); |
455 | Tools::logm('error : article not found'); | 519 | foreach ($allentry_ids as $eachentry) { |
456 | Tools::redirect(); | 520 | $entry_ids[] = $eachentry[0]; |
457 | } | 521 | } |
458 | //get all already set tags to preven duplicates | 522 | } else { //add a tag to a single article |
459 | $already_set_tags = array(); | 523 | $tags = explode(',', $_POST['value']); |
460 | $entry_tags = $this->store->retrieveTagsByEntry($entry_id); | 524 | $entry_ids = array($_POST['entry_id']); |
461 | foreach ($entry_tags as $tag) { | ||
462 | $already_set_tags[] = $tag['value']; | ||
463 | } | 525 | } |
464 | foreach($tags as $key => $tag_value) { | 526 | foreach($entry_ids as $entry_id) { |
465 | $value = trim($tag_value); | 527 | $entry = $this->store->retrieveOneById($entry_id, $this->user->getId()); |
466 | if ($value && !in_array($value, $already_set_tags)) { | 528 | if (!$entry) { |
467 | $tag = $this->store->retrieveTagByValue($value); | 529 | $this->messages->add('e', _('Article not found!')); |
468 | 530 | Tools::logm('error : article not found'); | |
469 | if (is_null($tag)) { | 531 | Tools::redirect(); |
470 | # we create the tag | 532 | } |
471 | $tag = $this->store->createTag($value); | 533 | //get all already set tags to preven duplicates |
472 | $sequence = ''; | 534 | $already_set_tags = array(); |
473 | if (STORAGE == 'postgres') { | 535 | $entry_tags = $this->store->retrieveTagsByEntry($entry_id); |
474 | $sequence = 'tags_id_seq'; | 536 | foreach ($entry_tags as $tag) { |
537 | $already_set_tags[] = $tag['value']; | ||
538 | } | ||
539 | foreach($tags as $key => $tag_value) { | ||
540 | $value = trim($tag_value); | ||
541 | if ($value && !in_array($value, $already_set_tags)) { | ||
542 | $tag = $this->store->retrieveTagByValue($value); | ||
543 | if (is_null($tag)) { | ||
544 | # we create the tag | ||
545 | $tag = $this->store->createTag($value); | ||
546 | $sequence = ''; | ||
547 | if (STORAGE == 'postgres') { | ||
548 | $sequence = 'tags_id_seq'; | ||
549 | } | ||
550 | $tag_id = $this->store->getLastId($sequence); | ||
475 | } | 551 | } |
476 | $tag_id = $this->store->getLastId($sequence); | 552 | else { |
477 | } | 553 | $tag_id = $tag['id']; |
478 | else { | 554 | } |
479 | $tag_id = $tag['id']; | 555 | |
480 | } | 556 | # we assign the tag to the article |
481 | 557 | $this->store->setTagToEntry($tag_id, $entry_id); | |
482 | # we assign the tag to the article | 558 | } |
483 | $this->store->setTagToEntry($tag_id, $entry_id); | ||
484 | } | 559 | } |
485 | } | 560 | } |
561 | $this->messages->add('s', _('The tag has been applied successfully')); | ||
562 | Tools::logm('The tag has been applied successfully'); | ||
486 | Tools::redirect(); | 563 | Tools::redirect(); |
487 | break; | 564 | break; |
488 | case 'remove_tag' : | 565 | case 'remove_tag' : |
@@ -494,6 +571,11 @@ class Poche | |||
494 | Tools::redirect(); | 571 | Tools::redirect(); |
495 | } | 572 | } |
496 | $this->store->removeTagForEntry($id, $tag_id); | 573 | $this->store->removeTagForEntry($id, $tag_id); |
574 | Tools::logm('tag entry deleted'); | ||
575 | if ($this->store->cleanUnusedTag($tag_id)) { | ||
576 | Tools::logm('tag deleted'); | ||
577 | } | ||
578 | $this->messages->add('s', _('The tag has been successfully deleted')); | ||
497 | Tools::redirect(); | 579 | Tools::redirect(); |
498 | break; | 580 | break; |
499 | default: | 581 | default: |
@@ -520,6 +602,7 @@ class Poche | |||
520 | $languages = $this->getInstalledLanguages(); | 602 | $languages = $this->getInstalledLanguages(); |
521 | $token = $this->user->getConfigValue('token'); | 603 | $token = $this->user->getConfigValue('token'); |
522 | $http_auth = (isset($_SERVER['PHP_AUTH_USER']) || isset($_SERVER['REMOTE_USER'])) ? true : false; | 604 | $http_auth = (isset($_SERVER['PHP_AUTH_USER']) || isset($_SERVER['REMOTE_USER'])) ? true : false; |
605 | $only_user = ($this->store->listUsers() > 1) ? false : true; | ||
523 | $tpl_vars = array( | 606 | $tpl_vars = array( |
524 | 'themes' => $themes, | 607 | 'themes' => $themes, |
525 | 'languages' => $languages, | 608 | 'languages' => $languages, |
@@ -532,6 +615,7 @@ class Poche | |||
532 | 'token' => $token, | 615 | 'token' => $token, |
533 | 'user_id' => $this->user->getId(), | 616 | 'user_id' => $this->user->getId(), |
534 | 'http_auth' => $http_auth, | 617 | 'http_auth' => $http_auth, |
618 | 'only_user' => $only_user | ||
535 | ); | 619 | ); |
536 | Tools::logm('config view'); | 620 | Tools::logm('config view'); |
537 | break; | 621 | break; |
@@ -822,13 +906,6 @@ class Poche | |||
822 | */ | 906 | */ |
823 | public function import() { | 907 | public function import() { |
824 | 908 | ||
825 | if (!defined('IMPORT_LIMIT')) { | ||
826 | define('IMPORT_LIMIT', 5); | ||
827 | } | ||
828 | if (!defined('IMPORT_DELAY')) { | ||
829 | define('IMPORT_DELAY', 5); | ||
830 | } | ||
831 | |||
832 | if ( isset($_FILES['file']) ) { | 909 | if ( isset($_FILES['file']) ) { |
833 | Tools::logm('Import stated: parsing file'); | 910 | Tools::logm('Import stated: parsing file'); |
834 | 911 | ||
@@ -1065,11 +1142,127 @@ class Poche | |||
1065 | * return new purifier object with actual config | 1142 | * return new purifier object with actual config |
1066 | */ | 1143 | */ |
1067 | protected function getPurifier() { | 1144 | protected function getPurifier() { |
1068 | $config = HTMLPurifier_Config::createDefault(); | 1145 | $config = HTMLPurifier_Config::createDefault(); |
1069 | $config->set('Cache.SerializerPath', CACHE); | 1146 | $config->set('Cache.SerializerPath', CACHE); |
1070 | $config->set('HTML.SafeIframe', true); | 1147 | $config->set('HTML.SafeIframe', true); |
1071 | $config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/)%'); //allow YouTube and Vimeo$purifier = new HTMLPurifier($config); | 1148 | //allow YouTube, Vimeo and dailymotion videos |
1072 | 1149 | $config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/|www\.dailymotion\.com/embed/video/)%'); | |
1150 | |||
1073 | return new HTMLPurifier($config); | 1151 | return new HTMLPurifier($config); |
1074 | } | 1152 | } |
1153 | |||
/**
 * Build an EPUB from a selection of the current user's entries and send it.
 *
 * The selection is chosen by $_GET['method']:
 *  - 'id'       : the single entry $_GET['id']
 *  - 'all'      : everything retrieveAll() returns for the user
 *  - 'tag'      : entries carrying the tag $_GET['tag']
 *  - 'category' : entries of the view $_GET['category']
 *  - 'search'   : entries matching $_GET['search']
 * A missing or unknown method, an unknown entry id or an unknown tag aborts
 * with an error message.
 */
public function createEpub() {
    $problem = _('Uh, there is a problem while generating epub.');
    $method = isset($_GET['method']) ? $_GET['method'] : '';

    switch ($method) {
        case 'id':
            $entryID = filter_var($_GET['id'],FILTER_SANITIZE_NUMBER_INT);
            $entry = $this->store->retrieveOneById($entryID, $this->user->getId());
            if (!$entry) {
                // nothing to put in the book
                die($problem);
            }
            $entries = array($entry);
            $bookTitle = $entry['title'];
            $bookFileName = substr($bookTitle, 0, 200);
            break;
        case 'all':
            $entries = $this->store->retrieveAll($this->user->getId());
            // The translatable prefix has no %s placeholder, so the date is
            // appended; the date format is translatable because each country
            // has its own date format.
            $bookTitle = _('All my articles on ') . date(_('d.m.y'));
            $bookFileName = _('Allarticles') . date(_('dmY'));
            break;
        case 'tag':
            $tag = filter_var($_GET['tag'],FILTER_SANITIZE_STRING);
            $tags_id = $this->store->retrieveAllTags($this->user->getId(),$tag);
            if (empty($tags_id)) {
                die($problem);
            }
            // we take the first result, which is supposed to match perfectly
            $tag_id = $tags_id[0]["id"];
            $entries = $this->store->retrieveEntriesByTag($tag_id,$this->user->getId());
            $bookTitle = sprintf(_('Articles tagged %s'),$tag);
            $bookFileName = substr(sprintf(_('Tag %s'),$tag), 0, 200);
            break;
        case 'category':
            $category = filter_var($_GET['category'],FILTER_SANITIZE_STRING);
            $entries = $this->store->getEntriesByView($category,$this->user->getId());
            $bookTitle = sprintf(_('All articles in category %s'), $category);
            $bookFileName = substr(sprintf(_('Category %s'),$category), 0, 200);
            break;
        case 'search':
            $search = filter_var($_GET['search'],FILTER_SANITIZE_STRING);
            $entries = $this->store->search($search,$this->user->getId());
            $bookTitle = sprintf(_('All articles for search %s'), $search);
            $bookFileName = substr(sprintf(_('Search %s'), $search), 0, 200);
            break;
        default:
            // real default branch: "case 'default'" only matched the literal string
            die($problem);
    }

    // XHTML wrapper shared by the cover, every chapter and the log page
    $content_start =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
        . "<head>"
        . "<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n"
        . "<title>wallabag articles book</title>\n"
        . "</head>\n"
        . "<body>\n";

    $bookEnd = "</body>\n</html>\n";

    $log = new Logger("wallabag", TRUE);

    $book = new EPub(EPub::BOOK_VERSION_EPUB3);
    $log->logLine("new EPub()");
    $log->logLine("EPub class version: " . EPub::VERSION);
    $log->logLine("EPub Req. Zip version: " . EPub::REQ_ZIP_VERSION);
    $log->logLine("Zip version: " . Zip::VERSION);
    $log->logLine("getCurrentServerURL: " . $book->getCurrentServerURL());
    $log->logLine("getCurrentPageURL..: " . $book->getCurrentPageURL());

    // Book metadata
    $book->setTitle(_('wallabag\'s articles'));
    $book->setIdentifier("http://$_SERVER[HTTP_HOST]", EPub::IDENTIFIER_URI);
    $book->setDescription(_("Some articles saved on my wallabag"));
    $book->setAuthor("wallabag","wallabag");
    $book->setPublisher("wallabag","wallabag");
    $book->setDate(time());
    $book->setSourceURL("http://$_SERVER[HTTP_HOST]");

    $book->addDublinCoreMetadata(DublinCore::CONTRIBUTOR, "PHP");
    $book->addDublinCoreMetadata(DublinCore::CONTRIBUTOR, "wallabag");

    $log->logLine("Add Cover");

    $fullTitle = "<h1> " . $bookTitle . "</h1>\n";

    $book->setCoverImage("Cover.png", file_get_contents("themes/baggy/img/apple-touch-icon-152.png"), "image/png", $fullTitle);

    $cover = $content_start . '<div style="text-align:center;"><p>' . _('Produced by wallabag with PHPePub') . '</p><p>'. _('Please open <a href="https://github.com/wallabag/wallabag/issues" >an issue</a> if you have trouble with the display of this E-Book on your device.') . '</p></div>' . $bookEnd;

    $book->addChapter("Notices", "Cover2.html", $cover);

    $book->buildTOC();

    foreach ($entries as $entry) {
        // each tag of the entry becomes a subject of the book
        $tags = $this->store->retrieveTagsByEntry($entry['id']);
        foreach ($tags as $tag) {
            $book->setSubject($tag['value']);
        }

        $log->logLine("Set up parameters");

        $chapter = $content_start . $entry['content'] . $bookEnd;
        $book->addChapter($entry['title'], htmlspecialchars($entry['title']) . ".html", $chapter, true, EPub::EXTERNAL_REF_ADD);
        $log->logLine("Added chapter " . $entry['title']);
    }

    if (DEBUG_POCHE) {
        // append the generation log as a last chapter
        $book->addChapter("Log", "Log.html", $content_start . $log->getLog() . "\n</pre>" . $bookEnd);
    }
    $book->finalize();
    $book->sendBook($bookFileName);
}
1075 | } | 1268 | } |
diff --git a/inc/poche/Tools.class.php b/inc/poche/Tools.class.php index 7f064020..8073a3fe 100755 --- a/inc/poche/Tools.class.php +++ b/inc/poche/Tools.class.php | |||
@@ -60,6 +60,10 @@ class Tools | |||
60 | } | 60 | } |
61 | 61 | ||
62 | $host = (isset($_SERVER['HTTP_X_FORWARDED_HOST']) ? $_SERVER['HTTP_X_FORWARDED_HOST'] : (isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : $_SERVER['SERVER_NAME'])); | 62 | $host = (isset($_SERVER['HTTP_X_FORWARDED_HOST']) ? $_SERVER['HTTP_X_FORWARDED_HOST'] : (isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : $_SERVER['SERVER_NAME'])); |
63 | |||
64 | if (strpos($host, ':') !== false) { | ||
65 | $serverport = ''; | ||
66 | } | ||
63 | 67 | ||
64 | return 'http' . ($https ? 's' : '') . '://' | 68 | return 'http' . ($https ? 's' : '') . '://' |
65 | . $host . $serverport . $scriptname; | 69 | . $host . $serverport . $scriptname; |
diff --git a/inc/poche/config.inc.default.php b/inc/poche/config.inc.default.php new file mode 100755 index 00000000..ffcd205d --- /dev/null +++ b/inc/poche/config.inc.default.php | |||
@@ -0,0 +1,64 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * wallabag, self hostable application allowing you to not miss any content anymore | ||
4 | * | ||
5 | * @category wallabag | ||
6 | * @author Nicolas Lœuillet <nicolas@loeuillet.org> | ||
7 | * @copyright 2013 | ||
8 | * @license http://www.wtfpl.net/ see COPYING file | ||
9 | */ | ||
10 | |||
11 | @define ('SALT', ''); # put a strong string here | ||
12 | @define ('LANG', 'en_EN.utf8'); | ||
13 | |||
14 | @define ('STORAGE', 'sqlite'); # postgres, mysql or sqlite | ||
15 | |||
16 | @define ('STORAGE_SQLITE', ROOT . '/db/poche.sqlite'); # if you are using sqlite, where the database file is located | ||
17 | |||
18 | # only for postgres & mysql | ||
19 | @define ('STORAGE_SERVER', 'localhost'); | ||
20 | @define ('STORAGE_DB', 'poche'); | ||
21 | @define ('STORAGE_USER', 'poche'); | ||
22 | @define ('STORAGE_PASSWORD', 'poche'); | ||
23 | |||
24 | ################################################################################# | ||
25 | # Do not trespass unless you know what you are doing | ||
26 | ################################################################################# | ||
27 | |||
28 | // Change this if not using the standard port for SSL - i.e. your server is behind sslh | ||
29 | @define ('SSL_PORT', 443); | ||
30 | |||
31 | @define ('MODE_DEMO', FALSE); | ||
32 | @define ('DEBUG_POCHE', FALSE); | ||
33 | @define ('DOWNLOAD_PICTURES', FALSE); # This can slow down the process of adding articles | ||
34 | @define ('REGENERATE_PICTURES_QUALITY', 75); | ||
35 | @define ('CONVERT_LINKS_FOOTNOTES', FALSE); | ||
36 | @define ('REVERT_FORCED_PARAGRAPH_ELEMENTS', FALSE); | ||
37 | @define ('SHARE_TWITTER', TRUE); | ||
38 | @define ('SHARE_MAIL', TRUE); | ||
39 | @define ('SHARE_SHAARLI', FALSE); | ||
40 | @define ('SHAARLI_URL', 'http://myshaarliurl.com'); | ||
41 | @define ('FLATTR', TRUE); | ||
42 | @define ('FLATTR_API', 'https://api.flattr.com/rest/v2/things/lookup/?url='); | ||
43 | @define ('NOT_FLATTRABLE', '0'); | ||
44 | @define ('FLATTRABLE', '1'); | ||
45 | @define ('FLATTRED', '2'); | ||
46 | // display or not print link in article view | ||
47 | @define ('SHOW_PRINTLINK', '1'); | ||
48 | // display or not percent of read in article view. Affects only default theme. | ||
49 | @define ('SHOW_READPERCENT', '1'); | ||
50 | @define ('ABS_PATH', 'assets/'); | ||
51 | |||
52 | @define ('DEFAULT_THEME', 'baggy'); | ||
53 | |||
54 | @define ('THEME', ROOT . '/themes'); | ||
55 | @define ('LOCALE', ROOT . '/locale'); | ||
56 | @define ('CACHE', ROOT . '/cache'); | ||
57 | |||
58 | @define ('PAGINATION', '10'); | ||
59 | |||
60 | //limit for download of articles during import | ||
61 | @define ('IMPORT_LIMIT', 5); | ||
62 | //delay between downloads (in sec) | ||
63 | @define ('IMPORT_DELAY', 5); | ||
64 | |||
diff --git a/inc/poche/config.inc.php.new b/inc/poche/config.inc.php.new deleted file mode 100755 index 83b3c4c0..00000000 --- a/inc/poche/config.inc.php.new +++ /dev/null | |||
@@ -1,59 +0,0 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * wallabag, self hostable application allowing you to not miss any content anymore | ||
4 | * | ||
5 | * @category wallabag | ||
6 | * @author Nicolas Lœuillet <nicolas@loeuillet.org> | ||
7 | * @copyright 2013 | ||
8 | * @license http://www.wtfpl.net/ see COPYING file | ||
9 | */ | ||
10 | |||
11 | define ('SALT', ''); # put a strong string here | ||
12 | define ('LANG', 'en_EN.utf8'); | ||
13 | |||
14 | define ('STORAGE', 'sqlite'); # postgres, mysql or sqlite | ||
15 | |||
16 | define ('STORAGE_SQLITE', ROOT . '/db/poche.sqlite'); # if you are using sqlite, where the database file is located | ||
17 | |||
18 | # only for postgres & mysql | ||
19 | define ('STORAGE_SERVER', 'localhost'); | ||
20 | define ('STORAGE_DB', 'poche'); | ||
21 | define ('STORAGE_USER', 'poche'); | ||
22 | define ('STORAGE_PASSWORD', 'poche'); | ||
23 | |||
24 | ################################################################################# | ||
25 | # Do not trespass unless you know what you are doing | ||
26 | ################################################################################# | ||
27 | |||
28 | // Change this if not using the standart port for SSL - i.e you server is behind sslh | ||
29 | define ('SSL_PORT', 443); | ||
30 | |||
31 | define ('MODE_DEMO', FALSE); | ||
32 | define ('DEBUG_POCHE', FALSE); | ||
33 | define ('DOWNLOAD_PICTURES', FALSE); | ||
34 | define ('CONVERT_LINKS_FOOTNOTES', FALSE); | ||
35 | define ('REVERT_FORCED_PARAGRAPH_ELEMENTS', FALSE); | ||
36 | define ('SHARE_TWITTER', TRUE); | ||
37 | define ('SHARE_MAIL', TRUE); | ||
38 | define ('SHARE_SHAARLI', FALSE); | ||
39 | define ('SHAARLI_URL', 'http://myshaarliurl.com'); | ||
40 | define ('FLATTR', TRUE); | ||
41 | define ('FLATTR_API', 'https://api.flattr.com/rest/v2/things/lookup/?url='); | ||
42 | define ('NOT_FLATTRABLE', '0'); | ||
43 | define ('FLATTRABLE', '1'); | ||
44 | define ('FLATTRED', '2'); | ||
45 | define ('ABS_PATH', 'assets/'); | ||
46 | |||
47 | define ('DEFAULT_THEME', 'baggy'); | ||
48 | |||
49 | define ('THEME', ROOT . '/themes'); | ||
50 | define ('LOCALE', ROOT . '/locale'); | ||
51 | define ('CACHE', ROOT . '/cache'); | ||
52 | |||
53 | define ('PAGINATION', '10'); | ||
54 | |||
55 | //limit for download of articles during import | ||
56 | define ('IMPORT_LIMIT', 5); | ||
57 | //delay between downloads (in sec) | ||
58 | define ('IMPORT_DELAY', 5); | ||
59 | |||
diff --git a/inc/poche/global.inc.php b/inc/poche/global.inc.php index 15091387..8cf86d03 100644..100755 --- a/inc/poche/global.inc.php +++ b/inc/poche/global.inc.php | |||
@@ -31,6 +31,11 @@ require_once INCLUDES . '/3rdparty/FlattrItem.class.php'; | |||
31 | 31 | ||
32 | require_once INCLUDES . '/3rdparty/htmlpurifier/HTMLPurifier.auto.php'; | 32 | require_once INCLUDES . '/3rdparty/htmlpurifier/HTMLPurifier.auto.php'; |
33 | 33 | ||
34 | # epub library | ||
35 | require_once INCLUDES . '/3rdparty/libraries/PHPePub/Logger.php'; | ||
36 | require_once INCLUDES . '/3rdparty/libraries/PHPePub/EPub.php'; | ||
37 | require_once INCLUDES . '/3rdparty/libraries/PHPePub/EPubChapterSplitter.php'; | ||
38 | |||
34 | # Composer's autoloader, used for automatically loading Twig | 39 | # Composer's autoloader, used for automatically loading Twig |
35 | if (! file_exists(ROOT . '/vendor/autoload.php')) { | 40 | if (! file_exists(ROOT . '/vendor/autoload.php')) { |
36 | Poche::$canRenderTemplates = false; | 41 | Poche::$canRenderTemplates = false; |
@@ -43,6 +48,7 @@ if (! file_exists(INCLUDES . '/poche/config.inc.php')) { | |||
43 | Poche::$configFileAvailable = false; | 48 | Poche::$configFileAvailable = false; |
44 | } else { | 49 | } else { |
45 | require_once INCLUDES . '/poche/config.inc.php'; | 50 | require_once INCLUDES . '/poche/config.inc.php'; |
51 | require_once INCLUDES . '/poche/config.inc.default.php'; | ||
46 | } | 52 | } |
47 | 53 | ||
48 | if (Poche::$configFileAvailable && DOWNLOAD_PICTURES) { | 54 | if (Poche::$configFileAvailable && DOWNLOAD_PICTURES) { |
diff --git a/inc/poche/pochePictures.php b/inc/poche/pochePictures.php index e4b0b160..7c319a85 100644 --- a/inc/poche/pochePictures.php +++ b/inc/poche/pochePictures.php | |||
@@ -14,6 +14,7 @@ | |||
14 | function filtre_picture($content, $url, $id) | 14 | function filtre_picture($content, $url, $id) |
15 | { | 15 | { |
16 | $matches = array(); | 16 | $matches = array(); |
17 | $processing_pictures = array(); // list of pictures already handled, to avoid processing the same picture twice | ||
17 | preg_match_all('#<\s*(img)[^>]+src="([^"]*)"[^>]*>#Si', $content, $matches, PREG_SET_ORDER); | 18 | preg_match_all('#<\s*(img)[^>]+src="([^"]*)"[^>]*>#Si', $content, $matches, PREG_SET_ORDER); |
18 | foreach($matches as $i => $link) { | 19 | foreach($matches as $i => $link) { |
19 | $link[1] = trim($link[1]); | 20 | $link[1] = trim($link[1]); |
@@ -22,8 +23,17 @@ function filtre_picture($content, $url, $id) | |||
22 | $filename = basename(parse_url($absolute_path, PHP_URL_PATH)); | 23 | $filename = basename(parse_url($absolute_path, PHP_URL_PATH)); |
23 | $directory = create_assets_directory($id); | 24 | $directory = create_assets_directory($id); |
24 | $fullpath = $directory . '/' . $filename; | 25 | $fullpath = $directory . '/' . $filename; |
25 | download_pictures($absolute_path, $fullpath); | 26 | |
26 | $content = str_replace($matches[$i][2], $fullpath, $content); | 27 | if (in_array($absolute_path, $processing_pictures) === true) { |
28 | // replace the picture's URL only if processing is OK; picture already processed -> go to next picture | ||
29 | continue; | ||
30 | } | ||
31 | |||
32 | if (download_pictures($absolute_path, $fullpath) === true) { | ||
33 | $content = str_replace($matches[$i][2], $fullpath, $content); | ||
34 | } | ||
35 | |||
36 | $processing_pictures[] = $absolute_path; | ||
27 | } | 37 | } |
28 | 38 | ||
29 | } | 39 | } |
@@ -64,17 +74,55 @@ function get_absolute_link($relative_link, $url) { | |||
64 | 74 | ||
65 | /** | 75 | /** |
66 | * Téléchargement des images | 76 | * Téléchargement des images |
77 | * | ||
78 | * @return bool true if the download and processing succeeded, false otherwise | ||
67 | */ | 79 | */ |
68 | function download_pictures($absolute_path, $fullpath) | 80 | function download_pictures($absolute_path, $fullpath) |
69 | { | 81 | { |
70 | $rawdata = Tools::getFile($absolute_path); | 82 | $rawdata = Tools::getFile($absolute_path); |
83 | $fullpath = urldecode($fullpath); | ||
71 | 84 | ||
72 | if(file_exists($fullpath)) { | 85 | if(file_exists($fullpath)) { |
73 | unlink($fullpath); | 86 | unlink($fullpath); |
74 | } | 87 | } |
75 | $fp = fopen($fullpath, 'x'); | 88 | |
76 | fwrite($fp, $rawdata); | 89 | // check extension |
77 | fclose($fp); | 90 | $file_ext = strrchr($fullpath, '.'); |
91 | $whitelist = array(".jpg",".jpeg",".gif",".png"); | ||
92 | if (!(in_array($file_ext, $whitelist))) { | ||
93 | Tools::logm('processed image with not allowed extension. Skipping ' . $fullpath); | ||
94 | return false; | ||
95 | } | ||
96 | |||
97 | // check headers | ||
98 | $imageinfo = getimagesize($absolute_path); | ||
99 | if ($imageinfo['mime'] != 'image/gif' && $imageinfo['mime'] != 'image/jpeg'&& $imageinfo['mime'] != 'image/jpg'&& $imageinfo['mime'] != 'image/png') { | ||
100 | Tools::logm('processed image with bad header. Skipping ' . $fullpath); | ||
101 | return false; | ||
102 | } | ||
103 | |||
104 | // regenerate image | ||
105 | $im = imagecreatefromstring($rawdata); | ||
106 | if ($im === false) { | ||
107 | Tools::logm('error while regenerating image ' . $fullpath); | ||
108 | return false; | ||
109 | } | ||
110 | |||
111 | switch ($imageinfo['mime']) { | ||
112 | case 'image/gif': | ||
113 | $result = imagegif($im, $fullpath); | ||
114 | break; | ||
115 | case 'image/jpeg': | ||
116 | case 'image/jpg': | ||
117 | $result = imagejpeg($im, $fullpath, REGENERATE_PICTURES_QUALITY); | ||
118 | break; | ||
119 | case 'image/png': | ||
120 | $result = imagepng($im, $fullpath, ceil(REGENERATE_PICTURES_QUALITY / 100 * 9)); | ||
121 | break; | ||
122 | } | ||
123 | imagedestroy($im); | ||
124 | |||
125 | return $result; | ||
78 | } | 126 | } |
79 | 127 | ||
80 | /** | 128 | /** |