diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 18:54:06 +0200 |
commit | a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (patch) | |
tree | 80b5bfc9836ae92cc4929a4d72ae0b2730e568bc /inc/3rdparty/libraries | |
parent | 96834a47b09985e1c82b82857fc108f20e8b8f2b (diff) | |
parent | 8038b38802769031e050c753fc0a388a2276629e (diff) | |
download | wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.gz wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.tar.zst wallabag-a9f5e572dde4f986a498d2fbe92a38a1b22f9595.zip |
Merge pull request #712 from wallabag/dev1.7.0
1.7, call me "Premium version"
Diffstat (limited to 'inc/3rdparty/libraries')
22 files changed, 10679 insertions, 4009 deletions
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php new file mode 100644 index 00000000..376b6133 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.HtmlEntities.php | |||
@@ -0,0 +1,266 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * This should be a complete list of all HTML entities, mapped to their UTF-8 character codes. | ||
4 | * | ||
5 | * @author A. Grandt | ||
6 | * @copyright A. Grandt 2009-2013 | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 3.00 | ||
9 | */ | ||
// Lookup table from named HTML character references (including the leading
// "&" and trailing ";") to the UTF-8 byte sequence of the character they stand
// for. The table lives in the global $htmlEntities so that it is reachable
// both when this file is included at top level and from inside a function.
global $htmlEntities;
$htmlEntities = array();

// Markup-significant ASCII characters
$htmlEntities["&quot;"]     = "\x22"; // (double) quotation mark
$htmlEntities["&amp;"]      = "\x26"; // ampersand
$htmlEntities["&apos;"]     = "\x27"; // apostrophe = apostrophe-quote
$htmlEntities["&lt;"]       = "\x3C"; // less-than sign
$htmlEntities["&gt;"]       = "\x3E"; // greater-than sign

// Latin-1 symbols (U+00A0 - U+00BF)
$htmlEntities["&nbsp;"]     = "\xC2\xA0"; // non-breaking space
$htmlEntities["&iexcl;"]    = "\xC2\xA1"; // inverted exclamation mark
$htmlEntities["&cent;"]     = "\xC2\xA2"; // cent
$htmlEntities["&pound;"]    = "\xC2\xA3"; // pound
$htmlEntities["&curren;"]   = "\xC2\xA4"; // currency
$htmlEntities["&yen;"]      = "\xC2\xA5"; // yen
$htmlEntities["&brvbar;"]   = "\xC2\xA6"; // broken vertical bar
$htmlEntities["&sect;"]     = "\xC2\xA7"; // section
$htmlEntities["&uml;"]      = "\xC2\xA8"; // spacing diaeresis
$htmlEntities["&copy;"]     = "\xC2\xA9"; // copyright
$htmlEntities["&ordf;"]     = "\xC2\xAA"; // feminine ordinal indicator
$htmlEntities["&laquo;"]    = "\xC2\xAB"; // angle quotation mark (left)
$htmlEntities["&not;"]      = "\xC2\xAC"; // negation
$htmlEntities["&shy;"]      = "\xC2\xAD"; // soft hyphen
$htmlEntities["&reg;"]      = "\xC2\xAE"; // registered trademark
$htmlEntities["&macr;"]     = "\xC2\xAF"; // spacing macron
$htmlEntities["&deg;"]      = "\xC2\xB0"; // degree
$htmlEntities["&plusmn;"]   = "\xC2\xB1"; // plus-or-minus
$htmlEntities["&sup2;"]     = "\xC2\xB2"; // superscript 2
$htmlEntities["&sup3;"]     = "\xC2\xB3"; // superscript 3
$htmlEntities["&acute;"]    = "\xC2\xB4"; // spacing acute
$htmlEntities["&micro;"]    = "\xC2\xB5"; // micro
$htmlEntities["&para;"]     = "\xC2\xB6"; // paragraph
$htmlEntities["&middot;"]   = "\xC2\xB7"; // middle dot
$htmlEntities["&cedil;"]    = "\xC2\xB8"; // spacing cedilla
$htmlEntities["&sup1;"]     = "\xC2\xB9"; // superscript 1
$htmlEntities["&ordm;"]     = "\xC2\xBA"; // masculine ordinal indicator
$htmlEntities["&raquo;"]    = "\xC2\xBB"; // angle quotation mark (right)
$htmlEntities["&frac14;"]   = "\xC2\xBC"; // fraction 1/4
$htmlEntities["&frac12;"]   = "\xC2\xBD"; // fraction 1/2
$htmlEntities["&frac34;"]   = "\xC2\xBE"; // fraction 3/4
$htmlEntities["&iquest;"]   = "\xC2\xBF"; // inverted question mark

// Latin-1 letters (U+00C0 - U+00FF)
$htmlEntities["&Agrave;"]   = "\xC3\x80"; // capital a, grave accent
$htmlEntities["&Aacute;"]   = "\xC3\x81"; // capital a, acute accent
$htmlEntities["&Acirc;"]    = "\xC3\x82"; // capital a, circumflex accent
$htmlEntities["&Atilde;"]   = "\xC3\x83"; // capital a, tilde
$htmlEntities["&Auml;"]     = "\xC3\x84"; // capital a, umlaut mark
$htmlEntities["&Aring;"]    = "\xC3\x85"; // capital a, ring
$htmlEntities["&AElig;"]    = "\xC3\x86"; // capital ae
$htmlEntities["&Ccedil;"]   = "\xC3\x87"; // capital c, cedilla
$htmlEntities["&Egrave;"]   = "\xC3\x88"; // capital e, grave accent
$htmlEntities["&Eacute;"]   = "\xC3\x89"; // capital e, acute accent
$htmlEntities["&Ecirc;"]    = "\xC3\x8A"; // capital e, circumflex accent
$htmlEntities["&Euml;"]     = "\xC3\x8B"; // capital e, umlaut mark
$htmlEntities["&Igrave;"]   = "\xC3\x8C"; // capital i, grave accent
$htmlEntities["&Iacute;"]   = "\xC3\x8D"; // capital i, acute accent
$htmlEntities["&Icirc;"]    = "\xC3\x8E"; // capital i, circumflex accent
$htmlEntities["&Iuml;"]     = "\xC3\x8F"; // capital i, umlaut mark
$htmlEntities["&ETH;"]      = "\xC3\x90"; // capital eth, Icelandic
$htmlEntities["&Ntilde;"]   = "\xC3\x91"; // capital n, tilde
$htmlEntities["&Ograve;"]   = "\xC3\x92"; // capital o, grave accent
$htmlEntities["&Oacute;"]   = "\xC3\x93"; // capital o, acute accent
$htmlEntities["&Ocirc;"]    = "\xC3\x94"; // capital o, circumflex accent
$htmlEntities["&Otilde;"]   = "\xC3\x95"; // capital o, tilde
$htmlEntities["&Ouml;"]     = "\xC3\x96"; // capital o, umlaut mark
$htmlEntities["&times;"]    = "\xC3\x97"; // multiplication
$htmlEntities["&Oslash;"]   = "\xC3\x98"; // capital o, slash
$htmlEntities["&Ugrave;"]   = "\xC3\x99"; // capital u, grave accent
$htmlEntities["&Uacute;"]   = "\xC3\x9A"; // capital u, acute accent
$htmlEntities["&Ucirc;"]    = "\xC3\x9B"; // capital u, circumflex accent
$htmlEntities["&Uuml;"]     = "\xC3\x9C"; // capital u, umlaut mark
$htmlEntities["&Yacute;"]   = "\xC3\x9D"; // capital y, acute accent
$htmlEntities["&THORN;"]    = "\xC3\x9E"; // capital THORN, Icelandic
$htmlEntities["&szlig;"]    = "\xC3\x9F"; // small sharp s, German
$htmlEntities["&agrave;"]   = "\xC3\xA0"; // small a, grave accent
$htmlEntities["&aacute;"]   = "\xC3\xA1"; // small a, acute accent
$htmlEntities["&acirc;"]    = "\xC3\xA2"; // small a, circumflex accent
$htmlEntities["&atilde;"]   = "\xC3\xA3"; // small a, tilde
$htmlEntities["&auml;"]     = "\xC3\xA4"; // small a, umlaut mark
$htmlEntities["&aring;"]    = "\xC3\xA5"; // small a, ring
$htmlEntities["&aelig;"]    = "\xC3\xA6"; // small ae
$htmlEntities["&ccedil;"]   = "\xC3\xA7"; // small c, cedilla
$htmlEntities["&egrave;"]   = "\xC3\xA8"; // small e, grave accent
$htmlEntities["&eacute;"]   = "\xC3\xA9"; // small e, acute accent
$htmlEntities["&ecirc;"]    = "\xC3\xAA"; // small e, circumflex accent
$htmlEntities["&euml;"]     = "\xC3\xAB"; // small e, umlaut mark
$htmlEntities["&igrave;"]   = "\xC3\xAC"; // small i, grave accent
$htmlEntities["&iacute;"]   = "\xC3\xAD"; // small i, acute accent
$htmlEntities["&icirc;"]    = "\xC3\xAE"; // small i, circumflex accent
$htmlEntities["&iuml;"]     = "\xC3\xAF"; // small i, umlaut mark
$htmlEntities["&eth;"]      = "\xC3\xB0"; // small eth, Icelandic
$htmlEntities["&ntilde;"]   = "\xC3\xB1"; // small n, tilde
$htmlEntities["&ograve;"]   = "\xC3\xB2"; // small o, grave accent
$htmlEntities["&oacute;"]   = "\xC3\xB3"; // small o, acute accent
$htmlEntities["&ocirc;"]    = "\xC3\xB4"; // small o, circumflex accent
$htmlEntities["&otilde;"]   = "\xC3\xB5"; // small o, tilde
$htmlEntities["&ouml;"]     = "\xC3\xB6"; // small o, umlaut mark
$htmlEntities["&divide;"]   = "\xC3\xB7"; // division
$htmlEntities["&oslash;"]   = "\xC3\xB8"; // small o, slash
$htmlEntities["&ugrave;"]   = "\xC3\xB9"; // small u, grave accent
$htmlEntities["&uacute;"]   = "\xC3\xBA"; // small u, acute accent
$htmlEntities["&ucirc;"]    = "\xC3\xBB"; // small u, circumflex accent
$htmlEntities["&uuml;"]     = "\xC3\xBC"; // small u, umlaut mark
$htmlEntities["&yacute;"]   = "\xC3\xBD"; // small y, acute accent
$htmlEntities["&thorn;"]    = "\xC3\xBE"; // small thorn, Icelandic
$htmlEntities["&yuml;"]     = "\xC3\xBF"; // small y, umlaut mark

// Latin Extended and spacing modifier letters
$htmlEntities["&OElig;"]    = "\xC5\x92"; // capital ligature OE
$htmlEntities["&oelig;"]    = "\xC5\x93"; // small ligature oe
$htmlEntities["&Scaron;"]   = "\xC5\xA0"; // capital S with caron
$htmlEntities["&scaron;"]   = "\xC5\xA1"; // small S with caron
$htmlEntities["&Yuml;"]     = "\xC5\xB8"; // capital Y with diaeres
$htmlEntities["&fnof;"]     = "\xC6\x92"; // f with hook
$htmlEntities["&circ;"]     = "\xCB\x86"; // modifier letter circumflex accent
$htmlEntities["&tilde;"]    = "\xCB\x9C"; // small tilde

// Greek
$htmlEntities["&Alpha;"]    = "\xCE\x91"; // Alpha
$htmlEntities["&Beta;"]     = "\xCE\x92"; // Beta
$htmlEntities["&Gamma;"]    = "\xCE\x93"; // Gamma
$htmlEntities["&Delta;"]    = "\xCE\x94"; // Delta
$htmlEntities["&Epsilon;"]  = "\xCE\x95"; // Epsilon
$htmlEntities["&Zeta;"]     = "\xCE\x96"; // Zeta
$htmlEntities["&Eta;"]      = "\xCE\x97"; // Eta
$htmlEntities["&Theta;"]    = "\xCE\x98"; // Theta
$htmlEntities["&Iota;"]     = "\xCE\x99"; // Iota
$htmlEntities["&Kappa;"]    = "\xCE\x9A"; // Kappa
$htmlEntities["&Lambda;"]   = "\xCE\x9B"; // Lambda
$htmlEntities["&Mu;"]       = "\xCE\x9C"; // Mu
$htmlEntities["&Nu;"]       = "\xCE\x9D"; // Nu
$htmlEntities["&Xi;"]       = "\xCE\x9E"; // Xi
$htmlEntities["&Omicron;"]  = "\xCE\x9F"; // Omicron
$htmlEntities["&Pi;"]       = "\xCE\xA0"; // Pi
$htmlEntities["&Rho;"]      = "\xCE\xA1"; // Rho
$htmlEntities["&Sigma;"]    = "\xCE\xA3"; // Sigma
$htmlEntities["&Tau;"]      = "\xCE\xA4"; // Tau
$htmlEntities["&Upsilon;"]  = "\xCE\xA5"; // Upsilon
$htmlEntities["&Phi;"]      = "\xCE\xA6"; // Phi
$htmlEntities["&Chi;"]      = "\xCE\xA7"; // Chi
$htmlEntities["&Psi;"]      = "\xCE\xA8"; // Psi
$htmlEntities["&Omega;"]    = "\xCE\xA9"; // Omega
$htmlEntities["&alpha;"]    = "\xCE\xB1"; // alpha
$htmlEntities["&beta;"]     = "\xCE\xB2"; // beta
$htmlEntities["&gamma;"]    = "\xCE\xB3"; // gamma
$htmlEntities["&delta;"]    = "\xCE\xB4"; // delta
$htmlEntities["&epsilon;"]  = "\xCE\xB5"; // epsilon
$htmlEntities["&zeta;"]     = "\xCE\xB6"; // zeta
$htmlEntities["&eta;"]      = "\xCE\xB7"; // eta
$htmlEntities["&theta;"]    = "\xCE\xB8"; // theta
$htmlEntities["&iota;"]     = "\xCE\xB9"; // iota
$htmlEntities["&kappa;"]    = "\xCE\xBA"; // kappa
$htmlEntities["&lambda;"]   = "\xCE\xBB"; // lambda
$htmlEntities["&mu;"]       = "\xCE\xBC"; // mu
$htmlEntities["&nu;"]       = "\xCE\xBD"; // nu
$htmlEntities["&xi;"]       = "\xCE\xBE"; // xi
$htmlEntities["&omicron;"]  = "\xCE\xBF"; // omicron
$htmlEntities["&pi;"]       = "\xCF\x80"; // pi
$htmlEntities["&rho;"]      = "\xCF\x81"; // rho
$htmlEntities["&sigmaf;"]   = "\xCF\x82"; // sigmaf
$htmlEntities["&sigma;"]    = "\xCF\x83"; // sigma
$htmlEntities["&tau;"]      = "\xCF\x84"; // tau
$htmlEntities["&upsilon;"]  = "\xCF\x85"; // upsilon
$htmlEntities["&phi;"]      = "\xCF\x86"; // phi
$htmlEntities["&chi;"]      = "\xCF\x87"; // chi
$htmlEntities["&psi;"]      = "\xCF\x88"; // psi
$htmlEntities["&omega;"]    = "\xCF\x89"; // omega
$htmlEntities["&thetasym;"] = "\xCF\x91"; // theta symbol
$htmlEntities["&upsih;"]    = "\xCF\x92"; // upsilon symbol
$htmlEntities["&piv;"]      = "\xCF\x96"; // pi symbol

// General punctuation
$htmlEntities["&ensp;"]     = "\xE2\x80\x82"; // en space
$htmlEntities["&emsp;"]     = "\xE2\x80\x83"; // em space
$htmlEntities["&thinsp;"]   = "\xE2\x80\x89"; // thin space
$htmlEntities["&zwnj;"]     = "\xE2\x80\x8C"; // zero width non-joiner
$htmlEntities["&zwj;"]      = "\xE2\x80\x8D"; // zero width joiner
$htmlEntities["&lrm;"]      = "\xE2\x80\x8E"; // left-to-right mark
$htmlEntities["&rlm;"]      = "\xE2\x80\x8F"; // right-to-left mark
$htmlEntities["&ndash;"]    = "\xE2\x80\x93"; // en dash
$htmlEntities["&mdash;"]    = "\xE2\x80\x94"; // em dash
$htmlEntities["&lsquo;"]    = "\xE2\x80\x98"; // left single quotation mark
$htmlEntities["&rsquo;"]    = "\xE2\x80\x99"; // right single quotation mark
$htmlEntities["&sbquo;"]    = "\xE2\x80\x9A"; // single low-9 quotation mark
$htmlEntities["&ldquo;"]    = "\xE2\x80\x9C"; // left double quotation mark
$htmlEntities["&rdquo;"]    = "\xE2\x80\x9D"; // right double quotation mark
$htmlEntities["&bdquo;"]    = "\xE2\x80\x9E"; // double low-9 quotation mark
$htmlEntities["&dagger;"]   = "\xE2\x80\xA0"; // dagger
$htmlEntities["&Dagger;"]   = "\xE2\x80\xA1"; // double dagger
$htmlEntities["&bull;"]     = "\xE2\x80\xA2"; // bullet
$htmlEntities["&hellip;"]   = "\xE2\x80\xA6"; // horizontal ellipsis
$htmlEntities["&permil;"]   = "\xE2\x80\xB0"; // per mille
$htmlEntities["&prime;"]    = "\xE2\x80\xB2"; // minutes or prime
$htmlEntities["&Prime;"]    = "\xE2\x80\xB3"; // seconds or Double Prime
$htmlEntities["&lsaquo;"]   = "\xE2\x80\xB9"; // single left angle quotation
$htmlEntities["&rsaquo;"]   = "\xE2\x80\xBA"; // single right angle quotation
$htmlEntities["&oline;"]    = "\xE2\x80\xBE"; // overline
$htmlEntities["&frasl;"]    = "\xE2\x81\x84"; // fraction slash
$htmlEntities["&euro;"]     = "\xE2\x82\xAC"; // euro

// Letter-like symbols
$htmlEntities["&image;"]    = "\xE2\x84\x91"; // blackletter capital I
$htmlEntities["&weierp;"]   = "\xE2\x84\x98"; // script capital P
$htmlEntities["&real;"]     = "\xE2\x84\x9C"; // blackletter capital R
$htmlEntities["&trade;"]    = "\xE2\x84\xA2"; // trademark
$htmlEntities["&alefsym;"]  = "\xE2\x84\xB5"; // alef

// Arrows
$htmlEntities["&larr;"]     = "\xE2\x86\x90"; // left arrow
$htmlEntities["&uarr;"]     = "\xE2\x86\x91"; // up arrow
$htmlEntities["&rarr;"]     = "\xE2\x86\x92"; // right arrow
$htmlEntities["&darr;"]     = "\xE2\x86\x93"; // down arrow
$htmlEntities["&harr;"]     = "\xE2\x86\x94"; // left right arrow
$htmlEntities["&crarr;"]    = "\xE2\x86\xB5"; // carriage return arrow
$htmlEntities["&lArr;"]     = "\xE2\x87\x90"; // left double arrow
$htmlEntities["&uArr;"]     = "\xE2\x87\x91"; // up double arrow
$htmlEntities["&rArr;"]     = "\xE2\x87\x92"; // right double arrow
$htmlEntities["&dArr;"]     = "\xE2\x87\x93"; // down double arrow
$htmlEntities["&hArr;"]     = "\xE2\x87\x94"; // left right double arrow

// Mathematical operators
$htmlEntities["&forall;"]   = "\xE2\x88\x80"; // for all
$htmlEntities["&part;"]     = "\xE2\x88\x82"; // partial differential
$htmlEntities["&exist;"]    = "\xE2\x88\x83"; // there exists
$htmlEntities["&empty;"]    = "\xE2\x88\x85"; // empty set
$htmlEntities["&nabla;"]    = "\xE2\x88\x87"; // backward difference
$htmlEntities["&isin;"]     = "\xE2\x88\x88"; // element of
$htmlEntities["&notin;"]    = "\xE2\x88\x89"; // not an element of
$htmlEntities["&ni;"]       = "\xE2\x88\x8B"; // ni = contains as member
$htmlEntities["&prod;"]     = "\xE2\x88\x8F"; // n-ary product
$htmlEntities["&sum;"]      = "\xE2\x88\x91"; // n-ary summation
$htmlEntities["&minus;"]    = "\xE2\x88\x92"; // minus
$htmlEntities["&lowast;"]   = "\xE2\x88\x97"; // asterisk operator
$htmlEntities["&radic;"]    = "\xE2\x88\x9A"; // square root
$htmlEntities["&prop;"]     = "\xE2\x88\x9D"; // proportional to
$htmlEntities["&infin;"]    = "\xE2\x88\x9E"; // infinity
$htmlEntities["&ang;"]      = "\xE2\x88\xA0"; // angle
$htmlEntities["&and;"]      = "\xE2\x88\xA7"; // logical and
$htmlEntities["&or;"]       = "\xE2\x88\xA8"; // logical or
$htmlEntities["&cap;"]      = "\xE2\x88\xA9"; // intersection
$htmlEntities["&cup;"]      = "\xE2\x88\xAA"; // union
$htmlEntities["&int;"]      = "\xE2\x88\xAB"; // integral
$htmlEntities["&there4;"]   = "\xE2\x88\xB4"; // therefore
$htmlEntities["&sim;"]      = "\xE2\x88\xBC"; // similar to
$htmlEntities["&cong;"]     = "\xE2\x89\x85"; // congruent to
$htmlEntities["&asymp;"]    = "\xE2\x89\x88"; // approximately equal
$htmlEntities["&ne;"]       = "\xE2\x89\xA0"; // not equal
$htmlEntities["&equiv;"]    = "\xE2\x89\xA1"; // equivalent
$htmlEntities["&le;"]       = "\xE2\x89\xA4"; // less or equal
$htmlEntities["&ge;"]       = "\xE2\x89\xA5"; // greater or equal
$htmlEntities["&sub;"]      = "\xE2\x8A\x82"; // subset of
$htmlEntities["&sup;"]      = "\xE2\x8A\x83"; // superset of
$htmlEntities["&nsub;"]     = "\xE2\x8A\x84"; // not subset of
$htmlEntities["&sube;"]     = "\xE2\x8A\x86"; // subset or equal
$htmlEntities["&supe;"]     = "\xE2\x8A\x87"; // superset or equal
$htmlEntities["&oplus;"]    = "\xE2\x8A\x95"; // circled plus
$htmlEntities["&otimes;"]   = "\xE2\x8A\x97"; // circled times (U+2297; must differ from &supe;, U+2287)
$htmlEntities["&perp;"]     = "\xE2\x8A\xA5"; // perpendicular
$htmlEntities["&sdot;"]     = "\xE2\x8B\x85"; // dot operator (U+22C5)

// Miscellaneous technical
$htmlEntities["&lceil;"]    = "\xE2\x8C\x88"; // left ceiling
$htmlEntities["&rceil;"]    = "\xE2\x8C\x89"; // right ceiling
$htmlEntities["&lfloor;"]   = "\xE2\x8C\x8A"; // left floor
$htmlEntities["&rfloor;"]   = "\xE2\x8C\x8B"; // right floor
$htmlEntities["&lang;"]     = "\xE2\x8C\xA9"; // left angle bracket = bra (U+2329)
$htmlEntities["&rang;"]     = "\xE2\x8C\xAA"; // right angle bracket = ket (U+232A)

// Geometric shapes and card suits
$htmlEntities["&loz;"]      = "\xE2\x97\x8A"; // lozenge
$htmlEntities["&spades;"]   = "\xE2\x99\xA0"; // spade
$htmlEntities["&clubs;"]    = "\xE2\x99\xA3"; // club
$htmlEntities["&hearts;"]   = "\xE2\x99\xA5"; // heart
$htmlEntities["&diams;"]    = "\xE2\x99\xA6"; // diamond
266 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.NCX.php b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php new file mode 100644 index 00000000..e5da05cd --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.NCX.php | |||
@@ -0,0 +1,782 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * ePub NCX file structure | ||
4 | * | ||
5 | * @author A. Grandt <php@grandt.com> | ||
6 | * @copyright 2009-2014 A. Grandt | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 3.20 | ||
9 | */ | ||
/**
 * Builder for the table of contents of an ePub book.
 *
 * Collects the book identification (uid, title, author, language, writing
 * direction), a tree of navigation points rooted in a NavMap, optional extra
 * <meta> entries and a list of reference ("guide" / "landmark") pages. It
 * serialises them either as an ePub 2 NCX document (finalize()) or as an
 * ePub 3 XHTML navigation document (finalizeEPub3()).
 *
 * NOTE: every value is concatenated into the generated markup verbatim, with
 * no XML escaping performed by this class; callers are responsible for
 * supplying XML-safe strings.
 */
class Ncx {
    const _VERSION = 3.20;

    const MIMETYPE = "application/x-dtbncx+xml";

    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    private $navMap = NULL;
    private $uid = NULL;
    private $meta = array();
    private $docTitle = NULL;
    private $docAuthor = NULL;

    // Node that receives new navigation points, and the node most recently
    // returned by addNavPoint() (or left behind by backLevel()/rootLevel()).
    private $currentLevel = NULL;
    private $lastLevel = NULL;

    private $languageCode = "en";
    private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT;

    public $chapterList = array();
    public $referencesTitle = "Guide";
    public $referencesClass = "references";
    public $referencesId = "references";
    // Reference type => href of the page.
    public $referencesList = array();
    // Reference type => display name overriding the default from $referencesOrder.
    public $referencesName = array();
    // Reference type => default descriptive name; iteration order is output order.
    public $referencesOrder = NULL;

    /**
     * Class constructor.
     *
     * Creates the root NavMap, makes it the current level and stores the
     * book identification through the corresponding setters.
     *
     * @param string $uid
     * @param string $docTitle
     * @param string $docAuthor
     * @param string $languageCode
     * @param string $writingDirection
     */
    public function __construct($uid = NULL, $docTitle = NULL, $docAuthor = NULL, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) {
        $this->navMap = new NavMap($writingDirection);
        $this->currentLevel = $this->navMap;
        $this->setUid($uid);
        $this->setDocTitle($docTitle);
        $this->setDocAuthor($docAuthor);
        $this->setLanguageCode($languageCode);
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    public function __destruct() {
        unset($this->bookVersion, $this->navMap, $this->uid, $this->meta);
        unset($this->docTitle, $this->docAuthor, $this->currentLevel, $this->lastLevel);
        unset($this->languageCode, $this->writingDirection, $this->chapterList, $this->referencesTitle);
        unset($this->referencesClass, $this->referencesId, $this->referencesList, $this->referencesName);
        unset($this->referencesOrder);
    }

    /**
     * Set the ePub version the table of contents is generated for.
     *
     * Non-string input falls back to EPub::BOOK_VERSION_EPUB2.
     *
     * @param string $bookVersion
     */
    public function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     *
     * @return bool TRUE if the book is set to type ePub 2
     */
    public function isEPubVersion2() {
        return $this->bookVersion === EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set the unique book identifier written to the "dtb:uid" meta entry.
     *
     * Non-string input clears the value (NULL).
     *
     * @param string $uid
     */
    public function setUid($uid) {
        $this->uid = is_string($uid) ? trim($uid) : NULL;
    }

    /**
     * Set the document title. Non-string input clears the value (NULL).
     *
     * @param string $docTitle
     */
    public function setDocTitle($docTitle) {
        $this->docTitle = is_string($docTitle) ? trim($docTitle) : NULL;
    }

    /**
     * Set the document author. Non-string input clears the value (NULL).
     *
     * @param string $docAuthor
     */
    public function setDocAuthor($docAuthor) {
        $this->docAuthor = is_string($docAuthor) ? trim($docAuthor) : NULL;
    }

    /**
     * Set the language code used for the xml:lang / lang attributes.
     *
     * Non-string input falls back to "en".
     *
     * @param string $languageCode
     */
    public function setLanguageCode($languageCode) {
        $this->languageCode = is_string($languageCode) ? trim($languageCode) : "en";
    }

    /**
     * Set the writing direction used for the dir attribute.
     *
     * Non-string input falls back to EPub::DIRECTION_LEFT_TO_RIGHT.
     *
     * @param string $writingDirection
     */
    public function setWritingDirection($writingDirection) {
        $this->writingDirection = is_string($writingDirection) ? trim($writingDirection) : EPub::DIRECTION_LEFT_TO_RIGHT;
    }

    /**
     * Replace the root navigation map.
     *
     * Anything that is not an object of exactly class NavMap is ignored.
     *
     * @param NavMap $navMap
     */
    public function setNavMap($navMap) {
        if (is_object($navMap) && get_class($navMap) === "NavMap") {
            $this->navMap = $navMap;
        }
    }

    /**
     * Add one chapter level.
     *
     * Subsequent chapters will be added to this level.
     *
     * A new NavPoint is only created when both $navTitle and $navClass are
     * set. Regardless of that, the current level then moves to the last level
     * recorded by addNavPoint()/backLevel()/rootLevel(), if there is one.
     *
     * @param string $navTitle
     * @param string $navId
     * @param string $navClass
     * @param bool   $isNavHidden
     * @param string $writingDirection
     * @return NavPoint|bool The NavPoint created, or FALSE when none was created.
     */
    public function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
        $navPoint = FALSE;
        if (isset($navTitle) && isset($navClass)) {
            $navPoint = new NavPoint($navTitle, NULL, $navId, $navClass, $isNavHidden, $writingDirection);
            $this->addNavPoint($navPoint);
        }
        if ($this->lastLevel !== NULL) {
            $this->currentLevel = $this->lastLevel;
        }
        return $navPoint;
    }

    /**
     * Step back one chapter level.
     *
     * Subsequent chapters will be added to this chapters parent level.
     */
    public function backLevel() {
        $this->lastLevel = $this->currentLevel;
        $this->currentLevel = $this->currentLevel->getParent();
    }

    /**
     * Step back to the root level.
     *
     * Subsequent chapters will be added to the root NavMap.
     */
    public function rootLevel() {
        $this->lastLevel = $this->currentLevel;
        $this->currentLevel = $this->navMap;
    }

    /**
     * Step back to the given level.
     * Useful for returning to a previous level from deep within the structure.
     * Values below 2 will have the same effect as rootLevel()
     *
     * @param int $newLevel
     */
    public function setCurrentLevel($newLevel) {
        if ($newLevel <= 1) {
            $this->rootLevel();
            return;
        }
        while ($this->currentLevel->getLevel() > $newLevel) {
            $this->backLevel();
        }
    }

    /**
     * Get current level count.
     * The indentation of the current structure point.
     *
     * @return int current level count, as reported by the current node
     */
    public function getCurrentLevel() {
        return $this->currentLevel->getLevel();
    }

    /**
     * Add child NavPoints to current level.
     *
     * Whatever the current level's addNavPoint() returns is remembered as the
     * last level, which is what subLevel() descends into.
     *
     * @param NavPoint $navPoint
     */
    public function addNavPoint($navPoint) {
        $this->lastLevel = $this->currentLevel->addNavPoint($navPoint);
    }

    /**
     * Get the root navigation map.
     *
     * @return NavMap
     */
    public function getNavMap() {
        return $this->navMap;
    }

    /**
     * Register an additional <meta name="..." content="..." /> entry for the
     * NCX head.
     *
     * Both values are trimmed; the entry is dropped when either one is not a
     * string or is empty after trimming.
     *
     * @param string $name
     * @param string $content
     */
    public function addMetaEntry($name, $content) {
        $name = is_string($name) ? trim($name) : NULL;
        $content = is_string($content) ? trim($content) : NULL;

        if ($name != NULL && $content != NULL) {
            $this->meta[] = array($name => $content);
        }
    }

    /**
     * Serialise the table of contents as an ePub 2 NCX document.
     *
     * The DOCTYPE declaration is only emitted when the book version is ePub 2.
     *
     * @return string the complete NCX XML document
     */
    public function finalize() {
        // The nav map is finalised first: getNavLevels() is documented as only
        // being available after finalize() has been called.
        $nav = $this->navMap->finalize();

        $ncx = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
        if ($this->isEPubVersion2()) {
            $ncx .= "<!DOCTYPE ncx PUBLIC \"-//NISO//DTD ncx 2005-1//EN\"\n"
                . "  \"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd\">\n";
        }
        $ncx .= "<ncx xmlns=\"http://www.daisy.org/z3986/2005/ncx/\" version=\"2005-1\" xml:lang=\"" . $this->languageCode . "\" dir=\"" . $this->writingDirection . "\">\n"
            . "\t<head>\n"
            . "\t\t<meta name=\"dtb:uid\" content=\"" . $this->uid . "\" />\n"
            . "\t\t<meta name=\"dtb:depth\" content=\"" . $this->navMap->getNavLevels() . "\" />\n"
            . "\t\t<meta name=\"dtb:totalPageCount\" content=\"0\" />\n"
            . "\t\t<meta name=\"dtb:maxPageNumber\" content=\"0\" />\n";

        // Each stored entry is a single-pair array (name => content), see
        // addMetaEntry(). foreach replaces each(), which was removed in PHP 8.
        foreach ($this->meta as $metaEntry) {
            foreach ($metaEntry as $name => $content) {
                $ncx .= "\t\t<meta name=\"" . $name . "\" content=\"" . $content . "\" />\n";
            }
        }

        $ncx .= "\t</head>\n\n\t<docTitle>\n\t\t<text>"
            . $this->docTitle
            . "</text>\n\t</docTitle>\n\n\t<docAuthor>\n\t\t<text>"
            . $this->docAuthor
            . "</text>\n\t</docAuthor>\n\n"
            . $nav;

        return $ncx . "</ncx>\n";
    }

    /**
     * Serialise the table of contents as an ePub 3 XHTML navigation document,
     * including the landmarks section built by finalizeEPub3Landmarks().
     *
     * @param string $title       heading shown above the table of contents
     * @param string $cssFileName optional stylesheet href; no <link> is written when NULL
     * @return string the complete XHTML document
     */
    public function finalizeEPub3($title = "Table of Contents", $cssFileName = NULL) {
        $end = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\"\n"
            . "      xmlns:epub=\"http://www.idpf.org/2007/ops\"\n"
            . "      xml:lang=\"" . $this->languageCode . "\" lang=\"" . $this->languageCode . "\" dir=\"" . $this->writingDirection . "\">\n"
            . "\t<head>\n"
            . "\t\t<title>" . $this->docTitle . "</title>\n"
            . "\t\t<meta http-equiv=\"default-style\" content=\"text/html; charset=utf-8\"/>\n";
        if ($cssFileName !== NULL) {
            $end .= "\t\t<link rel=\"stylesheet\" href=\"" . $cssFileName . "\" type=\"text/css\"/>\n";
        }
        $end .= "\t</head>\n"
            . "\t<body epub:type=\"frontmatter toc\">\n"
            . "\t\t<header>\n"
            . "\t\t\t<h1>" . $title . "</h1>\n"
            . "\t\t</header>\n"
            . $this->navMap->finalizeEPub3()
            . $this->finalizeEPub3Landmarks()
            . "\t</body>\n"
            . "</html>\n";

        return $end;
    }

    /**
     * Build the references for the ePub 2 toc.
     * These are merely reference pages added to the end of the navMap though.
     *
     * When at least one reference is registered, a new top-level entry named
     * after $referencesTitle is added, followed by one NavPoint per type in
     * $referencesOrder that has a page in $referencesList.
     *
     * @return void
     */
    public function finalizeReferences() {
        if (!isset($this->referencesList) || sizeof($this->referencesList) === 0) {
            return;
        }

        $this->rootLevel();
        $this->subLevel($this->referencesTitle, $this->referencesId, $this->referencesClass);

        // $referencesOrder defaults to NULL; without an ordering there is
        // nothing to add below the heading entry.
        if (!is_array($this->referencesOrder)) {
            return;
        }

        $refId = 1;
        // foreach does not depend on the array's internal pointer, so the
        // order can be traversed again by finalizeEPub3Landmarks().
        foreach ($this->referencesOrder as $item => $descriptive) {
            if (array_key_exists($item, $this->referencesList)) {
                $name = (empty($this->referencesName[$item]) ? $descriptive : $this->referencesName[$item]);
                $navPoint = new NavPoint($name, $this->referencesList[$item], "ref-" . $refId++);
                $this->addNavPoint($navPoint);
            }
        }
    }

    /**
     * Build the landmarks for the ePub 3 toc.
     *
     * @return string the <nav epub:type="landmarks"> markup, or an empty
     *                string when no reference in $referencesOrder has a page
     *                registered in $referencesList
     */
    public function finalizeEPub3Landmarks() {
        if (!isset($this->referencesList) || sizeof($this->referencesList) === 0) {
            return "";
        }

        $li = "";
        // $referencesOrder defaults to NULL; treat that as "no landmarks".
        if (is_array($this->referencesOrder)) {
            // foreach (instead of each()) makes repeated calls produce the
            // same output, because no internal array pointer is consumed.
            foreach ($this->referencesOrder as $item => $descriptive) {
                if (array_key_exists($item, $this->referencesList)) {
                    $li .= "\t\t\t\t\t<li><a epub:type=\""
                        . $item
                        . "\" href=\"" . $this->referencesList[$item] . "\">"
                        . (empty($this->referencesName[$item]) ? $descriptive : $this->referencesName[$item])
                        . "</a></li>\n";
                }
            }
        }
        if (empty($li)) {
            return "";
        }

        return "\t\t\t<nav epub:type=\"landmarks\">\n"
            . "\t\t\t\t<h2"
            . ($this->writingDirection === EPub::DIRECTION_RIGHT_TO_LEFT ? " dir=\"rtl\"" : "")
            . ">" . $this->referencesTitle . "</h2>\n"
            . "\t\t\t\t<ol>\n"
            . $li
            . "\t\t\t\t</ol>\n"
            . "\t\t\t</nav>\n";
    }
}
378 | |||
379 | /** | ||
380 | * ePub NavMap class | ||
381 | */ | ||
class NavMap {
    const _VERSION = 3.00;

    private $navPoints = array();
    private $navLevels = 0;
    private $writingDirection = NULL;

    /**
     * Create an empty NavMap.
     *
     * @param string $writingDirection optional writing direction handed down to NavPoints that have none.
     */
    public function __construct($writingDirection = NULL) {
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Release the held NavPoints and counters.
     *
     * @return void
     */
    public function __destruct() {
        unset($this->navPoints, $this->navLevels, $this->writingDirection);
    }

    /**
     * Set the writing direction of this NavMap.
     *
     * Strings are stored trimmed; any other value resets the direction to NULL.
     *
     * @param string $writingDirection
     */
    public function setWritingDirection($writingDirection) {
        if (is_string($writingDirection)) {
            $this->writingDirection = trim($writingDirection);
        } else {
            $this->writingDirection = NULL;
        }
    }

    /**
     * @return string|null the stored writing direction.
     */
    public function getWritingDirection() {
        return $this->writingDirection;
    }

    /**
     * Add a navPoint to the root of the NavMap.
     *
     * A NavPoint without a writing direction of its own takes the one of this map.
     *
     * @param NavPoint $navPoint
     * @return NavPoint|NavMap the added NavPoint, or this NavMap when the argument was not a NavPoint.
     */
    public function addNavPoint($navPoint) {
        if (!is_object($navPoint) || get_class($navPoint) !== "NavPoint") {
            return $this;
        }

        $navPoint->setParent($this);
        if ($navPoint->getWritingDirection() == NULL) {
            $navPoint->setWritingDirection($this->writingDirection);
        }
        $this->navPoints[] = $navPoint;

        return $navPoint;
    }

    /**
     * The final max depth for the "dtb:depth" meta attribute.
     * Only meaningful after one of the finalize methods has been called.
     *
     * @return number
     */
    public function getNavLevels() {
        return $this->navLevels + 1;
    }

    /**
     * @return int always 1, the NavMap is the root level.
     */
    public function getLevel() {
        return 1;
    }

    /**
     * @return NavMap the NavMap is its own parent.
     */
    public function getParent() {
        return $this;
    }

    /**
     * Render the ePub 2 <navMap> element and record the depth reported by the NavPoints.
     *
     * @return string
     */
    public function finalize() {
        $playOrder = 0;
        $this->navLevels = 0;
        $xml = "\t<navMap>\n";

        if (count($this->navPoints) > 0) {
            $this->navLevels = 1;
            foreach ($this->navPoints as $point) {
                // $xml and $playOrder are passed by reference and extended by each NavPoint.
                $depth = $point->finalize($xml, $playOrder, 0);
                if ($depth > $this->navLevels) {
                    $this->navLevels = $depth;
                }
            }
        }

        $xml .= "\t</navMap>\n";
        return $xml;
    }

    /**
     * Render the ePub 3 <nav epub:type="toc"> element and record the depth reported by the NavPoints.
     *
     * @return string
     */
    public function finalizeEPub3() {
        $playOrder = 0;
        $this->navLevels = 0;
        $xml = "\t\t<nav epub:type=\"toc\" id=\"toc\">\n";

        if (count($this->navPoints) > 0) {
            $this->navLevels = 1;

            $xml .= "\t\t\t<ol epub:type=\"list\">\n";
            foreach ($this->navPoints as $point) {
                $depth = $point->finalizeEPub3($xml, $playOrder, 0);
                if ($depth > $this->navLevels) {
                    $this->navLevels = $depth;
                }
            }
            $xml .= "\t\t\t</ol>\n";
        }

        $xml .= "\t\t</nav>\n";
        return $xml;
    }
}
504 | |||
505 | /** | ||
506 | * ePub NavPoint class | ||
507 | */ | ||
class NavPoint {
    const _VERSION = 3.00;

    private $label = NULL;
    private $contentSrc = NULL;
    private $id = NULL;
    private $navClass = NULL;
    private $isNavHidden = FALSE;
    private $navPoints = array();
    private $parent = NULL;
    // Declared explicitly: the setter and getter below used it as an
    // undeclared (dynamic) property, which PHP 8.2 deprecates.
    private $writingDirection = NULL;

    /**
     * Class constructor.
     *
     * Only the label is required. When the id is left NULL it is generated during finalization.
     *
     * @param string $label
     * @param string $contentSrc
     * @param string $id
     * @param string $navClass
     * @param bool   $isNavHidden
     * @param string $writingDirection
     */
    function __construct($label, $contentSrc = NULL, $id = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
        $this->setLabel($label);
        $this->setContentSrc($contentSrc);
        $this->setId($id);
        $this->setNavClass($navClass);
        $this->setNavHidden($isNavHidden);
        $this->setWritingDirection($writingDirection);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset($this->label, $this->contentSrc, $this->id, $this->navClass);
        unset($this->isNavHidden, $this->navPoints, $this->parent, $this->writingDirection);
    }

    /**
     * Set the Text label for the NavPoint.
     *
     * Strings are stored trimmed; any other value resets the label to NULL.
     *
     * @param string $label
     */
    function setLabel($label) {
        $this->label = is_string($label) ? trim($label) : NULL;
    }

    /**
     * Get the Text label for the NavPoint.
     *
     * @return string Label
     */
    function getLabel() {
        return $this->label;
    }

    /**
     * Set the src reference for the NavPoint.
     *
     * finalize() (ePub 2) only emits a navPoint element when a src is set;
     * finalizeEPub3() falls back to a plain <span> label without one.
     *
     * @param string $contentSrc
     */
    function setContentSrc($contentSrc) {
        $this->contentSrc = is_string($contentSrc) ? trim($contentSrc) : NULL;
    }

    /**
     * Get the src reference for the NavPoint.
     *
     * @return string content src url.
     */
    function getContentSrc() {
        return $this->contentSrc;
    }

    /**
     * Set the parent for this NavPoint.
     *
     * Anything that is not exactly a NavPoint or a NavMap is ignored.
     *
     * @param NavPoint or NavMap $parent
     */
    function setParent($parent) {
        if (is_object($parent) &&
                (get_class($parent) === "NavPoint" || get_class($parent) === "NavMap")) {
            $this->parent = $parent;
        }
    }

    /**
     * Get the parent to this NavPoint.
     *
     * @return NavPoint, or NavMap if the parent is the root.
     */
    function getParent() {
        return $this->parent;
    }

    /**
     * Get the current level: 1 without a parent, otherwise the parent's level plus one.
     *
     * @return int level
     */
    function getLevel() {
        return $this->parent === NULL ? 1 : $this->parent->getLevel() + 1;
    }

    /**
     * Set the id for the NavPoint.
     *
     * A non-string id is stored as NULL and replaced by a generated one during finalization.
     *
     * @param string $id
     */
    function setId($id) {
        $this->id = is_string($id) ? trim($id) : NULL;
    }

    /**
     * Set the class to be used for this NavPoint.
     *
     * @param string $navClass
     */
    function setNavClass($navClass) {
        $this->navClass = is_string($navClass) ? trim($navClass) : NULL;
    }

    /**
     * Mark this NavPoint as hidden. finalize() skips hidden NavPoints together with their children.
     *
     * @param bool $isNavHidden only a strict TRUE hides the NavPoint.
     */
    function setNavHidden($isNavHidden) {
        $this->isNavHidden = $isNavHidden === TRUE;
    }

    /**
     * Set the writing direction to be used for this NavPoint.
     *
     * @param string $writingDirection
     */
    function setWritingDirection($writingDirection) {
        $this->writingDirection = is_string($writingDirection) ? trim($writingDirection) : NULL;
    }

    /**
     * @return string|null the stored writing direction.
     */
    function getWritingDirection() {
        return $this->writingDirection;
    }

    /**
     * Add child NavPoints for multi level NavMaps.
     *
     * A child without a writing direction of its own takes the one of this NavPoint.
     *
     * @param NavPoint $navPoint
     * @return NavPoint the added child, or this NavPoint when the argument was not a NavPoint.
     */
    function addNavPoint($navPoint) {
        if (is_object($navPoint) && get_class($navPoint) === "NavPoint") {
            $navPoint->setParent($this);
            if ($navPoint->getWritingDirection() == NULL) {
                $navPoint->setWritingDirection($this->writingDirection);
            }
            $this->navPoints[] = $navPoint;
            return $navPoint;
        }
        return $this;
    }

    /**
     * Append the ePub 2 (NCX) markup of this NavPoint and its children to $nav.
     *
     * @param string $nav       markup buffer, extended in place.
     * @param int    $playOrder running counter, incremented for every emitted navPoint.
     * @param int    $level     indentation level of this NavPoint.
     * @return int the deepest level reached below (and including) this NavPoint.
     */
    function finalize(&$nav = "", &$playOrder = 0, $level = 0) {
        $maxLevel = $level;
        $levelAdjust = 0;

        if ($this->isNavHidden) {
            return $maxLevel;
        }

        $hasContent = isset($this->contentSrc);
        $indent = str_repeat("\t", $level);

        if ($hasContent) {
            $playOrder++;

            if ($this->id == NULL) {
                $this->id = "navpoint-" . $playOrder;
            }
            $nav .= $indent . "\t\t<navPoint id=\"" . $this->id . "\" playOrder=\"" . $playOrder . "\">\n"
                . $indent . "\t\t\t<navLabel>\n"
                . $indent . "\t\t\t\t<text>" . $this->label . "</text>\n"
                . $indent . "\t\t\t</navLabel>\n"
                . $indent . "\t\t\t<content src=\"" . $this->contentSrc . "\" />\n";
        } else {
            // No element is written for this NavPoint; its children are
            // passed one extra level.
            $levelAdjust++;
        }

        if (sizeof($this->navPoints) > 0) {
            $maxLevel++;
            foreach ($this->navPoints as $navPoint) {
                $retLevel = $navPoint->finalize($nav, $playOrder, ($level + 1 + $levelAdjust));
                if ($retLevel > $maxLevel) {
                    $maxLevel = $retLevel;
                }
            }
        }

        if ($hasContent) {
            $nav .= $indent . "\t\t</navPoint>\n";
        }

        return $maxLevel;
    }

    /**
     * Append the ePub 3 (navigation document) markup of this NavPoint and its children to $nav.
     *
     * NOTE(review): $playOrder is never incremented here, so every NavPoint
     * whose id is still NULL receives the same generated id. Left unchanged
     * because ids may already have been assigned by finalize(); confirm the
     * intended numbering before changing it.
     *
     * @param string $nav            markup buffer, extended in place.
     * @param int    $playOrder      counter used for generated ids.
     * @param int    $level          indentation level of this NavPoint.
     * @param string $subLevelClass  optional class attribute for nested <ol> elements.
     * @param bool   $subLevelHidden when TRUE nested <ol> elements get hidden="hidden".
     * @return int the deepest level reached below (and including) this NavPoint.
     */
    function finalizeEPub3(&$nav = "", &$playOrder = 0, $level = 0, $subLevelClass = NULL, $subLevelHidden = FALSE) {
        $maxLevel = $level;

        if ($this->id == NULL) {
            $this->id = "navpoint-" . $playOrder;
        }
        $indent = str_repeat("\t", $level) . "\t\t\t\t";

        $nav .= $indent . "<li id=\"" . $this->id . "\"";
        if (isset($this->writingDirection)) {
            $nav .= " dir=\"" . $this->writingDirection . "\"";
        }
        $nav .= ">\n";

        if (isset($this->contentSrc)) {
            $nav .= $indent . "\t<a href=\"" . $this->contentSrc . "\">" . $this->label . "</a>\n";
        } else {
            $nav .= $indent . "\t<span>" . $this->label . "</span>\n";
        }

        if (sizeof($this->navPoints) > 0) {
            $maxLevel++;

            $nav .= $indent . "\t<ol epub:type=\"list\"";
            if (isset($subLevelClass)) {
                $nav .= " class=\"" . $subLevelClass . "\"";
            }
            if ($subLevelHidden) {
                $nav .= " hidden=\"hidden\"";
            }
            $nav .= ">\n";

            foreach ($this->navPoints as $navPoint) {
                $retLevel = $navPoint->finalizeEPub3($nav, $playOrder, ($level + 2), $subLevelClass, $subLevelHidden);
                if ($retLevel > $maxLevel) {
                    $maxLevel = $retLevel;
                }
            }
            $nav .= $indent . "\t</ol>\n";
        }

        $nav .= $indent . "</li>\n";

        return $maxLevel;
    }
}
782 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.OPF.php b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php new file mode 100644 index 00000000..803a2108 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.OPF.php | |||
@@ -0,0 +1,1226 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * ePub OPF file structure | ||
4 | * | ||
5 | * @author A. Grandt <php@grandt.com> | ||
6 | * @copyright 2009-2014 A. Grandt | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 3.20 | ||
9 | */ | ||
class Opf {
    const _VERSION = 3.20;

    /*
     * Core media types.
     * These types are the only guaranteed mime types any ePub reader must understand.
     * Any other type must define a fallback whose fallback chain ends in one of these.
     */
    const TYPE_GIF = "image/gif";
    const TYPE_JPEG = "image/jpeg";
    const TYPE_PNG = "image/png";
    const TYPE_SVG = "image/svg+xml";
    const TYPE_XHTML = "application/xhtml+xml";
    const TYPE_DTBOOK = "application/x-dtbook+xml";
    const TYPE_CSS = "text/css";
    const TYPE_XML = "application/xml";
    const TYPE_OEB1_DOC = "text/x-oeb1-document"; // Deprecated
    const TYPE_OEB1_CSS = "text/x-oeb1-css"; // Deprecated
    const TYPE_NCX = "application/x-dtbncx+xml";

    private $bookVersion = EPub::BOOK_VERSION_EPUB2;
    private $ident = "BookId";

    public $date = NULL;
    public $metadata = NULL;
    public $manifest = NULL;
    public $spine = NULL;
    public $guide = NULL;

    /**
     * Create an OPF structure with empty metadata, manifest, spine and guide sections.
     *
     * @param string $ident       id used for the package's unique-identifier attribute.
     * @param string $bookVersion one of the EPub::BOOK_VERSION_* values.
     */
    public function __construct($ident = "BookId", $bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $this->setIdent($ident);
        $this->setVersion($bookVersion);
        $this->metadata = new Metadata();
        $this->manifest = new Manifest();
        $this->spine = new Spine();
        $this->guide = new Guide();
    }

    /**
     * Release all sections.
     *
     * @return void
     */
    public function __destruct() {
        unset($this->bookVersion, $this->ident, $this->date);
        unset($this->metadata, $this->manifest, $this->spine, $this->guide);
    }

    /**
     * Set the ePub version written to the package element.
     *
     * @param string $bookVersion trimmed when a string, otherwise EPub::BOOK_VERSION_EPUB2 is used.
     */
    public function setVersion($bookVersion) {
        if (is_string($bookVersion)) {
            $this->bookVersion = trim($bookVersion);
        } else {
            $this->bookVersion = EPub::BOOK_VERSION_EPUB2;
        }
    }

    /**
     * @return bool TRUE when the configured version is EPub::BOOK_VERSION_EPUB2.
     */
    public function isEPubVersion2() {
        return EPub::BOOK_VERSION_EPUB2 === $this->bookVersion;
    }

    /**
     * Set the id referenced by the package's unique-identifier attribute.
     *
     * @param string $ident trimmed when a string, otherwise "BookId" is used.
     */
    public function setIdent($ident = "BookId") {
        if (is_string($ident)) {
            $this->ident = trim($ident);
        } else {
            $this->ident = "BookId";
        }
    }

    /**
     * Render the complete package document: metadata, manifest, spine and,
     * when it holds at least one reference, the guide.
     *
     * @return string
     */
    public function finalize() {
        $sections = array(
            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
                . "<package xmlns=\"http://www.idpf.org/2007/opf\" unique-identifier=\"" . $this->ident . "\" version=\"" . $this->bookVersion . "\">\n",
            $this->metadata->finalize($this->bookVersion, $this->date),
            $this->manifest->finalize($this->bookVersion),
            $this->spine->finalize(),
        );

        if ($this->guide->length() > 0) {
            $sections[] = $this->guide->finalize();
        }

        $sections[] = "</package>\n";

        return implode("", $sections);
    }

    // Convenience functions:

    /**
     * Add the title, language and identifier Dublin Core entries.
     * The identifier entry carries this package's ident as its id attribute.
     *
     * @param string $title
     * @param string $language
     * @param string $identifier
     * @param string $identifierScheme
     */
    public function initialize($title, $language, $identifier, $identifierScheme) {
        $this->metadata->addDublinCore(new DublinCore("title", $title));
        $this->metadata->addDublinCore(new DublinCore("language", $language));

        $identifierEntry = new DublinCore("identifier", $identifier);
        $identifierEntry->addAttr("id", $this->ident);
        $identifierEntry->addOpfAttr("scheme", $identifierScheme);
        $this->metadata->addDublinCore($identifierEntry);
    }

    /**
     * Add an item to the manifest.
     *
     * @param string $id
     * @param string $href
     * @param string $mediaType
     * @param string $properties
     */
    public function addItem($id, $href, $mediaType, $properties = NULL) {
        $entry = new Item($id, $href, $mediaType, $properties);
        $this->manifest->addItem($entry);
    }

    /**
     * Add an item reference to the spine.
     *
     * @param string $idref
     * @param bool   $linear
     */
    public function addItemRef($idref, $linear = TRUE) {
        $entry = new Itemref($idref, $linear);
        $this->spine->addItemref($entry);
    }

    /**
     * Add a reference to the guide.
     *
     * @param string $type
     * @param string $title
     * @param string $href
     */
    public function addReference($type, $title, $href) {
        $entry = new Reference($type, $title, $href);
        $this->guide->addReference($entry);
    }

    /**
     * Add a Dublin Core metadata entry.
     *
     * @param string $name
     * @param string $value
     */
    public function addDCMeta($name, $value) {
        $entry = new DublinCore($name, $value);
        $this->metadata->addDublinCore($entry);
    }

    /**
     * Add a plain meta entry.
     *
     * @param string $name
     * @param string $content
     */
    public function addMeta($name, $content) {
        $this->metadata->addMeta($name, $content);
    }

    /**
     * Add a dc:creator entry.
     *
     * @param string $name
     * @param string $fileAs
     * @param string $role Use the MarcCode constants
     */
    public function addCreator($name, $fileAs = NULL, $role = NULL) {
        $this->addPerson(DublinCore::CREATOR, $name, $fileAs, $role);
    }

    /**
     * Add a dc:contributor entry.
     *
     * @param string $name
     * @param string $fileAs
     * @param string $role Use the MarcCode constants
     */
    public function addColaborator($name, $fileAs = NULL, $role = NULL) {
        $this->addPerson(DublinCore::CONTRIBUTOR, $name, $fileAs, $role);
    }

    /**
     * Shared implementation of addCreator() and addColaborator().
     */
    private function addPerson($dcName, $name, $fileAs, $role) {
        $person = new DublinCore($dcName, trim($name));

        if ($fileAs !== NULL) {
            $person->addOpfAttr("file-as", trim($fileAs));
        }

        if ($role !== NULL) {
            $person->addOpfAttr("role", trim($role));
        }

        $this->metadata->addDublinCore($person);
    }
}
228 | |||
229 | /** | ||
230 | * ePub OPF Metadata structures | ||
231 | */ | ||
class Metadata {
    const _VERSION = 3.00;

    private $dc = array();
    private $meta = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->dc, $this->meta);
    }

    /**
     * Register a Dublin Core entry. Anything that is not exactly a DublinCore object is ignored.
     *
     * @param DublinCore $dc
     */
    function addDublinCore($dc) {
        if (is_object($dc) && get_class($dc) === "DublinCore") {
            $this->dc[] = $dc;
        }
    }

    /**
     * Register a <meta name="..." content="..." /> entry.
     *
     * Both values must be strings and are stored trimmed; otherwise the call
     * is ignored. Previously a non-string name combined with a set content
     * stored an entry under an empty name.
     *
     * @param string $name
     * @param string $content
     */
    function addMeta($name, $content) {
        if (!is_string($name) || !is_string($content)) {
            return;
        }
        $this->meta[] = array(trim($name) => trim($content));
    }

    /**
     * Render the <metadata> section.
     *
     * For any version other than ePub 2 a dcterms:modified entry is written,
     * using $date or the current time when $date is not set.
     *
     * @param string $bookVersion
     * @param int    $date Unix timestamp
     * @return string
     */
    function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2, $date = NULL) {
        $metadata = "\t<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
        if ($bookVersion === EPub::BOOK_VERSION_EPUB2) {
            $metadata .= "\t\txmlns:opf=\"http://www.idpf.org/2007/opf\"\n\t\txmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
        } else {
            $metadata .= "\t\txmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n";
            if (!isset($date)) {
                $date = time();
            }
            $metadata .= "\t\t<meta property=\"dcterms:modified\">" . gmdate('Y-m-d\TH:i:s\Z', $date) . "</meta>\n";
        }

        foreach ($this->dc as $dc) {
            $metadata .= $dc->finalize($bookVersion);
        }

        // Each stored entry is a one-element array(name => content). foreach
        // replaces each(), which was removed in PHP 8.
        foreach ($this->meta as $data) {
            foreach ($data as $name => $content) {
                $metadata .= "\t\t<meta name=\"" . $name . "\" content=\"" . $content . "\" />\n";
            }
        }

        return $metadata . "\t</metadata>\n";
    }
}
314 | |||
315 | /** | ||
316 | * ePub OPF Dublin Core (dc:) Metadata structures | ||
317 | */ | ||
class DublinCore {
    const _VERSION = 3.00;

    const CONTRIBUTOR = "contributor";
    const COVERAGE = "coverage";
    const CREATOR = "creator";
    const DATE = "date";
    const DESCRIPTION = "description";
    const FORMAT = "format";
    const IDENTIFIER = "identifier";
    const LANGUAGE = "language";
    const PUBLISHER = "publisher";
    const RELATION = "relation";
    const RIGHTS = "rights";
    const SOURCE = "source";
    const SUBJECT = "subject";
    const TITLE = "title";
    const TYPE = "type";

    private $dcName = NULL;
    private $dcValue = NULL;
    private $attr = array();
    private $opfAttr = array();

    /**
     * Class constructor.
     *
     * @param string $name  element name without the "dc:" prefix, see the class constants.
     * @param string $value element content.
     */
    function __construct($name, $value) {
        $this->setDc($name, $value);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->dcName, $this->dcValue, $this->attr, $this->opfAttr);
    }

    /**
     * Set the element name and value.
     *
     * The name must be a string (stored trimmed) and the value must be set
     * (stored cast to string). If either is missing, both are reset to NULL,
     * so a stale value can never remain next to an invalid name.
     *
     * @param string $name
     * @param string $value
     */
    function setDc($name, $value) {
        if (is_string($name) && isset($value)) {
            $this->dcName = trim($name);
            $this->dcValue = (string)$value;
        } else {
            $this->dcName = NULL;
            $this->dcValue = NULL;
        }
    }

    /**
     * Add a plain attribute to the element.
     *
     * Both arguments must be strings and are stored trimmed; otherwise the
     * call is ignored. Previously a non-string name stored the value under
     * an empty attribute name.
     *
     * @param string $attrName
     * @param string $attrValue
     */
    function addAttr($attrName, $attrValue) {
        if (is_string($attrName) && is_string($attrValue)) {
            $this->attr[trim($attrName)] = trim($attrValue);
        }
    }

    /**
     * Add an "opf:" prefixed attribute. These are only written for ePub 2 books.
     *
     * Both arguments must be strings and are stored trimmed; otherwise the call is ignored.
     *
     * @param string $opfAttrName
     * @param string $opfAttrValue
     */
    function addOpfAttr($opfAttrName, $opfAttrValue) {
        if (is_string($opfAttrName) && is_string($opfAttrValue)) {
            $this->opfAttr[trim($opfAttrName)] = trim($opfAttrValue);
        }
    }

    /**
     * Render the element as one line of the metadata section.
     *
     * @param string $bookVersion
     * @return string the element, or an empty string when no valid name/value pair is stored.
     */
    function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        // An entry without a name would render as "<dc:>", which is not
        // well-formed XML.
        if ($this->dcName === NULL) {
            return "";
        }

        $dc = "\t\t<dc:" . $this->dcName;

        // foreach replaces each(): each() was removed in PHP 8, and because
        // it never rewinds the array a second finalize() call wrote no
        // attributes at all.
        foreach ($this->attr as $name => $content) {
            $dc .= " " . $name . "=\"" . $content . "\"";
        }

        if ($bookVersion === EPub::BOOK_VERSION_EPUB2) {
            foreach ($this->opfAttr as $name => $content) {
                $dc .= " opf:" . $name . "=\"" . $content . "\"";
            }
        }

        return $dc . ">" . $this->dcValue . "</dc:" . $this->dcName . ">\n";
    }
}
435 | |||
436 | /** | ||
437 | * ePub OPF Manifest structure | ||
438 | */ | ||
class Manifest {
    const _VERSION = 3.00;

    private $items = array();

    /**
     * Create an empty manifest.
     */
    public function __construct() {
    }

    /**
     * Release the held items.
     *
     * @return void
     */
    public function __destruct() {
        unset($this->items);
    }

    /**
     * Append an item to the manifest. Anything that is not exactly an Item object is ignored.
     *
     * @param Item $item
     */
    public function addItem($item) {
        if (!is_object($item) || get_class($item) !== "Item") {
            return;
        }
        $this->items[] = $item;
    }

    /**
     * Render the <manifest> section with one line per item, in insertion order.
     *
     * @param string $bookVersion passed on to every item.
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        $rows = array();
        foreach ($this->items as $entry) {
            $rows[] = $entry->finalize($bookVersion);
        }

        return "\n\t<manifest>\n" . implode("", $rows) . "\t</manifest>\n";
    }
}
486 | |||
487 | /** | ||
488 | * ePub OPF Item structure | ||
489 | */ | ||
class Item {
    const _VERSION = 3.00;

    private $id = NULL;
    private $href = NULL;
    private $mediaType = NULL;
    private $properties = NULL;
    private $requiredNamespace = NULL;
    private $requiredModules = NULL;
    private $fallback = NULL;
    private $fallbackStyle = NULL;

    /**
     * Create a manifest item.
     *
     * @param string $id
     * @param string $href
     * @param string $mediaType
     * @param string $properties only written for ePub 3 books.
     */
    public function __construct($id, $href, $mediaType, $properties = NULL) {
        $this->setId($id);
        $this->setHref($href);
        $this->setMediaType($mediaType);
        $this->setProperties($properties);
    }

    /**
     * Release all attribute values.
     *
     * @return void
     */
    public function __destruct() {
        unset($this->id, $this->href, $this->mediaType, $this->properties);
        unset($this->requiredNamespace, $this->requiredModules, $this->fallback, $this->fallbackStyle);
    }

    /**
     * Shared normalisation of all setters: strings are trimmed, everything else becomes NULL.
     */
    private static function clean($value) {
        if (!is_string($value)) {
            return NULL;
        }
        return trim($value);
    }

    /**
     * @param string $id value of the id attribute.
     */
    public function setId($id) {
        $this->id = self::clean($id);
    }

    /**
     * @param string $href value of the href attribute.
     */
    public function setHref($href) {
        $this->href = self::clean($href);
    }

    /**
     * @param string $mediaType value of the media-type attribute.
     */
    public function setMediaType($mediaType) {
        $this->mediaType = self::clean($mediaType);
    }

    /**
     * @param string $properties value of the properties attribute (ePub 3 only).
     */
    public function setProperties($properties) {
        $this->properties = self::clean($properties);
    }

    /**
     * @param string $requiredNamespace value of the required-namespace attribute.
     */
    public function setRequiredNamespace($requiredNamespace) {
        $this->requiredNamespace = self::clean($requiredNamespace);
    }

    /**
     * @param string $requiredModules value of the required-modules attribute;
     *                                only written when a required namespace is set as well.
     */
    public function setRequiredModules($requiredModules) {
        $this->requiredModules = self::clean($requiredModules);
    }

    /**
     * @param string $fallback value of the fallback attribute.
     */
    public function setfallback($fallback) {
        $this->fallback = self::clean($fallback);
    }

    /**
     * @param string $fallbackStyle value of the fallback-style attribute.
     */
    public function setFallbackStyle($fallbackStyle) {
        $this->fallbackStyle = self::clean($fallbackStyle);
    }

    /**
     * Render the item as one <item ... /> element of the manifest.
     *
     * @param string $bookVersion
     * @return string
     */
    public function finalize($bookVersion = EPub::BOOK_VERSION_EPUB2) {
        // Every fragment ends with a blank so the next attribute, or the
        // closing "/>", can be appended directly.
        $fragments = array(
            "\t\t<item id=\"" . $this->id . "\" href=\"" . $this->href . "\" media-type=\"" . $this->mediaType . "\" ",
        );

        if ($bookVersion === EPub::BOOK_VERSION_EPUB3 && isset($this->properties)) {
            $fragments[] = "properties=\"" . $this->properties . "\" ";
        }

        if (isset($this->requiredNamespace)) {
            $fragments[] = "\n\t\t\trequired-namespace=\"" . $this->requiredNamespace . "\" ";
            if (isset($this->requiredModules)) {
                $fragments[] = "required-modules=\"" . $this->requiredModules . "\" ";
            }
        }

        if (isset($this->fallback)) {
            $fragments[] = "\n\t\t\tfallback=\"" . $this->fallback . "\" ";
        }

        if (isset($this->fallbackStyle)) {
            $fragments[] = "\n\t\t\tfallback-style=\"" . $this->fallbackStyle . "\" ";
        }

        $fragments[] = "/>\n";

        return implode("", $fragments);
    }
}
629 | |||
/**
 * ePub OPF Spine structure.
 *
 * Collects Itemref entries, keyed by their idref, and serialises them as the
 * <spine> element of the OPF document.
 */
class Spine {
    const _VERSION = 1.00;

    /** Itemref objects keyed by idref, in insertion order. */
    private $itemrefs = array();

    /** Value of the spine's "toc" attribute. */
    private $toc = NULL;

    /**
     * Class constructor.
     *
     * @param string $toc value for the spine's toc attribute, defaults to "ncx".
     */
    function __construct($toc = "ncx") {
        $this->setToc($toc);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->itemrefs, $this->toc);
    }

    /**
     * Set the value of the spine's toc attribute.
     *
     * @param string $toc stored trimmed; a non-string value resets it to NULL.
     */
    function setToc($toc) {
        $this->toc = is_string($toc) ? trim($toc) : NULL;
    }

    /**
     * Add an item reference to the spine.
     *
     * Anything that is not an Itemref (or a subclass of it) is silently
     * ignored, and so is an Itemref whose idref is already registered:
     * the first entry for a given idref wins.
     *
     * @param Itemref $itemref
     */
    function addItemref($itemref) {
        // instanceof covers the NULL / non-object cases and, unlike an exact
        // get_class() comparison, also accepts subclasses of Itemref.
        if (!($itemref instanceof Itemref)) {
            return;
        }

        $idref = $itemref->getIdref();
        if (!isset($this->itemrefs[$idref])) {
            $this->itemrefs[$idref] = $itemref;
        }
    }

    /**
     * Serialise the spine and all of its item references.
     *
     * @return string the <spine> element, including the surrounding newlines.
     */
    function finalize() {
        $spine = "\n\t<spine toc=\"" . $this->toc . "\">\n";
        foreach ($this->itemrefs as $itemref) {
            $spine .= $itemref->finalize();
        }
        return $spine . "\t</spine>\n";
    }
}
696 | |||
/**
 * ePub OPF ItemRef structure.
 *
 * One <itemref> entry of the spine: an idref plus a flag telling whether the
 * referenced item is linear.
 */
class Itemref {
    const _VERSION = 3.00;

    private $idref = NULL;
    private $linear = TRUE;

    /**
     * Class constructor.
     *
     * @param string $idref  id being referenced.
     * @param bool   $linear see setLinear().
     */
    function __construct($idref, $linear = TRUE) {
        $this->setIdref($idref);
        $this->setLinear($linear);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->idref, $this->linear);
    }

    /**
     * Store the referenced id.
     *
     * @param string $idref kept trimmed; a non-string value resets it to NULL.
     */
    function setIdref($idref) {
        if (is_string($idref)) {
            $this->idref = trim($idref);
            return;
        }
        $this->idref = NULL;
    }

    /**
     * Get the referenced id.
     *
     * @return string $idref
     */
    function getIdref() {
        return $this->idref;
    }

    /**
     * Mark the reference as linear or not.
     *
     * @param bool $linear only the boolean TRUE counts as linear; every other
     *                     value (including truthy non-booleans) does not.
     */
    function setLinear($linear = TRUE) {
        $this->linear = ($linear === TRUE);
    }

    /**
     * Serialise the reference.
     *
     * @return string the <itemref> element; linear="no" is added for
     *                non-linear references.
     */
    function finalize() {
        $linearAttribute = $this->linear ? "" : " linear=\"no\"";

        return "\t\t<itemref idref=\"" . $this->idref . "\"" . $linearAttribute . " />\n";
    }
}
769 | |||
/**
 * ePub OPF Guide structure.
 *
 * Collects Reference entries and serialises them as the <guide> element of
 * the OPF document.
 */
class Guide {
    const _VERSION = 3.00;

    /** Reference objects in insertion order. */
    private $references = array();

    /**
     * Class constructor.
     *
     * @return void
     */
    function __construct() {
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->references);
    }

    /**
     * Number of references currently held by the guide.
     *
     * @return int
     */
    function length() {
        return count($this->references);
    }

    /**
     * Append a reference to the guide.
     *
     * Anything that is not a Reference (or a subclass of it) is silently
     * ignored.
     *
     * @param Reference $reference
     */
    function addReference($reference) {
        // instanceof covers the NULL / non-object cases and, unlike an exact
        // get_class() comparison, also accepts subclasses of Reference.
        if ($reference instanceof Reference) {
            $this->references[] = $reference;
        }
    }

    /**
     * Serialise the guide and all of its references.
     *
     * @return string the <guide> element, or an empty string when the guide
     *                holds no references.
     */
    function finalize() {
        if (count($this->references) === 0) {
            return "";
        }

        $ref = "\n\t<guide>\n";
        foreach ($this->references as $reference) {
            $ref .= $reference->finalize();
        }
        return $ref . "\t</guide>\n";
    }
}
834 | |||
/**
 * A single guide reference, plus the constants naming the reference types.
 */
class Reference {
    const _VERSION = 1.00;

    /* REFERENCE types are derived from the "Chicago Manual of Style"
     */

    /** Acknowledgements page */
    const ACKNOWLEDGEMENTS = "acknowledgements";

    /** Bibliography page */
    const BIBLIOGRAPHY = "bibliography";

    /** Colophon page */
    const COLOPHON = "colophon";

    /** Copyright page */
    const COPYRIGHT_PAGE = "copyright-page";

    /** Dedication */
    const DEDICATION = "dedication";

    /** Epigraph */
    const EPIGRAPH = "epigraph";

    /** Foreword */
    const FOREWORD = "foreword";

    /** Glossary page */
    const GLOSSARY = "glossary";

    /** back-of-book style index */
    const INDEX = "index";

    /** List of illustrations */
    const LIST_OF_ILLUSTRATIONS = "loi";

    /** List of tables */
    const LIST_OF_TABLES = "lot";

    /** Notes page */
    const NOTES = "notes";

    /** Preface page */
    const PREFACE = "preface";

    /** Table of contents */
    const TABLE_OF_CONTENTS = "toc";

    /** Page with possibly title, author, publisher, and other metadata */
    const TITLE_PAGE = "titlepage";

    /** First page of the book, ie. first page of the first chapter */
    const TEXT = "text";

    // ******************
    // ePub3 constants
    // ******************

    // Document partitions
    /** The publications cover(s), jacket information, etc. This is officially in ePub3, but works for ePub 2 as well */
    const COVER = "cover";

    /** Preliminary material to the content body, such as tables of contents, dedications, etc. */
    const FRONTMATTER = "frontmatter";

    /** The main (body) content of a document. */
    const BODYMATTER = "bodymatter";

    /** Ancillary material occurring after the document body, such as indices, appendices, etc. */
    const BACKMATTER = "backmatter";


    private $type = NULL;
    private $title = NULL;
    private $href = NULL;

    /**
     * Class constructor.
     *
     * @param string $type
     * @param string $title
     * @param string $href
     */
    function __construct($type, $title, $href) {
        $this->setType($type);
        $this->setTitle($title);
        $this->setHref($href);
    }

    /**
     * Class destructor
     *
     * @return void
     */
    function __destruct() {
        unset ($this->type, $this->title, $this->href);
    }

    /**
     * Shared normalisation for the three setters.
     *
     * @param mixed $value
     * @return string|null the trimmed string, or NULL for any non-string value.
     */
    private static function trimmedOrNull($value) {
        if (!is_string($value)) {
            return NULL;
        }
        return trim($value);
    }

    /**
     * Set the reference type written to the "type" attribute.
     *
     * @param string $type stored trimmed; a non-string value resets it to NULL.
     */
    function setType($type) {
        $this->type = self::trimmedOrNull($type);
    }

    /**
     * Set the text written to the "title" attribute.
     *
     * @param string $title stored trimmed; a non-string value resets it to NULL.
     */
    function setTitle($title) {
        $this->title = self::trimmedOrNull($title);
    }

    /**
     * Set the target written to the "href" attribute.
     *
     * @param string $href stored trimmed; a non-string value resets it to NULL.
     */
    function setHref($href) {
        $this->href = self::trimmedOrNull($href);
    }

    /**
     * Serialise the reference.
     *
     * @return string the <reference> element; values are written as stored,
     *                without any escaping.
     */
    function finalize() {
        $element  = "\t\t<reference type=\"" . $this->type . "\"";
        $element .= " title=\"" . $this->title . "\"";
        $element .= " href=\"" . $this->href . "\" />\n";

        return $element;
    }
}
976 | |||
/**
 * Common Marc codes.
 * Ref: http://www.loc.gov/marc/relators/
 *
 * Each constant holds the three-letter MARC relator code for the role named
 * by the constant.
 */
class MarcCode {
    const _VERSION = 3.00;

    /**
     * Adapter
     *
     * Use for a person who
     * 1) reworks a musical composition, usually for a different medium, or
     * 2) rewrites novels or stories for motion pictures or other audiovisual medium.
     */
    const ADAPTER = "adp";

    /**
     * Annotator
     *
     * Use for a person who writes manuscript annotations on a printed item.
     */
    const ANNOTATOR = "ann";

    /**
     * Arranger
     *
     * Use for a person who transcribes a musical composition, usually for a different
     * medium from that of the original; in an arrangement the musical substance remains
     * essentially unchanged.
     */
    const ARRANGER = "arr";

    /**
     * Artist
     *
     * Use for a person (e.g., a painter) who conceives, and perhaps also implements,
     * an original graphic design or work of art, if specific codes (e.g., [egr],
     * [etr]) are not desired. For book illustrators, prefer Illustrator [ill].
     */
    const ARTIST = "art";

    /**
     * Associated name
     *
     * Use as a general relator for a name associated with or found in an item or
     * collection, or which cannot be determined to be that of a Former owner [fmo]
     * or other designated relator indicative of provenance.
     */
    const ASSOCIATED_NAME = "asn";

    /**
     * Author
     *
     * Use for a person or corporate body chiefly responsible for the intellectual
     * or artistic content of a work. This term may also be used when more than one
     * person or body bears such responsibility.
     */
    const AUTHOR = "aut";

    /**
     * Author in quotations or text extracts
     *
     * Use for a person whose work is largely quoted or extracted in a work to which
     * he or she did not contribute directly. Such quotations are found particularly
     * in exhibition catalogs, collections of photographs, etc.
     */
    const AUTHOR_IN_QUOTES = "aqt";

    /**
     * Author of afterword, colophon, etc.
     *
     * Use for a person or corporate body responsible for an afterword, postface,
     * colophon, etc. but who is not the chief author of a work.
     */
    const AUTHOR_OF_AFTERWORD = "aft";

    /**
     * Author of introduction, etc.
     *
     * Use for a person or corporate body responsible for an introduction, preface,
     * foreword, or other critical matter, but who is not the chief author.
     */
    const AUTHOR_OF_INTRO = "aui";

    /**
     * Bibliographic antecedent
     *
     * Use for the author responsible for a work upon which the work represented by
     * the catalog record is based. This can be appropriate for adaptations, sequels,
     * continuations, indexes, etc.
     */
    const BIB_ANTECEDENT = "ant";

    /**
     * Book producer
     *
     * Use for the person or firm responsible for the production of books and other
     * print media, if specific codes (e.g., [bkd], [egr], [tyd], [prt]) are not desired.
     */
    const BOOK_PRODUCER = "bkp";

    /**
     * Collaborator
     *
     * Use for a person or corporate body that takes a limited part in the elaboration
     * of a work of another author or that brings complements (e.g., appendices, notes)
     * to the work of another author.
     */
    const COLLABORATOR = "clb";

    /**
     * Collaborator (misspelled name, kept so existing callers keep working).
     *
     * @deprecated use MarcCode::COLLABORATOR
     */
    const COLABORATOR = "clb";

    /**
     * Commentator
     *
     * Use for a person who provides interpretation, analysis, or a discussion of the
     * subject matter on a recording, motion picture, or other audiovisual medium.
     */
    const COMMENTATOR = "cmm";

    /**
     * Compiler
     *
     * Use for a person who produces a work or publication by selecting
     * and putting together material from the works of various persons or bodies.
     */
    const COMPILER = "com";

    /**
     * Designer
     *
     * Use for a person or organization responsible for design if specific codes (e.g.,
     * [bkd], [tyd]) are not desired.
     */
    const DESIGNER = "dsr";

    /**
     * Editor
     *
     * Use for a person who prepares for publication a work not primarily his/her own,
     * such as by elucidating text, adding introductory or other critical matter, or
     * technically directing an editorial staff.
     */
    const EDITOR = "edt";

    /**
     * Editor (misspelled name, kept so existing callers keep working).
     *
     * @deprecated use MarcCode::EDITOR
     */
    const EDITORT = "edt";

    /**
     * Illustrator
     *
     * Use for the person who conceives, and perhaps also implements, a design or
     * illustration, usually to accompany a written text.
     */
    const ILLUSTRATOR = "ill";

    /**
     * Lyricist
     *
     * Use for the writer of the text of a song.
     */
    const LYRICIST = "lyr";

    /**
     * Metadata contact
     *
     * Use for the person or organization primarily responsible for compiling and
     * maintaining the original description of a metadata set (e.g., geospatial
     * metadata set).
     */
    const METADATA_CONTACT = "mdc";

    /**
     * Musician
     *
     * Use for the person who performs music or contributes to the musical content
     * of a work when it is not possible or desirable to identify the function more
     * precisely.
     */
    const MUSICIAN = "mus";

    /**
     * Narrator
     *
     * Use for the speaker who relates the particulars of an act, occurrence, or
     * course of events.
     */
    const NARRATOR = "nrt";

    /**
     * Other
     *
     * Use for relator codes from other lists which have no equivalent in the MARC
     * list or for terms which have not been assigned a code.
     */
    const OTHER = "oth";

    /**
     * Photographer
     *
     * Use for the person or organization responsible for taking photographs, whether
     * they are used in their original form or as reproductions.
     */
    const PHOTOGRAPHER = "pht";

    /**
     * Printer
     *
     * Use for the person or organization who prints texts, whether from type or plates.
     */
    const PRINTER = "prt";

    /**
     * Redactor
     *
     * Use for a person who writes or develops the framework for an item without
     * being intellectually responsible for its content.
     */
    const REDACTOR = "red";

    /**
     * Reviewer
     *
     * Use for a person or corporate body responsible for the review of book, motion
     * picture, performance, etc.
     */
    const REVIEWER = "rev";

    /**
     * Sponsor
     *
     * Use for the person or agency that issued a contract, or under whose auspices
     * a work has been written, printed, published, etc.
     */
    const SPONSOR = "spn";

    /**
     * Thesis advisor
     *
     * Use for the person under whose supervision a degree candidate develops and
     * presents a thesis, memoir, or text of a dissertation.
     */
    const THESIS_ADVISOR = "ths";

    /**
     * Transcriber
     *
     * Use for a person who prepares a handwritten or typewritten copy from original
     * material, including from dictated or orally recorded material.
     */
    const TRANSCRIBER = "trc";

    /**
     * Translator
     *
     * Use for a person who renders a text from one language into another, or from
     * an older form of a language into the modern form.
     */
    const TRANSLATOR = "trl";
}
1226 | ?> | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php new file mode 100644 index 00000000..f1f41bd5 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPub.php | |||
@@ -0,0 +1,2432 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Create an ePub compatible book file. | ||
4 | * | ||
5 | * Please note, once finalized a book can no longer have chapters of data added or changed. | ||
6 | * | ||
7 | * License: GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * | ||
9 | * Thanks to: Adam Schmalhofer and Kirstyn Fox for invaluable input and for "nudging" me in the right direction :) | ||
10 | * | ||
11 | * @author A. Grandt <php@grandt.com> | ||
12 | * @copyright 2009-2014 A. Grandt | ||
13 | * @license GNU LGPL 2.1 | ||
14 | * @version 3.20 | ||
15 | * @link http://www.phpclasses.org/package/6115 | ||
16 | * @link https://github.com/Grandt/PHPePub | ||
17 | * @uses Zip.php version 1.60 or higher (see REQ_ZIP_VERSION); http://www.phpclasses.org/browse/package/6110.html or https://github.com/Grandt/PHPZip | ||
18 | */ | ||
19 | class EPub { | ||
20 | const VERSION = 3.20; | ||
21 | const REQ_ZIP_VERSION = 1.60; | ||
22 | |||
23 | const IDENTIFIER_UUID = 'UUID'; | ||
24 | const IDENTIFIER_URI = 'URI'; | ||
25 | const IDENTIFIER_ISBN = 'ISBN'; | ||
26 | |||
27 | /** Ignore all external references, and do not process the file for these */ | ||
28 | const EXTERNAL_REF_IGNORE = 0; | ||
29 | /** Process the file for external references and add them to the book */ | ||
30 | const EXTERNAL_REF_ADD = 1; | ||
31 | /** Process the file for external references and add them to the book, but remove images, and img tags */ | ||
32 | const EXTERNAL_REF_REMOVE_IMAGES = 2; | ||
33 | /** Process the file for external references and add them to the book, but replace images, and img tags with [image] */ | ||
34 | const EXTERNAL_REF_REPLACE_IMAGES = 3; | ||
35 | |||
36 | const DIRECTION_LEFT_TO_RIGHT = "ltr"; | ||
37 | const DIRECTION_RIGHT_TO_LEFT = "rtl"; | ||
38 | |||
39 | const BOOK_VERSION_EPUB2 = "2.0"; | ||
40 | const BOOK_VERSION_EPUB3 = "3.0"; | ||
41 | |||
42 | private $bookVersion = EPub::BOOK_VERSION_EPUB2; | ||
43 | |||
44 | public $maxImageWidth = 768; | ||
45 | public $maxImageHeight = 1024; | ||
46 | |||
47 | public $splitDefaultSize = 250000; | ||
48 | /** Gifs can crash some early ADE based readers, and are disabled by default. | ||
49 | * getImage will convert these if it can, unless this is set to TRUE. | ||
50 | */ | ||
51 | public $isGifImagesEnabled = FALSE; | ||
52 | public $isReferencesAddedToToc = TRUE; | ||
53 | |||
54 | private $zip; | ||
55 | |||
56 | private $title = ""; | ||
57 | private $language = "en"; | ||
58 | private $identifier = ""; | ||
59 | private $identifierType = ""; | ||
60 | private $description = ""; | ||
61 | private $author = ""; | ||
62 | private $authorSortKey = ""; | ||
63 | private $publisherName = ""; | ||
64 | private $publisherURL = ""; | ||
65 | private $date = 0; | ||
66 | private $rights = ""; | ||
67 | private $coverage = ""; | ||
68 | private $relation = ""; | ||
69 | private $sourceURL = ""; | ||
70 | |||
71 | private $chapterCount = 0; | ||
72 | private $opf = NULL; | ||
73 | private $ncx = NULL; | ||
74 | private $isFinalized = FALSE; | ||
75 | private $isCoverImageSet = FALSE; | ||
76 | private $buildTOC = FALSE; | ||
77 | private $tocTitle = NULL; | ||
78 | private $tocFileName = NULL; | ||
79 | private $tocCSSClass = NULL; | ||
80 | private $tocAddReferences = FALSE; | ||
81 | private $tocCssFileName = NULL; | ||
82 | |||
83 | private $fileList = array(); | ||
84 | private $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT; | ||
85 | private $languageCode = "en"; | ||
86 | |||
87 | /** | ||
88 | * Used for building the TOC. | ||
89 | * If this list is overwritten it MUST contain at least "text" as an element. | ||
90 | */ | ||
91 | public $referencesOrder = NULL; | ||
92 | |||
93 | private $dateformat = 'Y-m-d\TH:i:s.000000P'; // ISO 8601 long | ||
94 | private $dateformatShort = 'Y-m-d'; // short date format to placate ePubChecker. | ||
95 | private $headerDateFormat = "D, d M Y H:i:s T"; | ||
96 | |||
97 | protected $isCurlInstalled; | ||
98 | protected $isGdInstalled; | ||
99 | protected $isExifInstalled; | ||
100 | protected $isFileGetContentsInstalled; | ||
101 | protected $isFileGetContentsExtInstalled; | ||
102 | |||
103 | private $bookRoot = "OEBPS/"; | ||
104 | private $docRoot = NULL; | ||
105 | private $EPubMark = TRUE; | ||
106 | private $generator = ""; | ||
107 | |||
108 | private $log = NULL; | ||
109 | public $isLogging = TRUE; | ||
110 | |||
111 | public $encodeHTML = FALSE; | ||
112 | |||
// Media types keyed by lower-case file extension, grouped by top-level type.
// "tif" and "tsv" use the registered spellings "image/tiff" and
// "text/tab-separated-values".
private $mimetypes = array(
    // application/*
    "js" => "application/x-javascript", "swf" => "application/x-shockwave-flash", "xht" => "application/xhtml+xml",
    "xhtml" => "application/xhtml+xml", "zip" => "application/zip",
    // audio/*
    "aif" => "audio/x-aiff", "aifc" => "audio/x-aiff", "aiff" => "audio/x-aiff", "au" => "audio/basic",
    "kar" => "audio/midi", "m3u" => "audio/x-mpegurl", "mid" => "audio/midi", "midi" => "audio/midi",
    "mp2" => "audio/mpeg", "mp3" => "audio/mpeg", "mpga" => "audio/mpeg", "oga" => "audio/ogg",
    "ogg" => "audio/ogg", "ra" => "audio/x-realaudio", "ram" => "audio/x-pn-realaudio", "rm" => "audio/x-pn-realaudio",
    "rpm" => "audio/x-pn-realaudio-plugin", "snd" => "audio/basic", "wav" => "audio/x-wav",
    // image/*
    "bmp" => "image/bmp", "djv" => "image/vnd.djvu", "djvu" => "image/vnd.djvu", "gif" => "image/gif",
    "ief" => "image/ief", "jpe" => "image/jpeg", "jpeg" => "image/jpeg", "jpg" => "image/jpeg",
    "pbm" => "image/x-portable-bitmap", "pgm" => "image/x-portable-graymap", "png" => "image/png",
    "pnm" => "image/x-portable-anymap", "ppm" => "image/x-portable-pixmap", "ras" => "image/x-cmu-raster",
    "rgb" => "image/x-rgb", "tif" => "image/tiff", "tiff" => "image/tiff", "wbmp" => "image/vnd.wap.wbmp",
    "xbm" => "image/x-xbitmap", "xpm" => "image/x-xpixmap", "xwd" => "image/x-windowdump",
    // text/*
    "asc" => "text/plain", "css" => "text/css", "etx" => "text/x-setext", "htm" => "text/html",
    "html" => "text/html", "rtf" => "text/rtf", "rtx" => "text/richtext", "sgm" => "text/sgml",
    "sgml" => "text/sgml", "tsv" => "text/tab-separated-values", "txt" => "text/plain", "wml" => "text/vnd.wap.wml",
    "wmls" => "text/vnd.wap.wmlscript", "xml" => "text/xml", "xsl" => "text/xml",
    // video/*
    "avi" => "video/x-msvideo", "mov" => "video/quicktime", "movie" => "video/x-sgi-movie", "mp4" => "video/mp4",
    "mpe" => "video/mpeg", "mpeg" => "video/mpeg", "mpg" => "video/mpeg", "mxu" => "video/vnd.mpegurl",
    "ogv" => "video/ogg", "qt" => "video/quicktime", "webm" => "video/webm");
119 | |||
120 | // These are the ONLY allowed types in that these are the ones ANY reader must support, any other MUST have the fallback attribute pointing to one of these. | ||
121 | private $coreMediaTypes = array("image/gif", "image/jpeg", "image/png", "image/svg+xml", "application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/css", "text/x-oeb1-css", "text/x-oeb1-document"); | ||
122 | |||
123 | private $opsContentTypes = array("application/xhtml+xml", "application/x-dtbook+xml", "application/xml", "application/x-dtbncx+xml", "text/x-oeb1-document"); | ||
124 | |||
125 | private $forbiddenCharacters = array("?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "%"); | ||
126 | |||
127 | private $htmlContentHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n<title></title>\n</head>\n<body>\n"; | ||
128 | private $htmlContentFooter = "</body>\n</html>\n"; | ||
129 | |||
130 | /** | ||
131 | * Class constructor. | ||
132 | * | ||
133 | * @return void | ||
134 | */ | ||
function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) {
    include_once("Zip.php");
    include_once("Logger.php");

    // Check the Zip dependency before anything reads Zip::VERSION. The log
    // block below dereferences that constant, so running the guard after it
    // would turn a Zip.php without a VERSION constant into a fatal error
    // instead of the explanatory message.
    if (!defined("Zip::VERSION") || Zip::VERSION < self::REQ_ZIP_VERSION) {
        die("<p>EPub version " . self::VERSION . " requires Zip.php at version " . self::REQ_ZIP_VERSION . " or higher.<br />You can obtain the latest version from <a href=\"http://www.phpclasses.org/browse/package/6110.html\">http://www.phpclasses.org/browse/package/6110.html</a>.</p>");
    }

    $this->bookVersion = $bookVersion;
    $this->writingDirection = $writingDirection;
    $this->languageCode = $languageCode;

    $this->log = new Logger("EPub", $this->isLogging);

    /* Prepare logging, just in case it is used later. */
    if ($this->isLogging) {
        $this->log->logLine("EPub class version....: " . self::VERSION);
        $this->log->logLine("EPub req. Zip version.: " . self::REQ_ZIP_VERSION);
        $this->log->logLine("Zip version...........: " . Zip::VERSION);
        $this->log->dumpInstalledModules();
    }

    // The remaining class files are only loaded once the Zip check passed.
    include_once("EPubChapterSplitter.php");
    include_once("EPub.HtmlEntities.php");
    include_once("EPub.NCX.php");
    include_once("EPub.OPF.php");

    $this->initialize();
}
164 | |||
165 | /** | ||
166 | * Class destructor | ||
167 | * | ||
168 | * @return void | ||
169 | * @TODO make sure elements in the destructor match the current class elements | ||
170 | */ | ||
function __destruct() {
    // Same properties, same order as before; only regrouped by topic so the
    // list is easier to compare against the property declarations.

    // Book format and image / splitting configuration.
    unset($this->bookVersion, $this->maxImageWidth, $this->maxImageHeight, $this->splitDefaultSize);
    unset($this->isGifImagesEnabled, $this->isReferencesAddedToToc);

    // Archive writer and book metadata.
    unset($this->zip);
    unset($this->title, $this->language, $this->identifier, $this->identifierType, $this->description);
    unset($this->author, $this->authorSortKey, $this->publisherName, $this->publisherURL);
    unset($this->date, $this->rights, $this->coverage, $this->relation, $this->sourceURL);

    // Build state.
    unset($this->chapterCount, $this->opf, $this->ncx, $this->isFinalized, $this->isCoverImageSet);
    unset($this->fileList, $this->writingDirection, $this->languageCode, $this->referencesOrder);

    // Date formats.
    unset($this->dateformat, $this->dateformatShort, $this->headerDateFormat);

    // Detected PHP capabilities.
    unset($this->isCurlInstalled, $this->isGdInstalled, $this->isExifInstalled);
    unset($this->isFileGetContentsInstalled, $this->isFileGetContentsExtInstalled);

    // Paths, branding and logging.
    unset($this->bookRoot, $this->docRoot, $this->EPubMark, $this->generator);
    unset($this->log, $this->isLogging, $this->encodeHTML);

    // Lookup tables and content templates.
    unset($this->mimetypes, $this->coreMediaTypes, $this->opsContentTypes, $this->forbiddenCharacters);
    unset($this->htmlContentHeader, $this->htmlContentFooter);

    // Table of contents settings.
    unset($this->buildTOC, $this->tocTitle, $this->tocCSSClass, $this->tocAddReferences);
    unset($this->tocFileName, $this->tocCssFileName);
}
188 | |||
189 | /** | ||
190 | * initialize defaults. | ||
191 | */ | ||
private function initialize() {
    // Default order and display titles of the guide references used when
    // building the TOC.
    $this->referencesOrder = array(
        Reference::COVER => "Cover Page",
        Reference::TITLE_PAGE => "Title Page",
        Reference::ACKNOWLEDGEMENTS => "Acknowledgements",
        Reference::BIBLIOGRAPHY => "Bibliography",
        Reference::COLOPHON => "Colophon",
        Reference::COPYRIGHT_PAGE => "Copyright",
        Reference::DEDICATION => "Dedication",
        Reference::EPIGRAPH => "Epigraph",
        Reference::FOREWORD => "Foreword",
        Reference::TABLE_OF_CONTENTS => "Table of Contents",
        Reference::NOTES => "Notes",
        Reference::PREFACE => "Preface",
        Reference::TEXT => "First Page",
        Reference::LIST_OF_ILLUSTRATIONS => "List of Illustrations",
        Reference::LIST_OF_TABLES => "List of Tables",
        Reference::GLOSSARY => "Glossary",
        Reference::INDEX => "Index");

    $this->docRoot = filter_input(INPUT_SERVER, "DOCUMENT_ROOT") . "/";

    // Record which optional PHP capabilities are available.
    $this->isCurlInstalled = extension_loaded('curl') && function_exists('curl_version');
    $this->isGdInstalled = extension_loaded('gd') && function_exists('gd_info');
    $this->isExifInstalled = extension_loaded('exif') && function_exists('exif_imagetype');
    $this->isFileGetContentsInstalled = function_exists('file_get_contents');
    $this->isFileGetContentsExtInstalled = $this->isFileGetContentsInstalled && ini_get('allow_url_fopen');

    // The "mimetype" entry is added first, with the zip extra field turned
    // off just for that one entry.
    $this->zip = new Zip();
    $this->zip->setExtraField(FALSE);
    $this->zip->addFile("application/epub+zip", "mimetype");
    $this->zip->setExtraField(TRUE);
    $this->zip->addDirectory("META-INF");

    // Held in a local variable: the class declares no "content" property,
    // so staging the XML on $this would create a dynamic property.
    // NOTE(review): assumes no method outside this one reads $this->content - confirm.
    $containerXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\n\t<rootfiles>\n\t\t<rootfile full-path=\"" . $this->bookRoot . "book.opf\" media-type=\"application/oebps-package+xml\" />\n\t</rootfiles>\n</container>\n";

    // Books that are not ePub 2 get a different content header (no DOCTYPE,
    // with the epub namespace declared).
    if (!$this->isEPubVersion2()) {
        $this->htmlContentHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
            . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
            . "<head>"
            . "<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n"
            . "<title></title>\n"
            . "</head>\n"
            . "<body>\n";
    }

    $this->zip->addFile($containerXml, "META-INF/container.xml", 0, NULL, FALSE);

    // Create the NCX and OPF documents and register the NCX in the manifest.
    $this->ncx = new Ncx(NULL, NULL, NULL, $this->languageCode, $this->writingDirection);
    $this->opf = new Opf();
    $this->ncx->setVersion($this->bookVersion);
    $this->opf->setVersion($this->bookVersion);
    $this->opf->addItem("ncx", "book.ncx", Ncx::MIMETYPE);
    $this->chapterCount = 0;
}
247 | |||
/**
 * Add dynamically generated data as a file to the book.
 *
 * The file name is normalized before it is stored. The duplicate check is made
 * against both the name as given and its normalized form, because the internal
 * file list is keyed by the normalized name.
 *
 * @param string $fileName Filename to use for the file, must be unique for the book.
 * @param string $fileId   Unique identifier for the file.
 * @param string $fileData File data
 * @param string $mimetype file mime type
 * @return bool $success FALSE if the book is finalized or the file name is already in use.
 */
function addFile($fileName, $fileId, $fileData, $mimetype) {
    if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    $fileName = $this->normalizeFileName($fileName);
    if (array_key_exists($fileName, $this->fileList)) {
        // A different spelling of a name that is already registered.
        return FALSE;
    }

    // Mime types starting with "image/" are added with the compress flag off.
    $compress = (strpos($mimetype, "image/") !== 0);

    $this->zip->addFile($fileData, $this->bookRoot . $fileName, 0, NULL, $compress);
    $this->fileList[$fileName] = $fileName;
    $this->opf->addItem($fileId, $fileName, $mimetype);
    return TRUE;
}
271 | |||
/**
 * Add a large file directly from the filesystem to the book.
 *
 * The file name is normalized before it is stored. The duplicate check is made
 * against both the name as given and its normalized form, because the internal
 * file list is keyed by the normalized name.
 *
 * @param string $fileName Filename to use for the file, must be unique for the book.
 * @param string $fileId   Unique identifier for the file.
 * @param string $filePath File path
 * @param string $mimetype file mime type
 * @return bool $success FALSE if the book is finalized, the name is already in use,
 *                       or the Zip archive did not accept the file.
 */
function addLargeFile($fileName, $fileId, $filePath, $mimetype) {
    if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }

    $fileName = $this->normalizeFileName($fileName);
    if (array_key_exists($fileName, $this->fileList)) {
        // A different spelling of a name that is already registered.
        return FALSE;
    }

    if (!$this->zip->addLargeFile($filePath, $this->bookRoot . $fileName)) {
        return FALSE;
    }

    // Only register the file once the archive has actually taken it.
    $this->fileList[$fileName] = $fileName;
    $this->opf->addItem($fileId, $fileName, $mimetype);
    return TRUE;
}
294 | |||
/**
 * Add a CSS file to the book.
 *
 * The file is registered with the id "css_" followed by $fileId.
 *
 * @param string $fileName Filename to use for the CSS file, must be unique for the book.
 * @param string $fileId Unique identifier for the file.
 * @param string $fileData CSS data
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for <code>processCSSExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 *
 * @return bool $success FALSE if the book is finalized, the name is already in use, or the file could not be added.
 */
function addCSSFile($fileName, $fileId, $fileData, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized || array_key_exists($fileName, $this->fileList)) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    // Strip any leading slashes and dots so the path is relative to the book root.
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);

    if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
        $pathParts = pathinfo($fileName);
        $cssDir = preg_replace('#^[/\.]+#i', "", $pathParts["dirname"] . "/");
        if (!empty($cssDir)) {
            // Turn every directory segment into "../".
            $cssDir = preg_replace('#[^/]+/#i', "../", $cssDir);
        }

        $this->processCSSExternalReferences($fileData, $externalReferences, $baseDir, $cssDir);
    }

    // Report what addFile() actually did instead of an unconditional TRUE.
    return $this->addFile($fileName, "css_" . $fileId, $fileData, "text/css");
}
327 | |||
/**
 * Add a chapter to the book, as a chapter should not exceed 250kB, you can pass an array with multiple parts as $chapterData.
 * These will still only show up as a single chapter in the book TOC.
 *
 * Four cases are handled:
 *  - a non-empty string is added as one chapter file;
 *  - an array is added as one file per element, named "<name>_<n>.<extension>", with a single NavPoint on part 1;
 *  - no data and a file name containing "#" only adds a NavPoint;
 *  - no data and the file name "TOC.xhtml" adds the "toc" item reference and a NavPoint.
 *
 * @param string $chapterName Name of the chapter, will be used in the TOC
 * @param string $fileName Filename to use for the chapter, must be unique for the book.
 * @param mixed  $chapterData Chapter text in XHTML, or an array of valid XHTML parts for the chapter. A file should NOT exceed 250kB.
 * @param bool   $autoSplit Should the chapter be split if it exceeds the default split size? Default=FALSE, only used if $chapterData is a string.
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? See documentation for <code>processChapterExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 * @return mixed $success FALSE if the addition failed, else the new NavPoint.
 */
function addChapter($chapterName, $fileName, $chapterData = NULL, $autoSplit = FALSE, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);
    $fileName = $this->sanitizeFileName($fileName);

    // Stays FALSE when none of the cases below applies, as the contract promises.
    $navPoint = FALSE;

    $chapter = $chapterData;
    if ($autoSplit && is_string($chapterData) && mb_strlen($chapterData) > $this->splitDefaultSize) {
        $splitter = new EPubChapterSplitter();

        $chapterArray = $splitter->splitChapter($chapterData);
        if (count($chapterArray) > 1) {
            $chapter = $chapterArray;
        }
    }

    if (!empty($chapter) && is_string($chapter)) {
        if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
            $htmlDirInfo = pathinfo($fileName);
            $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/");
            $this->processChapterExternalReferences($chapter, $externalReferences, $baseDir, $htmlDir);
        }

        if ($this->encodeHTML === TRUE) {
            $chapter = $this->encodeHtml($chapter);
        }

        $this->chapterCount++;
        $this->addFile($fileName, "chapter" . $this->chapterCount, $chapter, "application/xhtml+xml");
        $this->opf->addItemRef("chapter" . $this->chapterCount);

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (is_array($chapter)) {
        $fileNameParts = pathinfo($fileName);
        // pathinfo() omits 'extension' for names without a dot.
        $extension = isset($fileNameParts['extension']) ? $fileNameParts['extension'] : "";
        $name = $fileNameParts['filename'];

        $partCount = 0;
        $this->chapterCount++;

        // foreach replaces each(), which is deprecated since PHP 7.2 and removed in PHP 8.
        foreach ($chapter as $v) {
            if ($this->encodeHTML === TRUE) {
                $v = $this->encodeHtml($v);
            }

            if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
                // NOTE(review): unlike the single-string case, no $htmlDir is passed here - confirm this is intended.
                $this->processChapterExternalReferences($v, $externalReferences, $baseDir);
            }
            $partCount++;
            $partName = $name . "_" . $partCount;
            $this->addFile($partName . "." . $extension, $partName, $v, "application/xhtml+xml");
            $this->opf->addItemRef($partName);
        }

        // The TOC entry always points at the first part.
        $partName = $name . "_1." . $extension;
        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $partName, $partName);
        $this->ncx->addNavPoint($navPoint);

        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (!isset($chapterData) && strpos($fileName, "#") > 0) {
        // No file and no item reference are added in this case; only the navigation entry.
        $this->chapterCount++;

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    } else if (!isset($chapterData) && $fileName == "TOC.xhtml") {
        $this->chapterCount++;
        $this->opf->addItemRef("toc");

        $navPoint = new NavPoint($this->decodeHtmlEntities($chapterName), $fileName, "chapter" . $this->chapterCount);
        $this->ncx->addNavPoint($navPoint);
        $this->ncx->chapterList[$chapterName] = $navPoint;
    }
    return $navPoint;
}
423 | |||
/**
 * Open a new chapter level in the navigation structure.
 *
 * Chapters added after this call are placed on the new level.
 * The title is passed through decodeHtmlEntities() before it is handed to the NCX.
 *
 * @param string $navTitle
 * @param string $navId
 * @param string $navClass
 * @param bool   $isNavHidden
 * @param string $writingDirection
 * @return NavPoint The new NavPoint for that level.
 */
function subLevel($navTitle = NULL, $navId = NULL, $navClass = NULL, $isNavHidden = FALSE, $writingDirection = NULL) {
    $decodedTitle = $this->decodeHtmlEntities($navTitle);

    return $this->ncx->subLevel($decodedTitle, $navId, $navClass, $isNavHidden, $writingDirection);
}
439 | |||
/**
 * Go up one chapter level.
 *
 * Chapters added after this call are placed on the parent level of the current chapter.
 * The call is forwarded to the NCX.
 */
function backLevel() {
    $this->ncx->backLevel();
}
448 | |||
/**
 * Return to the root level.
 *
 * Chapters added after this call are placed in the root NavMap.
 * The call is forwarded to the NCX.
 */
function rootLevel() {
    $this->ncx->rootLevel();
}
457 | |||
/**
 * Jump back to a given level.
 *
 * Handy for returning to an earlier level from deep inside the structure.
 * Values below 2 have the same effect as rootLevel().
 *
 * @param int $newLevel Level to return to.
 */
function setCurrentLevel($newLevel) {
    $this->ncx->setCurrentLevel($newLevel);
}
468 | |||
/**
 * Report the current level count, i.e. the indentation of the current structure point.
 *
 * @return int Current level count, as reported by the NCX.
 */
function getCurrentLevel() {
    $level = $this->ncx->getCurrentLevel();

    return $level;
}
478 | |||
/**
 * Surround chapter content with the configured HTML header and footer.
 *
 * The three pieces are joined with a single newline between them.
 *
 * @param string $content Chapter content.
 * @return string Header, content and footer, newline separated.
 */
private function wrapChapter($content) {
    $pieces = array($this->htmlContentHeader, $content, $this->htmlContentFooter);

    return implode("\n", $pieces);
}
488 | |||
/**
 * Reference pages are usually one or two pages for items such as Table of Contents, reference lists, Author notes or Acknowledgements.
 * These do not show up in the regular navigation list, as they are supposed to be short.
 *
 * The page data is stored as given (after optional HTML encoding and external
 * reference processing); it is NOT wrapped with the chapter header and footer.
 * The file is registered with the id "ref_" followed by $reference.
 *
 * @param string $pageName Name of the page, will be used in the OPF reference list
 * @param string $fileName Filename to use for the page, must be unique for the book.
 * @param string $pageData Page content in XHTML. File should NOT exceed 250kB.
 * @param string $reference Reference key
 * @param int    $externalReferences How to handle external references. See documentation for <code>processChapterExternalReferences</code> for explanation. Default is EPub::EXTERNAL_REF_IGNORE.
 * @param string $baseDir Default is "", meaning it is pointing to the document root. NOT used if $externalReferences is set to EPub::EXTERNAL_REF_IGNORE.
 * @return bool FALSE if the book is finalized, TRUE otherwise (also when $pageData is empty or not a string and nothing was added).
 */
function addReferencePage($pageName, $fileName, $pageData, $reference, $externalReferences = EPub::EXTERNAL_REF_IGNORE, $baseDir = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $fileName = Zip::getRelativePath($fileName);
    $fileName = preg_replace('#^[/\.]+#i', "", $fileName);

    if (!empty($pageData) && is_string($pageData)) {
        if ($this->encodeHTML === TRUE) {
            $pageData = $this->encodeHtml($pageData);
        }

        // The original code called wrapChapter($pageData) here and discarded the result,
        // so the page was never wrapped. The no-op call is dropped rather than assigned:
        // setCoverImage() passes a complete XHTML document, which must not be wrapped again.

        if ($externalReferences !== EPub::EXTERNAL_REF_IGNORE) {
            $htmlDirInfo = pathinfo($fileName);
            $htmlDir = preg_replace('#^[/\.]+#i', "", $htmlDirInfo["dirname"] . "/");
            $this->processChapterExternalReferences($pageData, $externalReferences, $baseDir, $htmlDir);
        }

        $this->addFile($fileName, "ref_" . $reference, $pageData, "application/xhtml+xml");

        // A table of contents reference is only registered once; every other reference always is.
        if ($reference !== Reference::TABLE_OF_CONTENTS || !isset($this->ncx->referencesList[$reference])) {
            $this->opf->addItemRef("ref_" . $reference, FALSE);
            $this->opf->addReference($reference, $pageName, $fileName);

            $this->ncx->referencesList[$reference] = $fileName;
            $this->ncx->referencesName[$reference] = $pageName;
        }
        return TRUE;
    }
    return TRUE;
}
537 | |||
/**
 * Attach a custom metadata entry to the book.
 *
 * Metadata are plain key/value pairs handed to the OPF; avoiding key collisions
 * is the responsibility of the caller.
 *
 * @param string $name    Metadata key.
 * @param string $content Metadata value.
 */
function addCustomMetadata($name, $content) {
    $this->opf->addMeta($name, $content);
}
549 | |||
/**
 * Attach a DublinCore metadata entry to the book.
 *
 * Use the DublinCore constants included in EPub, ie DublinCore::DATE.
 * The value is passed through decodeHtmlEntities() first.
 * Nothing happens once the book is finalized.
 *
 * @param string $dublinCoreConstant DublinCore name
 * @param string $value
 */
function addDublinCoreMetadata($dublinCoreConstant, $value) {
    if (!$this->isFinalized) {
        $decodedValue = $this->decodeHtmlEntities($value);
        $this->opf->addDCMeta($dublinCoreConstant, $decodedValue);
    }
}
565 | |||
/**
 * Add a cover image to the book.
 * If the $imageData is not set, the function assumes the $fileName is the path to the image file.
 *
 * The styling and structure of the generated XHTML is heavily inspired by the XHTML generated by Calibre.
 *
 * Three files are added: "Styles/CoverPage.css", the image under "images/", and the
 * reference page "CoverPage.xhtml".
 *
 * @param string $fileName Filename to use for the image, must be unique for the book.
 * @param string $imageData Binary image data
 * @param string $mimetype Image mimetype, such as "image/jpeg" or "image/png".
 * @param string $bookTitle Markup placed on the cover page next to the image. It is inserted as-is (not escaped). Default is "".
 * @return bool $success FALSE if the book is finalized or a cover has already been set.
 */
function setCoverImage($fileName, $imageData = NULL, $mimetype = NULL, $bookTitle = "") {
    // The page is added as "CoverPage.xhtml" below; the historic ".html" name is still checked too.
    if ($this->isFinalized || $this->isCoverImageSet
            || array_key_exists("CoverPage.xhtml", $this->fileList)
            || array_key_exists("CoverPage.html", $this->fileList)) {
        return FALSE;
    }

    if ($imageData == NULL) {
        // assume $fileName is the valid file path.
        if (!file_exists($fileName)) {
            // Attempt to locate the file using the doc root.
            $rp = realpath($this->docRoot . "/" . $fileName);

            if ($rp !== FALSE) {
                // only assign the docroot path if it actually exists there.
                $fileName = $rp;
            }
        }
        $image = $this->getImage($fileName);
        $imageData = $image['image'];
        $mimetype = $image['mime'];
        // Swap the file extension for the one reported by getImage().
        $fileName = preg_replace("#\.[^\.]+$#", "." . $image['ext'], $fileName);
    }

    $path = pathinfo($fileName);
    $imgPath = "images/" . $path["basename"];

    if (empty($mimetype) && file_exists($fileName)) {
        // getimagesize() returns FALSE for files it cannot read as an image.
        $imageInfo = getimagesize($fileName);
        if ($imageInfo !== FALSE) {
            $mimetype = image_type_to_mime_type($imageInfo[2]);
        }
    }
    if (empty($mimetype)) {
        // Last resort: derive the mime type from the file extension.
        $ext = isset($path['extension']) ? strtolower($path['extension']) : "";
        if ($ext == "jpg") {
            $ext = "jpeg";
        }
        $mimetype = "image/" . $ext;
    }

    $coverPage = "";

    if ($this->isEPubVersion2()) {
        $coverPage = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
                . "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n"
                . " \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"
                . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" xml:lang=\"en\">\n"
                . "\t<head>\n"
                . "\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>\n"
                . "\t\t<title>Cover Image</title>\n"
                . "\t\t<link type=\"text/css\" rel=\"stylesheet\" href=\"Styles/CoverPage.css\" />\n"
                . "\t</head>\n"
                . "\t<body>\n"
                . "\t" . $bookTitle . "\n"
                . "\t\t<div>\n"
                . "\t\t\t<img src=\"" . $imgPath . "\" alt=\"Cover image\" style=\"height: 100%\"/>\n"
                . "\t\t</div>\n"
                . "\t</body>\n"
                . "</html>\n";
    } else {
        $coverPage = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
                . "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
                . "<head>"
                . "\t<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n"
                . "\t\t<title>Cover Image</title>\n"
                . "\t\t<link type=\"text/css\" rel=\"stylesheet\" href=\"Styles/CoverPage.css\" />\n"
                . "\t</head>\n"
                . "\t<body>\n"
                . "\t\t<section epub:type=\"cover\">\n"
                . "\t" . $bookTitle . "\n"
                . "\t\t\t<img src=\"" . $imgPath . "\" alt=\"Cover image\" style=\"height: 30%\"/>\n"
                . "\t\t</section>\n"
                . "\t</body>\n"
                . "</html>\n";
    }
    $coverPageCss = "@page, body, div, img {\n"
            . "\tpadding: 0pt;\n"
            . "\tmargin:0pt;\n"
            . "}\n\nbody {\n"
            . "\ttext-align: center;\n"
            . "}\n";

    $this->addCSSFile("Styles/CoverPage.css", "CoverPageCss", $coverPageCss);
    $this->addFile($imgPath, "CoverImage", $imageData, $mimetype);
    $this->addReferencePage("CoverPage", "CoverPage.xhtml", $coverPage, "cover");
    $this->isCoverImageSet = TRUE;
    return TRUE;
}
663 | |||
/**
 * Process external references from a HTML to the book. The chapter itself is not stored.
 * The HTML is scanned for <link..., <style..., and <img tags.
 * Embedded CSS styles and links will also be processed.
 * Script tags are not processed, as scripting should be avoided in e-books.
 *
 * EPub keeps track of added files, and duplicate files referenced across multiple
 * chapters, are only added once.
 *
 * If the $doc is a string, it is assumed to be the content of an HTML file,
 * else it is assumed to be a DOMDocument. A string $doc is replaced, by reference,
 * with a re-serialized XHTML document built from the parsed head and body.
 *
 * Basedir is the root dir the HTML is supposed to "live" in, used to resolve
 * relative references such as <code><img src="../images/image.png"/></code>
 *
 * $externalReferences determines how the function will handle external references.
 *
 * @param mixed  &$doc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized, $externalReferences == EXTERNAL_REF_IGNORE, or $doc is an empty string).
 */
protected function processChapterExternalReferences(&$doc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    // One "../" per directory segment of $htmlDir.
    $backPath = preg_replace('#[^/]+/#i', "../", $htmlDir);
    $isDocAString = is_string($doc);
    $xmlDoc = NULL;

    if ($isDocAString) {
        if (trim($doc) === "") {
            // Nothing to parse; loadHTML() rejects empty input.
            return FALSE;
        }
        $xmlDoc = new DOMDocument();
        @$xmlDoc->loadHTML($doc);
    } else {
        $xmlDoc = $doc;
    }

    $this->processChapterStyles($xmlDoc, $externalReferences, $baseDir, $htmlDir);
    $this->processChapterLinks($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);
    $this->processChapterImages($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);
    $this->processChapterSources($xmlDoc, $externalReferences, $baseDir, $htmlDir, $backPath);

    if ($isDocAString) {
        // Any of these may be NULL: the parser does not always produce a head element.
        $htmlNode = $xmlDoc->getElementsByTagName("html")->item(0);
        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);

        // Carry over every attribute of the original html tag except the default namespace,
        // which is written explicitly below.
        $htmlNS = "";
        if ($htmlNode !== NULL) {
            foreach ($htmlNode->attributes as $attribute) {
                if ($attribute->nodeName != "xmlns") {
                    // nodeValue is decoded text, so it has to be escaped again before it goes back into markup.
                    $htmlNS .= " " . $attribute->nodeName . "=\"" . htmlspecialchars($attribute->nodeValue, ENT_QUOTES, "UTF-8") . "\"";
                }
            }
        }

        $xml = new DOMDocument('1.0', "utf-8");
        $xml->preserveWhiteSpace = FALSE;
        $xml->formatOutput = TRUE;

        $xml2Doc = new DOMDocument('1.0', "utf-8");
        $xml2Doc->loadXML("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\"$htmlNS>\n</html>\n");
        $html = $xml2Doc->getElementsByTagName("html")->item(0);
        if ($headNode !== NULL) {
            $html->appendChild($xml2Doc->importNode($headNode, TRUE));
        }
        if ($bodyNode !== NULL) {
            $html->appendChild($xml2Doc->importNode($bodyNode, TRUE));
        }

        // force pretty printing and correct formatting, should not be needed, but it is.
        $xml->loadXML($xml2Doc->saveXML());
        $doc = $xml->saveXML();

        if (!$this->isEPubVersion2()) {
            // Strip the DOCTYPE declaration when the book is not EPub version 2.
            $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
        }
    }
    return TRUE;
}
748 | |||
/**
 * Process images referenced from a CSS file to the book.
 *
 * Every url(...) occurrence in the CSS is examined. Depending on $externalReferences
 * it is either removed, or resolved through resolveImage() and rewritten to point
 * at the image inside the archive.
 *
 * @param string &$cssFile (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $cssDir The path of the CSS file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processCSSExternalReferences(&$cssFile, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $cssDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    $backPath = preg_replace('#[^/]+/#i', "../", $cssDir);
    $urlMatches = null;
    preg_match_all('#url\s*\([\'\"\s]*(.+?)[\'\"\s]*\)#im', $cssFile, $urlMatches, PREG_SET_ORDER);

    // Both the "remove" and the "replace" mode simply drop url() references from CSS.
    $stripImages = ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES
            || $externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES);

    foreach ($urlMatches as $urlMatch) {
        if ($stripImages) {
            $cssFile = str_replace($urlMatch[0], "", $cssFile);
            continue;
        }

        $source = $urlMatch[1];

        $pathData = pathinfo($source);
        $internalSrc = $pathData['basename'];
        $internalPath = "";
        $isSourceExternal = FALSE;

        $isResolved = $this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $cssDir, $backPath);
        if ($isResolved) {
            $cssFile = str_replace($urlMatch[0], "url('" . $backPath . $internalPath . "')", $cssFile);
        } else if ($isSourceExternal) {
            // External image is missing
            $cssFile = str_replace($urlMatch[0], "", $cssFile);
        }
        // Otherwise leave it alone: a local image that is missing is assumed to be generated.
    }
    return TRUE;
}
792 | |||
/**
 * Process style tags in a DOMDocument.
 *
 * The content of each style tag has its CDATA markers stripped, is run through
 * processCSSExternalReferences(), and is written back into the tag.
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterStyles(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    // Inlined CSS lives in style tags.
    foreach ($xmlDoc->getElementsByTagName("style") as $styleTag) {
        // Drop the opening and closing CDATA markers, together with any comment characters around them.
        $css = preg_replace('#[/\*\s]*\<\!\[CDATA\[[\s\*/]*#im', "", $styleTag->nodeValue);
        $css = preg_replace('#[/\*\s]*\]\]\>[\s\*/]*#im', "", $css);

        $this->processCSSExternalReferences($css, $externalReferences, $baseDir, $htmlDir);
        $styleTag->nodeValue = "\n" . trim($css) . "\n";
    }
    return TRUE;
}
821 | |||
/**
 * Process link tags in a DOMDocument. Linked files will be loaded into the archive, and the link href will be rewritten to point to that location.
 * Link types text/css will be passed as CSS files.
 *
 * Links without an href attribute are skipped. A link without a type attribute is treated as "text/plain".
 *
 * @param DOMDocument &$xmlDoc (referenced)
 * @param int    $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD or EPub::EXTERNAL_REF_REMOVE_IMAGES? Default is EPub::EXTERNAL_REF_ADD.
 * @param string $baseDir Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterLinks(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }
    // process link tags.
    $links = $xmlDoc->getElementsByTagName("link");
    $linkCount = $links->length;
    for ($linkIdx = 0; $linkIdx < $linkCount; $linkIdx++) {
        $link = $links->item($linkIdx);

        // getAttribute() yields "" for a missing attribute instead of a NULL node to dereference.
        $source = $link->getAttribute("href");
        if ($source === "") {
            continue;
        }
        $sourceData = NULL;

        $pathData = pathinfo($source);
        $internalSrc = $pathData['basename'];

        if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
            $urlinfo = parse_url($source);
            // parse_url() omits 'path' for URLs such as "http://example.com".
            $urlPath = (is_array($urlinfo) && isset($urlinfo['path'])) ? $urlinfo['path'] : "";

            // Keep the part of the URL path below $baseDir as the internal name, when it is present.
            $basePos = strpos($urlPath, $baseDir . "/");
            if ($basePos !== FALSE) {
                $internalSrc = substr($urlPath, $basePos + strlen($baseDir) + 1);
            }

            // NOTE(review): this calls a plain function getFileContents(), not a method on $this - confirm it is defined.
            @$sourceData = getFileContents($source);
        } else if (strpos($source, "/") === 0) {
            // Absolute path: relative to the document root.
            @$sourceData = file_get_contents($this->docRoot . $source);
        } else {
            @$sourceData = file_get_contents($this->docRoot . $baseDir . "/" . $source);
        }

        if (!empty($sourceData)) {
            if (!array_key_exists($internalSrc, $this->fileList)) {
                $mime = $link->getAttribute("type");
                if (empty($mime)) {
                    $mime = "text/plain";
                }
                if ($mime == "text/css") {
                    $this->processCSSExternalReferences($sourceData, $externalReferences, $baseDir, $htmlDir);
                    // References were handled by the call above, so addCSSFile() is told to ignore them.
                    $this->addCSSFile($internalSrc, $internalSrc, $sourceData, EPub::EXTERNAL_REF_IGNORE, $baseDir);
                    $link->setAttribute("href", $backPath . $internalSrc);
                } else {
                    $this->addFile($internalSrc, $internalSrc, $sourceData, $mime);
                }
                $this->fileList[$internalSrc] = $source;
            } else {
                // Already in the archive: only repoint the link.
                $link->setAttribute("href", $backPath . $internalSrc);
            }
        } // else do nothing, if the link is local, and missing, assume it's been generated.
    }
    return TRUE;
}
884 | |||
/**
 * Process img tags in a DOMDocument.
 * $externalReferences determines what happens to these images, and the img src is rewritten accordingly:
 *  - EPub::EXTERNAL_REF_REMOVE_IMAGES:  every img element is removed from the document.
 *  - EPub::EXTERNAL_REF_REPLACE_IMAGES: every img element is replaced by "<em>[alt text]</em>"
 *                                       ("image" is used when the alt attribute is missing or empty).
 *  - any other value (except IGNORE):   the image is resolved through resolveImage() and its src
 *                                       is rewritten to point at the copy inside the book.
 *
 * @param DOMDocument &$xmlDoc            (referenced)
 * @param int         $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD, EPub::EXTERNAL_REF_REMOVE_IMAGES or EPub::EXTERNAL_REF_REPLACE_IMAGES. Default is EPub::EXTERNAL_REF_ADD.
 * @param string      $baseDir            Default is "", meaning it is pointing to the document root.
 * @param string      $htmlDir            The path to the parent HTML file's directory from the root of the archive.
 * @param string      $backPath           The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE).
 */
protected function processChapterImages(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    // Nodes to drop or swap are only queued during the scan and handled
    // afterwards, so the scan itself never mutates the tree it is walking.
    $postProcDomElememts = array();
    $images = $xmlDoc->getElementsByTagName("img");
    $itemCount = $images->length;

    for ($idx = 0; $idx < $itemCount; $idx++) {
        $img = $images->item($idx);

        if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) {
            $postProcDomElememts[] = $img;
            continue;
        }

        if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) {
            $altNode = $img->attributes->getNamedItem("alt");
            $alt = "image";
            if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) {
                $alt = $altNode->nodeValue;
            }
            $postProcDomElememts[] = array($img, $this->createDomFragment($xmlDoc, "<em>[" . $alt . "]</em>"));
            continue;
        }

        // An <img> without a src attribute has nothing to resolve; the
        // original code dereferenced NULL here.
        $srcNode = $img->attributes->getNamedItem("src");
        if ($srcNode === NULL) {
            continue;
        }
        $source = $srcNode->nodeValue;

        // parse_url() may fail or omit 'path'; fall back to an empty path.
        $parsedSource = parse_url($source);
        $sourcePath = (is_array($parsedSource) && isset($parsedSource['path'])) ? $parsedSource['path'] : "";

        $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($sourcePath, PATHINFO_BASENAME)));
        $internalPath = "";
        $isSourceExternal = FALSE;

        if ($this->resolveImage($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) {
            $img->setAttribute("src", $backPath . $internalPath);
        } else if ($isSourceExternal) {
            $postProcDomElememts[] = $img; // External image is missing
        } // else do nothing, if the image is local, and missing, assume it's been generated.
    }

    foreach ($postProcDomElememts as $target) {
        if (is_array($target)) {
            // array(original node, replacement fragment)
            $target[0]->parentNode->replaceChild($target[1], $target[0]);
        } else {
            $target->parentNode->removeChild($target);
        }
    }
    return TRUE;
}
943 | |||
/**
 * Process source tags in a DOMDocument.
 * $externalReferences determines what happens to these media sources, and the src is rewritten accordingly.
 * For any book version other than EPub::BOOK_VERSION_EPUB3 the mode is forced to
 * EPub::EXTERNAL_REF_REMOVE_IMAGES, i.e. all source elements are removed.
 *
 * @param DOMDocument &$xmlDoc            (referenced)
 * @param int         $externalReferences How to handle external references, EPub::EXTERNAL_REF_IGNORE, EPub::EXTERNAL_REF_ADD, EPub::EXTERNAL_REF_REMOVE_IMAGES or EPub::EXTERNAL_REF_REPLACE_IMAGES. Default is EPub::EXTERNAL_REF_ADD.
 * @param string      $baseDir            Default is "", meaning it is pointing to the document root.
 * @param string      $htmlDir            The path to the parent HTML file's directory from the root of the archive.
 * @param string      $backPath           The path to get back to the root of the archive from $htmlDir.
 *
 * @return bool FALSE if unsuccessful (book is finalized or $externalReferences == EXTERNAL_REF_IGNORE), TRUE otherwise.
 */
protected function processChapterSources(&$xmlDoc, $externalReferences = EPub::EXTERNAL_REF_ADD, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized || $externalReferences === EPub::EXTERNAL_REF_IGNORE) {
        return FALSE;
    }

    if ($this->bookVersion !== EPub::BOOK_VERSION_EPUB3) {
        // ePub 2 does not support multimedia formats, and they must be removed.
        $externalReferences = EPub::EXTERNAL_REF_REMOVE_IMAGES;
    }

    // Nodes to drop or swap are queued here and handled after the scan.
    $postProcDomElememts = array();
    $sources = $xmlDoc->getElementsByTagName("source");
    $itemCount = $sources->length;

    for ($idx = 0; $idx < $itemCount; $idx++) {
        $sourceElement = $sources->item($idx);

        if ($externalReferences === EPub::EXTERNAL_REF_REMOVE_IMAGES) {
            $postProcDomElememts[] = $sourceElement;
            continue;
        }

        if ($externalReferences === EPub::EXTERNAL_REF_REPLACE_IMAGES) {
            $altNode = $sourceElement->attributes->getNamedItem("alt");
            $alt = "image";
            if ($altNode !== NULL && strlen($altNode->nodeValue) > 0) {
                $alt = $altNode->nodeValue;
            }
            $postProcDomElememts[] = array($sourceElement, $this->createDomFragment($xmlDoc, "[" . $alt . "]"));
            continue;
        }

        // A <source> without a src attribute has nothing to resolve; the
        // original code dereferenced NULL here.
        $srcNode = $sourceElement->attributes->getNamedItem("src");
        if ($srcNode === NULL) {
            continue;
        }
        $source = $srcNode->nodeValue;

        // parse_url() may fail or omit 'path'; fall back to an empty path.
        $parsedSource = parse_url($source);
        $sourcePath = (is_array($parsedSource) && isset($parsedSource['path'])) ? $parsedSource['path'] : "";

        $internalSrc = $this->sanitizeFileName(urldecode(pathinfo($sourcePath, PATHINFO_BASENAME)));
        $internalPath = "";
        $isSourceExternal = FALSE;

        if ($this->resolveMedia($source, $internalPath, $internalSrc, $isSourceExternal, $baseDir, $htmlDir, $backPath)) {
            $sourceElement->setAttribute("src", $backPath . $internalPath);
        } else if ($isSourceExternal) {
            $postProcDomElememts[] = $sourceElement; // External media is missing
        } // else do nothing, if the media is local, and missing, assume it's been generated.
    }

    // The original built this queue but never applied it, so nothing was
    // ever removed or replaced. Apply it the same way processChapterImages() does.
    foreach ($postProcDomElememts as $target) {
        if (is_array($target)) {
            // array(original node, replacement fragment)
            $target[0]->parentNode->replaceChild($target[1], $target[0]);
        } else {
            $target->parentNode->removeChild($target);
        }
    }
    return TRUE;
}
996 | |||
/**
 * Resolve an image src, determine its target location and add it to the book.
 *
 * Three kinds of $source are distinguished:
 *  - a full http(s)/ftp(s) URL: fetched through getImage() and flagged as external;
 *  - a path starting with "/": read as given, or relative to the doc root when that file does not exist;
 *  - anything else: read relative to $baseDir, or relative to doc root + $baseDir when that file does not exist.
 *
 * @param string $source            Image Source link.
 * @param string &$internalPath     (referenced) Return value, will be set to the target path and name in the book.
 * @param string &$internalSrc      (referenced) Return value, will be set to the target name in the book.
 * @param bool   &$isSourceExternal (referenced) Return value, will be set to TRUE if the image originated from a full URL.
 * @param string $baseDir           Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir           The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath          The path to get back to the root of the archive from $htmlDir. Not used by this method.
 *
 * @return bool FALSE if the book is finalized or getImage() returned FALSE, TRUE otherwise.
 */
protected function resolveImage($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $imageData = NULL;

    if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
        // parse_url() may fail or leave out components; default them to ""
        // instead of reading undefined indexes.
        $urlinfo = parse_url($source);
        if (!is_array($urlinfo)) {
            $urlinfo = array();
        }
        $urlScheme = isset($urlinfo["scheme"]) ? $urlinfo["scheme"] : "";
        $urlHost = isset($urlinfo["host"]) ? $urlinfo["host"] : "";
        $urlPath = isset($urlinfo["path"]) ? $urlinfo["path"] : "";

        // If the URL path contains "$baseDir/", keep everything after it as the internal name.
        $basePos = strpos($urlPath, $baseDir . "/");
        if ($basePos !== FALSE) {
            $internalSrc = $this->sanitizeFileName(urldecode(substr($urlPath, $basePos + strlen($baseDir) + 1)));
        }
        $internalPath = $urlScheme . "/" . $urlHost . "/" . pathinfo($urlPath, PATHINFO_DIRNAME);
        $isSourceExternal = TRUE;
        $imageData = $this->getImage($source);
    } else if (strpos($source, "/") === 0) {
        $internalPath = pathinfo($source, PATHINFO_DIRNAME);

        $path = $source;
        if (!file_exists($path)) {
            $path = $this->docRoot . $path;
        }

        $imageData = $this->getImage($path);
    } else {
        // Strip leading "/" and "." characters so "../" style prefixes do not end up in the internal path.
        $internalPath = $htmlDir . "/" . preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME));

        $path = $baseDir . "/" . $source;
        if (!file_exists($path)) {
            $path = $this->docRoot . $path;
        }

        $imageData = $this->getImage($path);
    }

    if ($imageData === FALSE) {
        return FALSE;
    }

    // Align the stored file extension with the one reported by getImage().
    $iSrcInfo = pathinfo($internalSrc);
    $currentExt = isset($iSrcInfo['extension']) ? $iSrcInfo['extension'] : NULL;
    if (!empty($imageData['ext']) && $imageData['ext'] != $currentExt) {
        $internalSrc = $iSrcInfo['filename'] . "." . $imageData['ext'];
    }

    $internalPath = Zip::getRelativePath("images/" . $internalPath . "/" . $internalSrc);
    if (!array_key_exists($internalPath, $this->fileList)) {
        $this->addFile($internalPath, "i_" . $internalSrc, $imageData['image'], $imageData['mime']);
        $this->fileList[$internalPath] = $source;
    }
    return TRUE;
}
1057 | |||
/**
 * Resolve a media src, determine its target location and add it to the book.
 *
 * Three kinds of $source are distinguished:
 *  - a full http(s)/ftp(s) URL: fetched through getFileContents($source, true); the returned value is
 *    used as the media path and is unlink()ed after it has been added (presumably a temp file path - verify against getFileContents());
 *  - a path starting with "/": used as given, or relative to the doc root when that file does not exist;
 *  - anything else: used relative to $baseDir, or relative to doc root + $baseDir when that file does not exist.
 *
 * @param string $source            Source link.
 * @param string &$internalPath     (referenced) Return value, will be set to the target path and name in the book.
 * @param string &$internalSrc      (referenced) Return value, will be set to the target name in the book.
 * @param bool   &$isSourceExternal (referenced) Return value, will be set to TRUE if the media originated from a full URL.
 * @param string $baseDir           Default is "", meaning it is pointing to the document root.
 * @param string $htmlDir           The path to the parent HTML file's directory from the root of the archive.
 * @param string $backPath          The path to get back to the root of the archive from $htmlDir. Not used by this method.
 *
 * @return bool FALSE if the book is finalized or the remote fetch returned FALSE, TRUE otherwise.
 */
protected function resolveMedia($source, &$internalPath, &$internalSrc, &$isSourceExternal, $baseDir = "", $htmlDir = "", $backPath = "") {
    if ($this->isFinalized) {
        return FALSE;
    }
    $mediaPath = NULL;
    // Only set for remote sources; NULL keeps the isset() check below false.
    $tmpFile = NULL;

    if (preg_match('#^(http|ftp)s?://#i', $source) == 1) {
        // parse_url() may fail or leave out components; default them to ""
        // instead of reading undefined indexes.
        $urlinfo = parse_url($source);
        if (!is_array($urlinfo)) {
            $urlinfo = array();
        }
        $urlScheme = isset($urlinfo["scheme"]) ? $urlinfo["scheme"] : "";
        $urlHost = isset($urlinfo["host"]) ? $urlinfo["host"] : "";
        $urlPath = isset($urlinfo["path"]) ? $urlinfo["path"] : "";

        // If the URL path contains "$baseDir/", keep everything after it as the internal name.
        $basePos = strpos($urlPath, $baseDir . "/");
        if ($basePos !== FALSE) {
            $internalSrc = substr($urlPath, $basePos + strlen($baseDir) + 1);
        }
        $internalPath = $urlScheme . "/" . $urlHost . "/" . pathinfo($urlPath, PATHINFO_DIRNAME);
        $isSourceExternal = TRUE;
        $mediaPath = $this->getFileContents($source, true);
        $tmpFile = $mediaPath;
    } else if (strpos($source, "/") === 0) {
        $internalPath = pathinfo($source, PATHINFO_DIRNAME);

        $mediaPath = $source;
        if (!file_exists($mediaPath)) {
            $mediaPath = $this->docRoot . $mediaPath;
        }
    } else {
        // Strip leading "/" and "." characters so "../" style prefixes do not end up in the internal path.
        $internalPath = $htmlDir . "/" . preg_replace('#^[/\.]+#', '', pathinfo($source, PATHINFO_DIRNAME));

        $mediaPath = $baseDir . "/" . $source;
        if (!file_exists($mediaPath)) {
            $mediaPath = $this->docRoot . $mediaPath;
        }
    }

    if ($mediaPath === FALSE) {
        return FALSE;
    }

    $mime = $this->getMime($source);
    $internalPath = Zip::getRelativePath("media/" . $internalPath . "/" . $internalSrc);

    if (!array_key_exists($internalPath, $this->fileList) &&
            $this->addLargeFile($internalPath, "m_" . $internalSrc, $mediaPath, $mime)) {
        $this->fileList[$internalPath] = $source;
    }
    if (isset($tmpFile)) {
        unlink($tmpFile);
    }
    return TRUE;
}
1117 | |||
/**
 * Get Book Chapter count.
 *
 * @access public
 * @return int The current value of the chapter counter.
 */
public function getChapterCount() {
    return $this->chapterCount;
}
1127 | |||
/**
 * Book title, mandatory.
 *
 * Used for the dc:title metadata parameter in the OPF file as well as the DocTitle attribute in the NCX file.
 *
 * @param string $title
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setTitle($title) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->title = $title;

    return TRUE;
}
1144 | |||
/**
 * Get Book title.
 *
 * @access public
 * @return string The title as stored by setTitle().
 */
public function getTitle() {
    return $this->title;
}
1154 | |||
/**
 * Book language, mandatory
 *
 * Use the RFC3066 Language codes, such as "en", "da", "fr" etc.
 * Defaults to "en".
 *
 * Only codes that are exactly two characters long are accepted; anything
 * else (for instance "en-US") is rejected and the stored language is left untouched.
 *
 * Used for the dc:language metadata parameter in the OPF file.
 *
 * @param string $language
 * @access public
 * @return bool $success FALSE if the book is finalized or $language is not two characters long, TRUE otherwise.
 */
public function setLanguage($language) {
    if ($this->isFinalized || mb_strlen($language) != 2) {
        return FALSE;
    }

    $this->language = $language;

    return TRUE;
}
1174 | |||
/**
 * Get Book language.
 *
 * @access public
 * @return string The language code as stored by setLanguage().
 */
public function getLanguage() {
    return $this->language;
}
1184 | |||
/**
 * Unique book identifier, mandatory.
 * Use the URI, or ISBN if available.
 *
 * An unambiguous reference to the resource within a given context.
 *
 * Recommended best practice is to identify the resource by means of a
 * string conforming to a formal identification system.
 *
 * Used for the dc:identifier metadata parameter in the OPF file, as well
 * as dtb:uid in the NCX file.
 *
 * Identifier type should only be:
 *  EPub::IDENTIFIER_URI
 *  EPub::IDENTIFIER_ISBN
 *  EPub::IDENTIFIER_UUID
 *
 * @param string $identifier
 * @param string $identifierType
 * @access public
 * @return bool $success FALSE if the book is finalized or $identifierType is not one of the three constants above.
 */
public function setIdentifier($identifier, $identifierType) {
    // Strict comparison keeps the original "!==" semantics.
    $allowedTypes = array(EPub::IDENTIFIER_URI, EPub::IDENTIFIER_ISBN, EPub::IDENTIFIER_UUID);

    if ($this->isFinalized || !in_array($identifierType, $allowedTypes, TRUE)) {
        return FALSE;
    }

    $this->identifier = $identifier;
    $this->identifierType = $identifierType;

    return TRUE;
}
1215 | |||
/**
 * Get Book identifier.
 *
 * @access public
 * @return string The identifier as stored by setIdentifier().
 */
public function getIdentifier() {
    return $this->identifier;
}
1225 | |||
/**
 * Get Book identifierType.
 *
 * @access public
 * @return string The identifier type as stored by setIdentifier().
 */
public function getIdentifierType() {
    return $this->identifierType;
}
1235 | |||
/**
 * Book description, optional.
 *
 * An account of the resource.
 *
 * Description may include but is not limited to: an abstract, a table of
 * contents, a graphical representation, or a free-text account of the
 * resource.
 *
 * NOTE(review): the original comment said this feeds the dc:source metadata
 * parameter in the OPF file, which looks copy-pasted from setSourceURL();
 * presumably it is dc:description - confirm against the finalize code.
 *
 * @param string $description
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setDescription($description) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->description = $description;

    return TRUE;
}
1258 | |||
/**
 * Get Book description.
 *
 * @access public
 * @return string The description as stored by setDescription().
 */
public function getDescription() {
    return $this->description;
}
1268 | |||
/**
 * Book author or creator, optional.
 * The $authorSortKey is basically how the name is to be sorted, usually
 * it's "Lastname, First names" where the $author is the straight
 * "Firstnames Lastname"
 *
 * An entity primarily responsible for making the resource.
 *
 * Examples of a Creator include a person, an organization, or a service.
 * Typically, the name of a Creator should be used to indicate the entity.
 *
 * Used for the dc:creator metadata parameter in the OPF file and the
 * docAuthor attribute in the NCX file.
 * The sort key is used for the opf:file-as attribute in dc:creator.
 *
 * @param string $author
 * @param string $authorSortKey
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setAuthor($author, $authorSortKey) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->author = $author;
    $this->authorSortKey = $authorSortKey;

    return TRUE;
}
1297 | |||
/**
 * Get Book author.
 *
 * @access public
 * @return string The author as stored by setAuthor().
 */
public function getAuthor() {
    return $this->author;
}
1307 | |||
/**
 * Publisher Information, optional.
 *
 * An entity responsible for making the resource available.
 *
 * Examples of a Publisher include a person, an organization, or a service.
 * Typically, the name of a Publisher should be used to indicate the entity.
 *
 * Used for the dc:publisher and dc:relation metadata parameters in the OPF file.
 *
 * @param string $publisherName
 * @param string $publisherURL
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setPublisher($publisherName, $publisherURL) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->publisherName = $publisherName;
    $this->publisherURL = $publisherURL;

    return TRUE;
}
1331 | |||
/**
 * Get Book publisherName.
 *
 * @access public
 * @return string The publisher name as stored by setPublisher().
 */
public function getPublisherName() {
    return $this->publisherName;
}
1341 | |||
/**
 * Get Book publisherURL.
 *
 * @access public
 * @return string The publisher URL as stored by setPublisher().
 */
public function getPublisherURL() {
    return $this->publisherURL;
}
1351 | |||
/**
 * Release date, optional. If left blank, the time of the finalization will
 * be used.
 *
 * A point or period of time associated with an event in the lifecycle of
 * the resource.
 *
 * Date may be used to express temporal information at any level of
 * granularity. Recommended best practice is to use an encoding scheme,
 * such as the W3CDTF profile of ISO 8601 [W3CDTF].
 *
 * Used for the dc:date metadata parameter in the OPF file
 *
 * @param int $timestamp
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setDate($timestamp) {
    if ($this->isFinalized) {
        return FALSE;
    }

    // The timestamp is stored on the book and also copied onto the OPF object.
    $this->date = $timestamp;
    $this->opf->date = $timestamp;

    return TRUE;
}
1377 | |||
/**
 * Get Book date.
 *
 * @access public
 * @return int The timestamp as stored by setDate().
 */
public function getDate() {
    return $this->date;
}
1387 | |||
/**
 * Book (copy)rights, optional.
 *
 * Information about rights held in and over the resource.
 *
 * Typically, rights information includes a statement about various
 * property rights associated with the resource, including intellectual
 * property rights.
 *
 * Used for the dc:rights metadata parameter in the OPF file
 *
 * @param string $rightsText
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setRights($rightsText) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->rights = $rightsText;

    return TRUE;
}
1410 | |||
/**
 * Get Book rights.
 *
 * @access public
 * @return string The rights text as stored by setRights().
 */
public function getRights() {
    return $this->rights;
}
1420 | |||
/**
 * Add book Subject.
 *
 * The topic of the resource.
 *
 * Typically, the subject will be represented using keywords, key phrases,
 * or classification codes. Recommended best practice is to use a
 * controlled vocabulary. To describe the spatial or temporal topic of the
 * resource, use the Coverage element.
 *
 * The subject is HTML-entity decoded and added to the OPF object as a
 * DublinCore::SUBJECT entry. Nothing happens once the book is finalized.
 *
 * @param string $subject
 * @access public
 * @return void
 */
public function setSubject($subject) {
    if ($this->isFinalized) {
        return;
    }

    $decodedSubject = $this->decodeHtmlEntities($subject);
    $this->opf->addDCMeta(DublinCore::SUBJECT, $decodedSubject);
}
1439 | |||
/**
 * Book source URL, optional.
 *
 * A related resource from which the described resource is derived.
 *
 * The described resource may be derived from the related resource in whole
 * or in part. Recommended best practice is to identify the related
 * resource by means of a string conforming to a formal identification system.
 *
 * Used for the dc:source metadata parameter in the OPF file
 *
 * @param string $sourceURL
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setSourceURL($sourceURL) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->sourceURL = $sourceURL;

    return TRUE;
}
1462 | |||
/**
 * Get Book sourceURL.
 *
 * @access public
 * @return string The source URL as stored by setSourceURL().
 */
public function getSourceURL() {
    return $this->sourceURL;
}
1472 | |||
/**
 * Coverage, optional.
 *
 * The spatial or temporal topic of the resource, the spatial applicability
 * of the resource, or the jurisdiction under which the resource is relevant.
 *
 * Spatial topic and spatial applicability may be a named place or a location
 * specified by its geographic coordinates. Temporal topic may be a named
 * period, date, or date range. A jurisdiction may be a named administrative
 * entity or a geographic place to which the resource applies. Recommended
 * best practice is to use a controlled vocabulary such as the Thesaurus of
 * Geographic Names [TGN]. Where appropriate, named places or time periods
 * can be used in preference to numeric identifiers such as sets of
 * coordinates or date ranges.
 *
 * Used for the dc:coverage metadata parameter in the OPF file
 *
 * Same as ->addDublinCoreMetadata(DublinCore::COVERAGE, $coverage);
 *
 * @param string $coverage
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setCoverage($coverage) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->coverage = $coverage;

    return TRUE;
}
1503 | |||
/**
 * Get Book coverage.
 *
 * @access public
 * @return string The coverage as stored by setCoverage().
 */
public function getCoverage() {
    return $this->coverage;
}
1513 | |||
/**
 * Set book Relation.
 *
 * A related resource.
 *
 * Recommended best practice is to identify the related resource by means
 * of a string conforming to a formal identification system.
 *
 * Nothing happens once the book is finalized.
 *
 * @param string $relation
 * @access public
 * @return void
 */
public function setRelation($relation) {
    if ($this->isFinalized) {
        return;
    }

    $this->relation = $relation;
}
1530 | |||
/**
 * Get the book relation.
 *
 * @access public
 * @return string The relation.
 */
public function getRelation() {
    return $this->relation;
}
1539 | |||
/**
 * Set book Generator.
 *
 * The generator is a meta tag added to the ncx file, it is not visible
 * from within the book, but is a kind of electronic watermark.
 *
 * Nothing happens once the book is finalized.
 *
 * @param string $generator
 * @access public
 * @return void
 */
public function setGenerator($generator) {
    if ($this->isFinalized) {
        return;
    }

    $this->generator = $generator;
}
1554 | |||
/**
 * Get the book generator.
 *
 * @access public
 * @return string The generator identity string.
 */
public function getGenerator() {
    return $this->generator;
}
1563 | |||
/**
 * Set ePub date format to the short yyyy-mm-dd form, for compliance with
 * a bug in EpubCheck, prior to its version 1.1.
 *
 * The latest version of ePubCheck can be obtained here:
 * http://code.google.com/p/epubcheck/
 *
 * @access public
 * @return bool $success FALSE if the book is already finalized, TRUE otherwise.
 */
public function setShortDateFormat() {
    if ($this->isFinalized) {
        return FALSE;
    }

    // Switch the active format over to the predefined short one.
    $this->dateformat = $this->dateformatShort;

    return TRUE;
}
1581 | |||
/**
 * Formerly configured handling of empty buffers; the functionality no longer exists.
 *
 * Calling this method terminates the script with an explanatory message.
 *
 * @deprecated Do not call; remove any remaining calls.
 *
 * @param bool $ignoreEmptyBuffer Ignored.
 * @access public
 */
public function setIgnoreEmptyBuffer($ignoreEmptyBuffer = TRUE) {
    die ("Function was deprecated, functionality is no longer needed.");
}
1588 | |||
/**
 * Set the references title for the ePub 3 landmarks section
 *
 * String arguments are trimmed before being stored on the NCX object.
 * A non-string argument falls back to "Guide" (title) or "references" (id and class).
 *
 * @param string $referencesTitle
 * @param string $referencesId
 * @param string $referencesClass
 * @access public
 * @return bool FALSE if the book is already finalized, TRUE otherwise.
 */
public function setReferencesTitle($referencesTitle = "Guide", $referencesId = "", $referencesClass = "references") {
    if ($this->isFinalized) {
        return FALSE;
    }

    $ncx = $this->ncx;
    $ncx->referencesTitle = is_string($referencesTitle) ? trim($referencesTitle) : "Guide";
    $ncx->referencesId = is_string($referencesId) ? trim($referencesId) : "references";
    $ncx->referencesClass = is_string($referencesClass) ? trim($referencesClass) : "references";

    return TRUE;
}
1606 | |||
/**
 * Set the isReferencesAddedToToc flag.
 *
 * Only the boolean TRUE enables the flag; every other value (including
 * truthy non-booleans such as 1 or "yes") stores FALSE.
 *
 * @param bool $isReferencesAddedToToc
 * @access public
 * @return bool FALSE if the book is already finalized, TRUE otherwise.
 */
public function setisReferencesAddedToToc($isReferencesAddedToToc = TRUE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->isReferencesAddedToToc = ($isReferencesAddedToToc === TRUE);

    return TRUE;
}
1619 | |||
/**
 * Get Book status.
 *
 * @access public
 * @return bool The current value of the finalized flag.
 */
public function isFinalized() {
    return $this->isFinalized;
}
1629 | |||
/**
 * Build the Table of Contents. This is not strictly necessary, as most eReaders will build it from the navigation structure in the .ncx file.
 *
 * This only records the TOC settings and registers the TOC with the OPF
 * (item ref + reference) and the NCX; the page itself is rendered later by finalizeTOC().
 *
 * @param string $cssFileName   Include a link to this css file in the TOC html.
 * @param string $tocCSSClass   The TOC is a <div>, if you need special formatting, you can add a css class for that div. Default is "toc".
 * @param string $title         Title of the Table of contents. Default is "Table of Contents". Use this for ie. languages other than English.
 * @param bool   $addReferences include reference pages in the TOC, using the $referencesOrder array to determine the order of the pages in the TOC. Default is TRUE.
 * @param bool   $addToIndex    Add the TOC to the NCX index at the current level/position. Default is FALSE
 * @param string $tocFileName   Change the default name of the TOC file. The default is "TOC.xhtml"
 *
 * @access public
 * @return bool FALSE if the book is already finalized, TRUE otherwise.
 */
public function buildTOC($cssFileName = NULL, $tocCSSClass = "toc", $title = "Table of Contents", $addReferences = TRUE, $addToIndex = FALSE, $tocFileName = "TOC.xhtml") {
    if ($this->isFinalized) {
        return FALSE;
    }

    $this->buildTOC = TRUE;
    $this->tocTitle = $title;
    $this->tocFileName = $this->normalizeFileName($tocFileName);
    $this->tocCSSClass = $tocCSSClass;
    $this->tocAddReferences = $addReferences;

    if (!empty($cssFileName)) {
        // finalizeTOC() reads $this->tocCssFileName, but the original only
        // wrote $this->tocCSSFileName. Property names are case-sensitive, so
        // the stylesheet link was never emitted. Both spellings are written
        // so any other reader of the old one keeps working.
        $normalizedCss = $this->normalizeFileName($cssFileName);
        $this->tocCssFileName = $normalizedCss;
        $this->tocCSSFileName = $normalizedCss;
    }

    $this->opf->addItemRef("ref_" . Reference::TABLE_OF_CONTENTS, FALSE);
    $this->opf->addReference(Reference::TABLE_OF_CONTENTS, $title, $this->tocFileName);

    if ($addToIndex) {
        $navPoint = new NavPoint($this->decodeHtmlEntities($title), $this->tocFileName, "ref_" . Reference::TABLE_OF_CONTENTS);
        $this->ncx->addNavPoint($navPoint);
    } else {
        $this->ncx->referencesList[Reference::TABLE_OF_CONTENTS] = $this->tocFileName;
        $this->ncx->referencesName[Reference::TABLE_OF_CONTENTS] = $title;
    }

    return TRUE;
}
1664 | |||
/**
 * Render the Table of Contents page requested through buildTOC() and add
 * it to the book as a reference page.
 *
 * The page lists, in the order given by $this->referencesOrder:
 *  - for the "text" entry: one line per chapter in the NCX chapter list;
 *  - for other entries (only when references were requested): the matching
 *    reference page, the TOC itself, or the cover page when a cover image is set.
 *
 * @return bool|null FALSE when buildTOC() was never called; otherwise no value is returned.
 */
private function finalizeTOC() {
    if (!$this->buildTOC) {
        return FALSE;
    }

    if (empty($this->tocTitle)) {
        $this->tocTitle = "Table of Contents";
    }

    $tocData = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";

    if ($this->isEPubVersion2()) {
        $tocData .= "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\n"
                . "    \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"
                . "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"
                . "<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n";
    } else {
        $tocData .= "<html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\">\n"
                . "<head>\n<meta http-equiv=\"Default-Style\" content=\"text/html; charset=utf-8\" />\n";
    }

    if (!empty($this->tocCssFileName)) {
        $tocData .= "<link rel=\"stylesheet\" type=\"text/css\" href=\"" . $this->tocCssFileName . "\" />\n";
    }

    $tocData .= "<title>" . $this->tocTitle . "</title>\n"
            . "</head>\n"
            . "<body>\n"
            . "<h3>" . $this->tocTitle . "</h3>\n<div";

    if (!empty($this->tocCSSClass)) {
        $tocData .= " class=\"" . $this->tocCSSClass . "\"";
    }
    $tocData .= ">\n";

    // foreach replaces the original while(list() = each()) loops: each() no
    // longer exists in PHP 8, and foreach always starts at the first element
    // whatever the array's internal pointer is.
    foreach ($this->referencesOrder as $item => $descriptive) {
        if ($item === "text") {
            foreach ($this->ncx->chapterList as $chapterName => $navPoint) {
                $fileName = $navPoint->getContentSrc();
                // Indent two levels below the nav point level; never pass a
                // negative count to str_repeat().
                $level = max(0, $navPoint->getLevel() - 2);
                // One indent step is three "space + non-breaking space" pairs.
                // NOTE(review): written as UTF-8 escapes because the characters are
                // indistinguishable from plain spaces in this view - confirm.
                $tocData .= "\t<p>" . str_repeat(" \xC2\xA0 \xC2\xA0 \xC2\xA0", $level) . "<a href=\"" . $this->sanitizeFileName($fileName) . "\">" . $chapterName . "</a></p>\n";
            }
        } else if ($this->tocAddReferences === TRUE) {
            if (array_key_exists($item, $this->ncx->referencesList)) {
                $tocData .= "\t<p><a href=\"" . $this->ncx->referencesList[$item] . "\">" . $descriptive . "</a></p>\n";
            } else if ($item === "toc") {
                // Link to the configured TOC file rather than a hard-coded
                // "TOC.xhtml", which breaks when buildTOC() gets a custom name.
                $tocData .= "\t<p><a href=\"" . $this->tocFileName . "\">" . $this->tocTitle . "</a></p>\n";
            } else if ($item === "cover" && $this->isCoverImageSet) {
                $tocData .= "\t<p><a href=\"CoverPage.xhtml\">" . $descriptive . "</a></p>\n";
            }
        }
    }
    $tocData .= "</div>\n</body>\n</html>\n";

    $this->addReferencePage($this->tocTitle, $this->tocFileName, $tocData, Reference::TABLE_OF_CONTENTS);
}
1722 | |||
/**
 * Tell whether this book is being built as an EPUB 2 publication.
 *
 * @return bool TRUE when the configured book version is EPub::BOOK_VERSION_EPUB2.
 */
function isEPubVersion2() {
    $isVersion2 = (EPub::BOOK_VERSION_EPUB2 === $this->bookVersion);
    return $isVersion2;
}
1729 | |||
/**
 * Produce the EPUB 3 navigation document from the NCX data.
 *
 * Hands the current reference order and the entity-decoded book title to the NCX
 * object, then returns whatever its finalizeEPub3() produces.
 *
 * @param string $cssFileName Optional stylesheet name passed through to the NCX object.
 * @param string $title       Heading for the navigation document.
 * @return string
 */
function buildEPub3TOC($cssFileName = NULL, $title = "Table of Contents") {
    $docTitle = $this->decodeHtmlEntities($this->title);

    $this->ncx->referencesOrder = $this->referencesOrder;
    $this->ncx->setDocTitle($docTitle);

    $navDocument = $this->ncx->finalizeEPub3($title, $cssFileName);
    return $navDocument;
}
1740 | |||
/**
 * Add the EPUB 3 navigation document to the archive and register it in the OPF manifest.
 *
 * The file name is normalized before the duplicate check, so the check uses the same
 * key form that is stored in the file list. The raw name is still checked as well,
 * to keep rejecting everything the previous implementation rejected.
 *
 * @param string $fileName Target file name inside the book.
 * @param string $tocData  Content of the navigation document.
 * @return bool FALSE for EPUB 2 books, finalized books, or an already used file name; TRUE otherwise.
 */
function addEPub3TOC($fileName, $tocData) {
    if ($this->isEPubVersion2() || $this->isFinalized) {
        return FALSE;
    }

    $normalizedName = $this->normalizeFileName($fileName);
    if (array_key_exists($fileName, $this->fileList) || array_key_exists($normalizedName, $this->fileList)) {
        return FALSE;
    }

    $this->zip->addFile($tocData, $this->bookRoot . $normalizedName);

    $this->fileList[$normalizedName] = $normalizedName;
    $this->opf->addItem("toc", $normalizedName, "application/xhtml+xml", "nav");
    return TRUE;
}
1759 | |||
/**
 * Check for mandatory parameters and finalize the e-book.
 * Once finalized, the book is locked for further additions.
 *
 * Missing identifier, date, source URL and publisher URL are filled with defaults,
 * the metadata is pushed into the OPF and NCX objects, the table of contents pages
 * are generated, and book.opf / book.ncx are written to the archive as UTF-8.
 *
 * @return bool FALSE if the book is already finalized, has no chapters, or lacks a
 *              title or language; TRUE once the book has been finalized.
 */
function finalize() {
    if ($this->isFinalized || $this->chapterCount == 0 || empty($this->title) || empty($this->language)) {
        return FALSE;
    }

    if (empty($this->identifier) || empty($this->identifierType)) {
        $this->setIdentifier($this->createUUID(4), EPub::IDENTIFIER_UUID);
    }

    if ($this->date == 0) {
        $this->date = time();
    }

    if (empty($this->sourceURL)) {
        $this->sourceURL = $this->getCurrentPageURL();
    }

    // Default the publisher URL itself. This branch used to assign to sourceURL,
    // which clobbered the source URL and left publisherURL empty.
    if (empty($this->publisherURL)) {
        $this->publisherURL = $this->getCurrentServerURL();
    }

    // Generate OPF data:
    $this->opf->setIdent("BookId");
    $this->opf->initialize($this->title, $this->language, $this->identifier, $this->identifierType);

    $DCdate = new DublinCore(DublinCore::DATE, gmdate($this->dateformat, $this->date));
    $DCdate->addOpfAttr("event", "publication");
    $this->opf->metadata->addDublinCore($DCdate);

    if (!empty($this->description)) {
        $this->opf->addDCMeta(DublinCore::DESCRIPTION, $this->decodeHtmlEntities($this->description));
    }

    if (!empty($this->publisherName)) {
        $this->opf->addDCMeta(DublinCore::PUBLISHER, $this->decodeHtmlEntities($this->publisherName));
    }

    if (!empty($this->publisherURL)) {
        $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->publisherURL));
    }

    if (!empty($this->author)) {
        $author = $this->decodeHtmlEntities($this->author);
        $this->opf->addCreator($author, $this->decodeHtmlEntities($this->authorSortKey), MarcCode::AUTHOR);
        $this->ncx->setDocAuthor($author);
    }

    if (!empty($this->rights)) {
        $this->opf->addDCMeta(DublinCore::RIGHTS, $this->decodeHtmlEntities($this->rights));
    }

    if (!empty($this->coverage)) {
        $this->opf->addDCMeta(DublinCore::COVERAGE, $this->decodeHtmlEntities($this->coverage));
    }

    if (!empty($this->sourceURL)) {
        $this->opf->addDCMeta(DublinCore::SOURCE, $this->sourceURL);
    }

    if (!empty($this->relation)) {
        $this->opf->addDCMeta(DublinCore::RELATION, $this->decodeHtmlEntities($this->relation));
    }

    if ($this->isCoverImageSet) {
        $this->opf->addMeta("cover", "coverImage");
    }

    if (!empty($this->generator)) {
        $gen = $this->decodeHtmlEntities($this->generator);
        $this->opf->addMeta("generator", $gen);
        $this->ncx->addMetaEntry("dtb:generator", $gen);
    }

    if ($this->EPubMark) {
        $this->opf->addMeta("generator", "EPub (Version " . self::VERSION . ") by A. Grandt, http://www.phpclasses.org/package/6115");
    }

    // Look at the first chapter with reset()/key()/current() rather than each():
    // each() was removed in PHP 8, and it advanced the internal pointer, so pointer-based
    // loops that run later (the TOC builder) would start at the second chapter.
    reset($this->ncx->chapterList);
    $firstChapterName = key($this->ncx->chapterList);
    $firstChapterNavPoint = current($this->ncx->chapterList);
    if (is_object($firstChapterNavPoint)) {
        $firstChapterFileName = $firstChapterNavPoint->getContentSrc();
        $this->opf->addReference(Reference::TEXT, $this->decodeHtmlEntities($firstChapterName), $firstChapterFileName);
    }

    $this->ncx->setUid($this->identifier);

    $this->ncx->setDocTitle($this->decodeHtmlEntities($this->title));

    $this->ncx->referencesOrder = $this->referencesOrder;
    if ($this->isReferencesAddedToToc) {
        $this->ncx->finalizeReferences();
    }

    $this->finalizeTOC();

    if (!$this->isEPubVersion2()) {
        $this->addEPub3TOC("epub3toc.xhtml", $this->buildEPub3TOC());
    }

    $opfFinal = $this->fixEncoding($this->opf->finalize());
    $ncxFinal = $this->fixEncoding($this->ncx->finalize());

    if (mb_detect_encoding($opfFinal, 'UTF-8', true) === "UTF-8") {
        $this->zip->addFile($opfFinal, $this->bookRoot . "book.opf");
    } else {
        $this->zip->addFile(mb_convert_encoding($opfFinal, "UTF-8"), $this->bookRoot . "book.opf");
    }

    if (mb_detect_encoding($ncxFinal, 'UTF-8', true) === "UTF-8") {
        $this->zip->addFile($ncxFinal, $this->bookRoot . "book.ncx");
    } else {
        $this->zip->addFile(mb_convert_encoding($ncxFinal, "UTF-8"), $this->bookRoot . "book.ncx");
    }

    // The OPF and NCX builders are released here; the book accepts no further additions.
    $this->opf = NULL;
    $this->ncx = NULL;

    $this->isFinalized = TRUE;
    return TRUE;
}
1884 | |||
/**
 * Ensure the given string is valid UTF-8.
 *
 * A string that already validates as UTF-8 (plain ASCII included) is returned unchanged.
 * Anything else is treated as ISO-8859-1 and converted, which is what the deprecated
 * utf8_encode() did. Validation uses mb_check_encoding() only, so the result no longer
 * depends on the configured mbstring detection order.
 *
 * @param string $in_str
 * @return string converted string.
 */
function fixEncoding($in_str) {
    if (mb_check_encoding($in_str, "UTF-8")) {
        return $in_str;
    }
    return mb_convert_encoding($in_str, "UTF-8", "ISO-8859-1");
}
1901 | |||
/**
 * Return the finalized book, finalizing it first if that has not happened yet.
 *
 * @return string the archive data as returned by the Zip object.
 */
function getBook() {
    $this->isFinalized || $this->finalize();

    $bookData = $this->zip->getZipData();
    return $bookData;
}
1914 | |||
/**
 * Turn a string into a nearly safe file name.
 *
 * Characters listed in $this->forbiddenCharacters are dropped, every run of
 * whitespace and dashes becomes a single dash, and leading or trailing dots,
 * dashes and underscores are trimmed.
 *
 * @param string $fileName
 * @return mixed|string
 */
function sanitizeFileName($fileName) {
    $stripped = str_replace($this->forbiddenCharacters, '', $fileName);
    return trim(preg_replace('/[\s-]+/', '-', $stripped), '.-_');
}
1927 | |||
/**
 * Clean up a file path and strip leading "." and "/" characters.
 *
 * Paths assembled from several fragments can look like
 * "../data/html/../images/image.jpeg"; ePub files do not work well with that.
 * The path is first resolved through Zip::getRelativePath(), which per the original
 * notes turns that example into "data/images/image.jpeg".
 *
 * @param string $fileName
 * @return string normalized filename
 */
function normalizeFileName($fileName) {
    $relativePath = Zip::getRelativePath($fileName);
    $normalized = preg_replace('#^[/\.]+#i', "", $relativePath);
    return $normalized;
}
1942 | |||
/**
 * Save the ePub file to local disk.
 *
 * The file name is sanitized and given an ".epub" suffix if it lacks one. The book is
 * finalized first when necessary. The file is opened in binary mode and the write is
 * verified, so a failed or short write is reported as FALSE instead of success.
 *
 * @param string $fileName
 * @param string $baseDir If empty baseDir is absolute to server path, if omitted it's relative to script path
 * @return string|bool The saved file name if successful, FALSE if it failed.
 */
function saveBook($fileName, $baseDir = '.') {
    // Make fileName safe
    $fileName = $this->sanitizeFileName($fileName);

    // Finalize book, if it's not done already
    if (!$this->isFinalized) {
        $this->finalize();
    }

    // Case-insensitive "ends with .epub" test, done on the reversed name.
    if (stripos(strrev($fileName), "bupe.") !== 0) {
        $fileName .= ".epub";
    }

    // "b" keeps the archive bytes untouched on platforms that distinguish text mode.
    $fh = fopen($baseDir . '/' . $fileName, "wb");
    if ($fh === FALSE) {
        return FALSE;
    }

    $bookData = $this->getBook();
    $written = fwrite($fh, $bookData);
    fclose($fh);

    if ($written === FALSE || $written < strlen($bookData)) {
        return FALSE;
    }

    return $fileName;
}
1978 | |||
/**
 * Return the size of the finalized book, finalizing it first if needed.
 *
 * @return string the archive size as reported by the Zip object.
 */
function getBookSize() {
    if ($this->isFinalized === FALSE || !$this->isFinalized) {
        $this->finalize();
    }

    $archiveSize = $this->zip->getArchiveSize();
    return $archiveSize;
}
1991 | |||
/**
 * Send the book as a zip download.
 *
 * Sending will fail if the output buffer is in use. You can override this limit by
 * calling setIgnoreEmptyBuffer(TRUE), though the function will still fail if that
 * buffer is not empty.
 *
 * @param string $fileName The name of the book; ".epub" is appended when it is missing.
 * @return string|bool The sent file name if successful, FALSE if it failed.
 */
function sendBook($fileName) {
    if (!$this->isFinalized) {
        $this->finalize();
    }

    // Case-insensitive suffix test on the reversed name.
    $hasEpubSuffix = (stripos(strrev($fileName), "bupe.") === 0);
    if (!$hasEpubSuffix) {
        $fileName .= ".epub";
    }

    $wasSent = $this->zip->sendZip($fileName, "application/epub+zip");
    return ($wasSent === TRUE) ? $fileName : FALSE;
}
2016 | |||
/**
 * Generate a UUID through the bundled UUID library.
 *
 * The default version (4) generates a random UUID; version 3 generates a URL based UUID.
 * Added for convenience.
 *
 * @param int    $bookVersion UUID version to retrieve, see lib.uuid.manual.html for details.
 * @param string $url         Passed to UUID::mint() together with the UUID::nsURL namespace.
 * @return string The formatted uuid
 */
function createUUID($bookVersion = 4, $url = NULL) {
    include_once("lib.uuid.php");

    $uuid = UUID::mint($bookVersion, $url, UUID::nsURL);
    return $uuid;
}
2032 | |||
/**
 * Get the URL of the current page: the server URL followed by the request URI.
 * Example use: default source URL.
 *
 * @return string Page URL.
 */
function getCurrentPageURL() {
    return $this->getCurrentServerURL() . filter_input(INPUT_SERVER, "REQUEST_URI");
}
2043 | |||
/**
 * Get the URL of the server, built from the HTTPS, SERVER_NAME and SERVER_PORT
 * server variables.
 * Example use: default publisher URL.
 *
 * The port is only appended when it is known and differs from the default port of
 * the scheme in use (80 for http, 443 for https). An unknown port, for example when
 * running from the command line, no longer produces a dangling ":".
 *
 * @return string Server URL, always ending in "/".
 */
function getCurrentServerURL() {
    $https = filter_input(INPUT_SERVER, "HTTPS");
    $port = filter_input(INPUT_SERVER, "SERVER_PORT");

    $isSecure = ($https === "on");
    $defaultPort = $isSecure ? "443" : "80";

    $serverURL = ($isSecure ? "https" : "http") . "://" . filter_input(INPUT_SERVER, "SERVER_NAME");
    if (!empty($port) && $port != $defaultPort) {
        $serverURL .= ":" . $port;
    }
    return $serverURL . '/';
}
2064 | |||
/**
 * Try to determine the mimetype of the file path from its extension.
 *
 * The extension is looked up in $this->mimetypes as written, then in lower case.
 * Unknown extensions return FALSE instead of triggering an undefined index notice.
 *
 * @param string $source Path
 * @return string|bool mimetype, or FALSE.
 */
function getMime($source) {
    $extension = pathinfo($source, PATHINFO_EXTENSION);

    if (isset($this->mimetypes[$extension])) {
        return $this->mimetypes[$extension];
    }

    $lowerExtension = strtolower($extension);
    if (isset($this->mimetypes[$lowerExtension])) {
        return $this->mimetypes[$lowerExtension];
    }

    return FALSE;
}
2074 | |||
/**
 * Get an image from a file or url, return it resized if the image exceeds the $maxImageWidth or $maxImageHeight directives.
 *
 * The return value is an array.
 * ['width'] is the width of the image.
 * ['height'] is the height of the image.
 * ['mime'] is the mime type of the image. Resized images are written as png when the
 *          source was detected as png, otherwise as jpeg.
 * ['image'] is the image data.
 * ['ext'] is the extension of the image file.
 *
 * The image is decoded once and that GD handle is reused for resampling and always
 * released. The reported width and height are the integer size of the data returned.
 *
 * @param string $source path or url to file.
 * @return array|bool the array described above, or FALSE when the image cannot be read or decoded.
 */
function getImage($source) {
    $mime = "application/octet-stream";
    $ext = "";

    $image = $this->getFileContents($source);
    if ($image === FALSE || strlen($image) == 0) {
        return FALSE;
    }

    $imageFile = imagecreatefromstring($image);
    if ($imageFile === FALSE) {
        return FALSE;
    }

    $width = imagesx($imageFile);
    $height = imagesy($imageFile);
    if ($width <= 0 || $height <= 0) {
        imagedestroy($imageFile);
        return FALSE;
    }

    // Mime detection, most specific source first: exif, then magic bytes, then the URL extension.
    if ($this->isExifInstalled) {
        $type = @exif_imagetype($source);
        if ($type !== FALSE) {
            $mime = image_type_to_mime_type($type);
        }
    }
    if ($mime === "application/octet-stream") {
        $mime = $this->image_file_type_from_binary($image);
    }
    if ($mime === "application/octet-stream") {
        $mime = $this->getMimeTypeFromUrl($source);
    }

    $ratio = 1;
    $newWidth = $width;
    $newHeight = $height;

    if ($this->isGdInstalled) {
        if ($width > $this->maxImageWidth) {
            $ratio = $this->maxImageWidth / $width;
        }
        if ($height * $ratio > $this->maxImageHeight) {
            $ratio = $this->maxImageHeight / $height;
        }

        if ($ratio < 1 || empty($mime) || ($this->isGifImagesEnabled !== FALSE && $mime == "image/gif")) {
            // GD needs integer dimensions of at least 1 pixel.
            $newWidth = max(1, (int) floor($width * $ratio));
            $newHeight = max(1, (int) floor($height * $ratio));

            $image_p = imagecreatetruecolor($newWidth, $newHeight);

            if ($mime == "image/png") {
                // Keep the alpha channel of the source when writing the resized PNG.
                imagealphablending($image_p, false);
                imagesavealpha($image_p, true);
                imagealphablending($imageFile, true);

                imagecopyresampled($image_p, $imageFile, 0, 0, 0, 0, $newWidth, $newHeight, $width, $height);
                ob_start();
                imagepng($image_p, NULL, 9);
                $image = ob_get_clean();

                $ext = "png";
            } else {
                imagecopyresampled($image_p, $imageFile, 0, 0, 0, 0, $newWidth, $newHeight, $width, $height);
                ob_start();
                imagejpeg($image_p, NULL, 80);
                $image = ob_get_clean();

                $mime = "image/jpeg";
                $ext = "jpg";
            }
            imagedestroy($image_p);
        }
    }

    imagedestroy($imageFile);

    if ($ext === "") {
        static $mimeToExt = array (
            'image/jpeg' => 'jpg',
            'image/gif' => 'gif',
            'image/png' => 'png'
        );

        if (isset($mimeToExt[$mime])) {
            $ext = $mimeToExt[$mime];
        }
    }

    $rv = array();
    $rv['width'] = $newWidth;
    $rv['height'] = $newHeight;
    $rv['mime'] = $mime;
    $rv['image'] = $image;
    $rv['ext'] = $ext;

    return $rv;
}
2184 | |||
/**
 * Get file contents, using curl for http(s)/ftp(s) sources when it is available,
 * else file_get_contents.
 *
 * With curl, only a response reporting http_code 200 and a non-empty result counts as
 * success. When $toTempFile is TRUE the download is streamed into a temporary file,
 * which is deleted again if the download fails. $toTempFile is only honoured on the
 * curl path; the file_get_contents fallback always returns the data itself.
 *
 * @param string $source     Local path or URL.
 * @param bool   $toTempFile Store a curl download in a temporary file and return its path.
 * @return string|bool the data (or the temporary file path), FALSE on failure.
 */
function getFileContents($source, $toTempFile = FALSE) {
    $isExternal = preg_match('#^(http|ftp)s?://#i', $source) == 1;

    if ($isExternal && $this->isCurlInstalled) {
        $ch = curl_init();
        $outFile = NULL;
        $fp = NULL;

        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_URL, str_replace(" ", "%20", $source));
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_BUFFERSIZE, 4096);

        if ($toTempFile) {
            $outFile = tempnam(sys_get_temp_dir(), "EPub_v" . EPub::VERSION . "_");
            $fp = ($outFile !== FALSE) ? fopen($outFile, "w+b") : FALSE;
            if ($fp === FALSE) {
                // No usable temporary file: give up instead of handing curl an invalid handle.
                curl_close($ch);
                if ($outFile !== FALSE && is_file($outFile)) {
                    unlink($outFile);
                }
                return FALSE;
            }
            curl_setopt($ch, CURLOPT_FILE, $fp);
        }

        $res = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);

        if ($toTempFile) {
            fclose($fp);
        }

        if ($info['http_code'] == 200 && $res != false) {
            if ($toTempFile) {
                return $outFile;
            }
            return $res;
        }

        // Do not leave the temporary file of a failed download behind.
        if ($toTempFile && is_file($outFile)) {
            unlink($outFile);
        }
        return FALSE;
    }

    if ($this->isFileGetContentsInstalled && (!$isExternal || $this->isFileGetContentsExtInstalled)) {
        @$data = file_get_contents($source);
        return $data;
    }
    return FALSE;
}
2239 | |||
/**
 * Get the mime type from the leading magic bytes of image data.
 *
 * Recognizes JPEG, GIF, PNG, BMP, TIFF and ILBM signatures. The pattern runs with the
 * "s" modifier so the four ILBM length bytes may contain a newline byte, and it also
 * accepts the big-endian TIFF signature ("MM\0*").
 *
 * By fireweasel found on http://stackoverflow.com/questions/2207095/get-image-mimetype-from-resource-in-php-gd
 * @staticvar array $type
 * @param string $binary raw image data
 * @return string mime type, "application/octet-stream" when no signature matches.
 */
function image_file_type_from_binary($binary) {
    static $type = array (
        1 => 'image/jpeg',
        2 => 'image/gif',
        3 => 'image/png',
        4 => 'image/x-windows-bmp',
        5 => 'image/tiff',
        6 => 'image/x-ilbm',
    );

    $hits = array();
    if (!preg_match(
        '/\A(?:(\xff\xd8\xff)|(GIF8[79]a)|(\x89PNG\x0d\x0a)|(BM)|(\x49\x49(?:\x2a\x00|\x00\x4a)|\x4d\x4d\x00\x2a)|(FORM.{4}ILBM))/s',
        $binary, $hits)) {
        return 'application/octet-stream';
    }

    // preg_match drops trailing unmatched groups, so the last index of $hits is the
    // number of the capture group that matched.
    return $type[count($hits) - 1];
}
2265 | |||
/**
 * Guess the mime type of a URL from its file extension.
 *
 * Everything from the last "?" onwards is ignored; the extension is the lower-cased
 * text after the last "." of what remains.
 *
 * @param string $source URL Source
 * @return string MimeType, "application/octet-stream" when there is no "." to go by.
 */
function getMimeTypeFromUrl($source) {
    $path = $source;

    $queryStart = strrpos($path, "?");
    if ($queryStart !== FALSE) {
        $path = substr($path, 0, $queryStart);
    }

    $lastDot = strrpos($path, ".");
    if ($lastDot === FALSE) {
        return "application/octet-stream";
    }

    $ext = strtolower(substr($path, $lastDot + 1));
    return $this->getMimeTypeFromExtension($ext);
}
2289 | |||
/**
 * Map an image file extension to its mime type.
 *
 * @param string $ext Extension, without the dot; matching is case sensitive.
 * @return string MimeType, "application/octet-stream" for anything not listed.
 */
function getMimeTypeFromExtension($ext) {
    if (in_array($ext, array("jpg", "jpe", "jpeg"))) {
        return 'image/jpeg';
    }
    if ($ext == "gif") {
        return 'image/gif';
    }
    if ($ext == "png") {
        return 'image/png';
    }
    if ($ext == "bmp") {
        return 'image/x-windows-bmp';
    }
    if (in_array($ext, array("tif", "tiff", "cpt"))) {
        return 'image/tiff';
    }
    if (in_array($ext, array("lbm", "ilbm"))) {
        return 'image/x-ilbm';
    }
    return "application/octet-stream";
}
2317 | |||
/**
 * Encode html code to use html entities, safeguarding it from potential character encoding problems.
 * This function is a bit different from the vanilla htmlentities function in that it does not encode html tags.
 *
 * The translation is a plain strtr() over the $this->html_encoding_characters table.
 *
 * @param string $string string to encode.
 * @return string the translated string.
 */
public function encodeHtml($string) {
    $encoded = strtr($string, $this->html_encoding_characters);
    return $encoded;
}
2334 | |||
/**
 * Helper function to create a DOM fragment holding the given markup.
 *
 * @author Adam Schmalhofer
 *
 * @param DOMDocument $dom    Document that will own the fragment.
 * @param string      $markup XML markup appended to the fragment.
 * @return DOMNode fragment in a node.
 */
protected function createDomFragment($dom, $markup) {
    $fragment = $dom->createDocumentFragment();
    $fragment->appendXML($markup);

    return $fragment;
}
2349 | |||
/**
 * Retrieve the list of files currently added to the book.
 * Each key is the filename used in the book; each value is the original filename,
 * which will be the same as the key for most entries.
 *
 * @return array file list
 */
function getFileList() {
    $files = $this->fileList;
    return $files;
}
2360 | |||
/**
 * Former path helper; calling it terminates the script with a deprecation message.
 *
 * @deprecated Use Zip::getRelativePath($relPath) instead.
 */
function relPath($relPath) {
    $message = "Function was deprecated, use Zip::getRelativePath(\$relPath); instead";
    exit($message);
}
2367 | |||
/**
 * Set the default chapter target size.
 * Anything below 10240 bytes is raised to 10240, as making the file smaller than
 * 10k is not a good idea.
 *
 * @param int $size segment size in bytes
 * @return void
 */
function setSplitSize($size) {
    $this->splitDefaultSize = ($size < 10240) ? 10240 : (int)$size;
}
2381 | |||
/**
 * Get the chapter target size currently configured.
 *
 * @return int segment size in bytes
 */
function getSplitSize() {
    $size = $this->splitDefaultSize;
    return $size;
}
2390 | |||
/**
 * Remove all non essential html tags and entities.
 *
 * Line breaks and closing p/div tags become newlines, all remaining tags are dropped,
 * and named entities are replaced through the global $htmlEntities table. What is left
 * is then made XML safe: "&", "<" and ">" are escaped, while an already escaped "&amp;"
 * and numeric character references such as "&#233;" are kept as they are.
 *
 * The escaping targets are written as explicit entity strings; as plain characters the
 * replacements would be no-ops and raw "&" / "<" would reach the generated XML.
 *
 * @global array $htmlEntities entity => UTF-8 map; skipped when it has not been loaded.
 * @param string $string
 * @return string with the stripped entities.
 */
function decodeHtmlEntities($string) {
    global $htmlEntities;

    $string = preg_replace('~\s*<br\s*/*\s*>\s*~i', "\n", $string);
    $string = preg_replace('~\s*</(p|div)\s*>\s*~i', "\n\n", $string);
    $string = preg_replace('~<[^>]*>~', '', $string);

    // strtr() needs a non-empty array; the table lives in a separately included file.
    if (is_array($htmlEntities) && !empty($htmlEntities)) {
        $string = strtr($string, $htmlEntities);
    }

    $string = str_replace('&', '&amp;', $string);
    // Undo the double escaping of ampersands that were already written as "&amp;".
    $string = str_replace('&amp;amp;', '&amp;', $string);
    // Restore numeric character references ("&#233;", "&#xE9;").
    $string = preg_replace('~&amp;(#x*[a-fA-F0-9]+;)~', '&\1', $string);
    $string = str_replace('<', '&lt;', $string);
    $string = str_replace('>', '&gt;', $string);

    return $string;
}
2415 | |||
/**
 * Simply remove all HTML tags, brute force and no finesse: every "<...>" run is deleted.
 *
 * @param string $string html
 * @return string
 */
function html2text($string) {
    $tagPattern = '~<[^>]*>~';
    $plainText = preg_replace($tagPattern, '', $string);
    return $plainText;
}
2425 | |||
/**
 * Return the log collected by this book's logger object.
 *
 * @return string
 */
function getLog() {
    $collected = $this->log->getLog();
    return $collected;
}
2432 | } | ||
diff --git a/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php new file mode 100644 index 00000000..1d44f238 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/EPubChapterSplitter.php | |||
@@ -0,0 +1,201 @@ | |||
1 | <?php | ||
/**
 * Split an HTML file into smaller html files, retaining the formatting and structure for the individual parts.
 * What this splitter does is using DOM to try and retain any formatting in the file, including rebuilding the DOM tree for subsequent parts.
 * Split size is considered max target size. The actual size is the result of an even split across the resulting files.
 *
 * @author A. Grandt <php@grandt.com>
 * @copyright 2009-2014 A. Grandt
 * @license GNU LGPL 2.1
 * @link http://www.phpclasses.org/package/6115
 * @link https://github.com/Grandt/PHPePub
 * @version 3.20
 */
class EPubChapterSplitter {
    const VERSION = 3.20;

    private $splitDefaultSize = 250000;
    private $bookVersion = EPub::BOOK_VERSION_EPUB2;

    /**
     * Set the ePub version the generated parts are meant for.
     *
     * @param string $bookVersion one of the EPub::BOOK_VERSION_* values; anything that
     *                            is not a string falls back to EPub::BOOK_VERSION_EPUB2.
     */
    function setVersion($bookVersion) {
        $this->bookVersion = is_string($bookVersion) ? trim($bookVersion) : EPub::BOOK_VERSION_EPUB2;
    }

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * @param int $size segment size in bytes
     * @return void
     */
    function setSplitSize($size) {
        $this->splitDefaultSize = (int)$size;
        if ($size < 10240) {
            $this->splitDefaultSize = 10240; // Making the file smaller than 10k is not a good idea.
        }
    }

    /**
     * Get the chapter target size.
     *
     * @return int segment size in bytes
     */
    function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PHP PCRE Regular Expression pattern as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, the matching will be for lines in the HTML starting with the string given
     *
     * When splitting on a search string, a part is keyed by the markup of the node that
     * started it. A part that was not started by a match (content before the first
     * match) is keyed by its numeric index.
     *
     * @param String $chapter XHTML file
     * @param Bool $splitOnSearchString Split on chapter boundaries, Splitting on search strings disables the split size check.
     * @param String $searchString Chapter string to search for can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts
     */
    function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $chapterData = array();
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        @$xmlDoc->loadHTML($chapter);

        $head = $xmlDoc->getElementsByTagName("head");
        $body = $xmlDoc->getElementsByTagName("body");

        // Nothing to split without body content; return the chapter untouched instead of
        // dereferencing a missing node further down.
        if ($body->length == 0 || $body->item(0)->firstChild === null) {
            return array($chapter);
        }
        // The head is optional in the parsed document; it is only copied when present.
        $headNode = ($head->length > 0) ? $head->item(0) : null;

        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = stripos($chapter, ">", $htmlPos);
        $newXML = substr($chapter, 0, $htmlEndPos+1) . "\n</html>";
        if (strpos(trim($newXML), "<?xml ") === FALSE) {
            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
        }
        $headerLength = strlen($newXML);

        $files = array();
        $chapterNames = array(); // part index => markup of the node that started the part
        $domDepth = 0;
        $domPath = array();
        $domClonedPath = array();

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $curParent = $curFile;
        $curSize = 0;

        $bodyLen = strlen($xmlDoc->saveXML($body->item(0)));
        $headLen = ($headNode !== null ? strlen($xmlDoc->saveXML($headNode)) : 0) + $headerLength;

        $partSize = $this->splitDefaultSize - $headLen;

        if ($bodyLen > $partSize) {
            $parts = ceil($bodyLen / $partSize);
            $partSize = ($bodyLen / $parts) - $headLen;
        }

        $node = $body->item(0)->firstChild;

        do {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            // A node that is too large for one part is descended into; the path of open
            // ancestors is remembered so it can be rebuilt at the top of the next part.
            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $domDepth++;

                $node = $node->firstChild;
            }

            $node2 = $node->nextSibling;

            if ($node != null && $node->nodeName != "#text") {
                $doSplit = false;
                if ($splitOnSearchString) {
                    $doSplit = preg_match($searchString, $nodeData) == 1;
                }

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;
                    if ($domDepth > 0) {
                        // Re-open the chain of ancestor elements in the new part.
                        foreach ($domClonedPath as $clonedAncestor) {
                            $newParent = $clonedAncestor->cloneNode(false);
                            $curParent->appendChild($newParent);
                            $curParent = $newParent;
                        }
                    }
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }

                if ($doSplit) {
                    // Record the name against the part the node actually ends up in, so
                    // names and parts stay aligned when the first part has no match.
                    $chapterNames[count($files) - 1] = trim($nodeData);
                }

                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $node2;
            while ($node == null && $domDepth > 0) {
                $domDepth--;
                $node = end($domPath)->nextSibling;
                array_pop($domPath);
                array_pop($domClonedPath);
                $curParent = $curParent->parentNode;
            }
        } while ($node != null);

        $curFile = null;
        $curSize = 0;

        // HTML documents report no XML encoding; cast so the constructor gets a string.
        $encoding = (string) $xmlDoc->xmlEncoding;

        $xml = new DOMDocument('1.0', $encoding);
        $xml->lookupPrefix("http://www.w3.org/1999/xhtml");
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        for ($idx = 0; $idx < count($files); $idx++) {
            $xml2Doc = new DOMDocument('1.0', $encoding);
            $xml2Doc->lookupPrefix("http://www.w3.org/1999/xhtml");
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);
            if ($headNode !== null) {
                $html->appendChild($xml2Doc->importNode($headNode, true));
            }
            $partBody = $xml2Doc->createElement("body");
            $html->appendChild($partBody);
            $partBody->appendChild($xml2Doc->importNode($files[$idx], true));

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());

            $doc = $xml->saveXML();

            if ($this->bookVersion === EPub::BOOK_VERSION_EPUB3) {
                $doc = preg_replace('#^\s*<!DOCTYPE\ .+?>\s*#im', '', $doc);
            }

            $key = ($splitOnSearchString && isset($chapterNames[$idx])) ? $chapterNames[$idx] : $idx;
            $chapterData[$key] = $doc;
        }

        return $chapterData;
    }
}
201 | ?> | ||
diff --git a/inc/3rdparty/libraries/PHPePub/Logger.php b/inc/3rdparty/libraries/PHPePub/Logger.php new file mode 100644 index 00000000..314019cb --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/Logger.php | |||
@@ -0,0 +1,92 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Simple log line aggregator. | ||
4 | * | ||
5 | * @author A. Grandt <php@grandt.com> | ||
6 | * @copyright 2012-2013 A. Grandt | ||
7 | * @license GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
8 | * @version 1.00 | ||
9 | */ | ||
/**
 * Simple log line aggregator.
 *
 * Collects timestamped lines into one string (an HTML fragment starting with
 * an <h1> title and an opening <pre>). Nothing is recorded unless logging was
 * enabled through the constructor.
 */
class Logger {
    const VERSION = 1.00;

    private $log = "";            // Accumulated log text.
    private $tStart;              // gettimeofday() result captured by start().
    private $tLast;               // gettimeofday() result of the most recent log line.
    private $name = NULL;         // Prefix put in front of every line ("" or "<name> : ").
    private $isLogging = FALSE;   // When FALSE every logging method is a no-op.
    private $isDebugging = FALSE; // When TRUE each line is also echoed as it is logged.

    /**
     * Class constructor.
     *
     * @param string $name      Optional label; stored as "<name> : " and prefixed to each line.
     * @param bool   $isLogging Enable log collection. Default FALSE.
     */
    function __construct($name = NULL, $isLogging = FALSE) {
        $this->name = ($name === NULL) ? "" : $name . " : ";
        $this->isLogging = $isLogging;
        $this->start();
    }

    /**
     * Class destructor, drops the collected log text.
     *
     * @return void
     */
    function __destruct() {
        unset($this->log);
    }

    /**
     * Reset the timers, write the log header and emit the initial "Start" line.
     * Does nothing when logging is disabled.
     */
    function start() {
        if (!$this->isLogging) {
            return;
        }

        $this->tStart = gettimeofday();
        $this->tLast = $this->tStart;

        $startedAt = gmdate("D, d M Y H:i:s T", $this->tStart['sec']);
        $this->log = "<h1>Log: " . $this->name . "</h1>\n<pre>Started: " . $startedAt . "\n Δ Start ; Δ Last ;";
        $this->logLine("Start");
    }

    /**
     * Log which of the optional facilities (curl, gd, exif, file_get_contents,
     * allow_url_fopen) are available in the running PHP.
     */
    function dumpInstalledModules() {
        if (!$this->isLogging) {
            return;
        }

        $hasFileGetContents = function_exists('file_get_contents');
        $report = array(
            "isCurlInstalled...............: " => extension_loaded('curl') && function_exists('curl_version'),
            "isGdInstalled.................: " => extension_loaded('gd') && function_exists('gd_info'),
            "isExifInstalled...............: " => extension_loaded('exif') && function_exists('exif_imagetype'),
            "isFileGetContentsInstalled....: " => $hasFileGetContents,
            "isFileGetContentsExtInstalled.: " => $hasFileGetContents && ini_get('allow_url_fopen'),
        );

        foreach ($report as $label => $available) {
            $this->logLine($label . ($available ? "Yes" : "No"));
        }
    }

    /**
     * Append one line, prefixed with the seconds elapsed since start() and
     * since the previous line.
     *
     * @param string $line Text to log.
     */
    function logLine($line) {
        if (!$this->isLogging) {
            return;
        }

        $now = gettimeofday();
        $sinceStart = $this->asSeconds($now) - $this->asSeconds($this->tStart);
        $sinceLast = $this->asSeconds($now) - $this->asSeconds($this->tLast);

        $entry = sprintf("\n+%08.04f; +%08.04f; ", $sinceStart, $sinceLast) . $this->name . $line;
        $this->log .= $entry;
        $this->tLast = $now;

        if ($this->isDebugging) {
            echo "<pre>" . $entry . "\n</pre>\n";
        }
    }

    /**
     * @return string Everything logged so far ("" when logging is disabled).
     */
    function getLog() {
        return $this->log;
    }

    /**
     * Convert a gettimeofday() array to float seconds, with the microsecond
     * part truncated to 1/10000 s.
     *
     * @param array $tod Array with 'sec' and 'usec' keys.
     * @return float
     */
    private function asSeconds($tod) {
        return $tod['sec'] + (((int)($tod['usec'] / 100)) / 10000);
    }
}
92 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/PHPePub/Zip.php b/inc/3rdparty/libraries/PHPePub/Zip.php new file mode 100644 index 00000000..01e03566 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/Zip.php | |||
@@ -0,0 +1,818 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Class to create and manage a Zip file. | ||
4 | * | ||
5 | * Initially inspired by CreateZipFile by Rochak Chauhan www.rochakchauhan.com (http://www.phpclasses.org/browse/package/2322.html) | ||
6 | * and | ||
7 | * http://www.pkware.com/documents/casestudies/APPNOTE.TXT Zip file specification. | ||
8 | * | ||
9 | * License: GNU LGPL, Attribution required for commercial implementations, requested for everything else. | ||
10 | * | ||
11 | * @author A. Grandt <php@grandt.com> | ||
12 | * @copyright 2009-2014 A. Grandt | ||
13 | * @license GNU LGPL 2.1 | ||
14 | * @link http://www.phpclasses.org/package/6110 | ||
15 | * @link https://github.com/Grandt/PHPZip | ||
16 | * @version 1.60 | ||
17 | */ | ||
18 | class Zip { | ||
19 | const VERSION = 1.60; | ||
20 | |||
21 | const ZIP_LOCAL_FILE_HEADER = "\x50\x4b\x03\x04"; // Local file header signature | ||
22 | const ZIP_CENTRAL_FILE_HEADER = "\x50\x4b\x01\x02"; // Central file header signature | ||
23 | const ZIP_END_OF_CENTRAL_DIRECTORY = "\x50\x4b\x05\x06\x00\x00\x00\x00"; //end of Central directory record | ||
24 | |||
25 | const EXT_FILE_ATTR_DIR = 010173200020; // Permission 755 drwxr-xr-x = (((S_IFDIR | 0755) << 16) | S_DOS_D); | ||
26 | const EXT_FILE_ATTR_FILE = 020151000040; // Permission 644 -rw-r--r-- = (((S_IFREG | 0644) << 16) | S_DOS_A); | ||
27 | |||
28 | const ATTR_VERSION_TO_EXTRACT = "\x14\x00"; // Version needed to extract | ||
29 | const ATTR_MADE_BY_VERSION = "\x1E\x03"; // Made By Version | ||
30 | |||
31 | // Unix file types | ||
32 | const S_IFIFO = 0010000; // named pipe (fifo) | ||
33 | const S_IFCHR = 0020000; // character special | ||
34 | const S_IFDIR = 0040000; // directory | ||
35 | const S_IFBLK = 0060000; // block special | ||
36 | const S_IFREG = 0100000; // regular | ||
37 | const S_IFLNK = 0120000; // symbolic link | ||
38 | const S_IFSOCK = 0140000; // socket | ||
39 | |||
40 | // setuid/setgid/sticky bits, the same as for chmod: | ||
41 | |||
42 | const S_ISUID = 0004000; // set user id on execution | ||
43 | const S_ISGID = 0002000; // set group id on execution | ||
44 | const S_ISTXT = 0001000; // sticky bit | ||
45 | |||
46 | // And of course, the other 12 bits are for the permissions, the same as for chmod: | ||
47 | // When adding these up, you can also just write the permissions as a single octal number | ||
48 | // ie. 0755. The leading 0 specifies octal notation. | ||
49 | const S_IRWXU = 0000700; // RWX mask for owner | ||
50 | const S_IRUSR = 0000400; // R for owner | ||
51 | const S_IWUSR = 0000200; // W for owner | ||
52 | const S_IXUSR = 0000100; // X for owner | ||
53 | const S_IRWXG = 0000070; // RWX mask for group | ||
54 | const S_IRGRP = 0000040; // R for group | ||
55 | const S_IWGRP = 0000020; // W for group | ||
56 | const S_IXGRP = 0000010; // X for group | ||
57 | const S_IRWXO = 0000007; // RWX mask for other | ||
58 | const S_IROTH = 0000004; // R for other | ||
59 | const S_IWOTH = 0000002; // W for other | ||
60 | const S_IXOTH = 0000001; // X for other | ||
61 | const S_ISVTX = 0001000; // save swapped text even after use | ||
62 | |||
63 | // Filetype, sticky and permissions are added up, and shifted 16 bits left BEFORE adding the DOS flags. | ||
64 | |||
65 | // DOS file type flags, we really only use the S_DOS_D flag. | ||
66 | |||
67 | const S_DOS_A = 0000040; // DOS flag for Archive | ||
68 | const S_DOS_D = 0000020; // DOS flag for Directory | ||
69 | const S_DOS_V = 0000010; // DOS flag for Volume | ||
70 | const S_DOS_S = 0000004; // DOS flag for System | ||
71 | const S_DOS_H = 0000002; // DOS flag for Hidden | ||
72 | const S_DOS_R = 0000001; // DOS flag for Read Only | ||
73 | |||
74 | private $zipMemoryThreshold = 1048576; // Autocreate tempfile if the zip data exceeds 1048576 bytes (1 MB) | ||
75 | |||
76 | private $zipData = NULL; | ||
77 | private $zipFile = NULL; | ||
78 | private $zipComment = NULL; | ||
79 | private $cdRec = array(); // central directory | ||
80 | private $offset = 0; | ||
81 | private $isFinalized = FALSE; | ||
82 | private $addExtraField = TRUE; | ||
83 | |||
84 | private $streamChunkSize = 65536; | ||
85 | private $streamFilePath = NULL; | ||
86 | private $streamTimestamp = NULL; | ||
87 | private $streamFileComment = NULL; | ||
88 | private $streamFile = NULL; | ||
89 | private $streamData = NULL; | ||
90 | private $streamFileLength = 0; | ||
91 | private $streamExtFileAttr = null; | ||
92 | |||
/**
 * Constructor.
 *
 * Picks the initial backing store for the archive: a temporary file handle
 * or an in-memory string.
 *
 * @param boolean $useZipFile Write temp zip data to tempFile? Default FALSE
 */
function __construct($useZipFile = FALSE) {
    if (!$useZipFile) {
        // In-memory mode: start with an empty buffer.
        $this->zipData = "";
        return;
    }

    $this->zipFile = tmpfile();
}
105 | |||
/**
 * Destructor: closes the backing file handle if one is open and drops the
 * in-memory buffer.
 */
function __destruct() {
    $handle = $this->zipFile;
    if (is_resource($handle)) {
        fclose($handle);
    }

    $this->zipData = NULL;
}
112 | |||
/**
 * Enable or disable the extra fields written with every entry.
 *
 * The extra fields carry Unix time codes needed for compatibility with the
 * default Mac zip archive tool. They are on by default.
 *
 * @param bool $setExtraField Only a strict TRUE (the default) enables the extra fields; any other value disables them.
 */
function setExtraField($setExtraField = TRUE) {
    $enable = ($setExtraField === TRUE);
    $this->addExtraField = $enable;
}
122 | |||
/**
 * Set the archive-level comment.
 *
 * @param string $newComment New comment. NULL to clear.
 * @return bool TRUE when the comment was stored, FALSE if the archive is already finalized.
 */
public function setComment($newComment = NULL) {
    $stillOpen = !$this->isFinalized;

    if ($stillOpen) {
        $this->zipComment = $newComment;
    }

    return $stillOpen;
}
137 | |||
/**
 * Set zip file to write zip data to.
 *
 * All data written so far (whether it lives in memory or in a previous file
 * handle) is copied into the new file, and all future writes go there.
 * This can be used at any time, even after the archive has been finalized.
 * Any previous file handle is closed.
 * Warning: If the given file already exists, it will be overwritten.
 *
 * @param string $fileName Path of the file to create.
 * @return bool TRUE on success; FALSE when an existing file could not be removed
 *              or the new file could not be opened (the current data is then left untouched).
 */
public function setZipFile($fileName) {
    // Mode "x+b" refuses to open an existing file, so a stale one must be removed first.
    if (is_file($fileName) && !unlink($fileName)) {
        return FALSE;
    }

    $fd = fopen($fileName, "x+b");
    if ($fd === FALSE) {
        // Bail out before touching the current backing store; writing to an
        // invalid handle would lose the archive data collected so far.
        return FALSE;
    }

    if (is_resource($this->zipFile)) {
        // Copy the old backing file over in chunks, then release it.
        rewind($this->zipFile);
        while (!feof($this->zipFile)) {
            fwrite($fd, fread($this->zipFile, $this->streamChunkSize));
        }

        fclose($this->zipFile);
    } else {
        fwrite($fd, (string)$this->zipData);
        $this->zipData = NULL;
    }
    $this->zipFile = $fd;

    return TRUE;
}
167 | |||
/**
 * Add an empty directory entry to the zip archive.
 *
 * Backslashes in the path are converted to forward slashes and trailing
 * slashes are trimmed before a single "/" is appended.
 *
 * @param string $directoryPath Directory path and name to be added to the archive.
 * @param int    $timestamp     (Optional) Timestamp for the added directory, if omitted or set to 0, the current time will be used.
 * @param string $fileComment   (Optional) Comment to be added to the archive for this directory. To use fileComment, timestamp must be given.
 * @param int    $extFileAttr   (Optional) External file attributes recorded for the entry.
 * @return bool FALSE if the archive is finalized or the normalised path is empty, TRUE otherwise.
 */
public function addDirectory($directoryPath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_DIR) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $entryName = rtrim(str_replace("\\", "/", $directoryPath), "/");
    if (strlen($entryName) <= 0) {
        return FALSE;
    }

    // Directory entries are stored (method 0) with zero CRC and zero sizes.
    $this->buildZipEntry($entryName . '/', $fileComment, "\x00\x00", "\x00\x00", $timestamp, "\x00\x00\x00\x00", 0, 0, $extFileAttr);

    return TRUE;
}
191 | |||
/**
 * Add a file to the archive at the specified location and file name.
 *
 * If $data is a stream resource the call is delegated to addLargeFile() and
 * its result is returned. Otherwise $data is deflated in memory; when that
 * does not make it smaller (or compression is disabled or fails) the data is
 * stored uncompressed.
 *
 * @param string $data        File data (or a stream resource to read it from).
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used.
 * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given.
 * @param bool   $compress    (Optional) Compress file, if set to FALSE the file will only be stored. Default TRUE.
 * @param int    $extFileAttr (Optional) External file attributes recorded for the entry.
 * @return bool TRUE when the entry was added, FALSE if the archive is finalized.
 */
public function addFile($data, $filePath, $timestamp = 0, $fileComment = NULL, $compress = TRUE, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    if (is_resource($data) && get_resource_type($data) == "stream") {
        // Report the outcome of the delegated call; the previous code returned
        // FALSE here even though the stream had been added.
        return $this->addLargeFile($data, $filePath, $timestamp, $fileComment, $extFileAttr);
    }

    $gzData = "";
    $gzType = "\x08\x00"; // Compression type 8 = deflate
    $gpFlags = "\x00\x00"; // General Purpose bit flags for compression type 8 it is: 0=Normal, 1=Maximum, 2=Fast, 3=super fast compression.
    $dataLength = strlen($data);
    $fileCRC32 = pack("V", crc32($data));

    // Default to "no gain" so that every path without usable compressed data falls back to storing.
    $gzLength = $dataLength;
    if ($compress) {
        $gzTmp = gzcompress($data);
        if ($gzTmp !== FALSE) {
            // gzcompress adds a 2 byte header and a 4 byte checksum we can't use; keep only the raw deflate stream.
            $gzData = substr($gzTmp, 2, -4);
            $gzLength = strlen($gzData);
        }
    }

    if ($gzLength >= $dataLength) {
        $gzLength = $dataLength;
        $gzData = $data;
        $gzType = "\x00\x00"; // Compression type 0 = stored
        $gpFlags = "\x00\x00"; // Compression type 0 = stored
    }

    // Spill the in-memory archive once it would grow past the memory threshold.
    if (!is_resource($this->zipFile) && ($this->offset + $gzLength) > $this->zipMemoryThreshold) {
        $this->zipflush();
    }

    $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr);

    $this->zipwrite($gzData);

    return TRUE;
}
245 | |||
/**
 * Add the content of a directory.
 *
 * $realPath is handed to DirectoryIterator, so it is expected to be a directory.
 *
 * @author Adam Schmalhofer <Adam.Schmalhofer@gmx.de>
 * @author A. Grandt
 *
 * @param string $realPath                Path on the file system.
 * @param string $zipPath                 Filepath and name to be used in the archive.
 * @param bool   $recursive               Add content recursively, default is TRUE. When not TRUE, sub-directories
 *                                        are added as empty directory entries only.
 * @param bool   $followSymlinks          Follow and add symbolic links, if they are accessible, default is TRUE.
 * @param array  &$addedFiles             Reference to the added files, this is used to prevent duplicates, default is an empty array.
 *                                        If you start the function by passing an array, the array will be populated with the
 *                                        realPath and zipPath key/value pairs added to the archive by the function.
 * @param bool   $overrideFilePermissions Force the use of the file/dir permissions set in the $extDirAttr
 *                                        and $extFileAttr parameters.
 * @param int    $extDirAttr              Permissions for directories.
 * @param int    $extFileAttr             Permissions for files.
 */
public function addDirectoryContent($realPath, $zipPath, $recursive = TRUE, $followSymlinks = TRUE, &$addedFiles = array(),
        $overrideFilePermissions = FALSE, $extDirAttr = self::EXT_FILE_ATTR_DIR, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    // Skip missing paths and anything whose resolved path was already added.
    if (!file_exists($realPath) || isset($addedFiles[realpath($realPath)])) {
        return;
    }

    if (is_dir($realPath)) {
        $dirAttr = $overrideFilePermissions ? $extDirAttr : self::getFileExtAttr($realPath);
        $this->addDirectory($zipPath, 0, null, $dirAttr);
    }

    $addedFiles[realpath($realPath)] = $zipPath;

    $iter = new DirectoryIterator($realPath);
    foreach ($iter as $file) {
        if ($file->isDot()) {
            continue;
        }
        $newRealPath = $file->getPathname();
        $newZipPath = self::pathJoin($zipPath, $file->getFilename());

        if (!file_exists($newRealPath) || ($followSymlinks !== TRUE && is_link($newRealPath))) {
            continue;
        }

        if ($file->isFile()) {
            $addedFiles[realpath($newRealPath)] = $newZipPath;
            $fileAttr = $overrideFilePermissions ? $extFileAttr : self::getFileExtAttr($newRealPath);
            $this->addLargeFile($newRealPath, $newZipPath, 0, null, $fileAttr);
        } else if ($recursive === TRUE) {
            $this->addDirectoryContent($newRealPath, $newZipPath, $recursive, $followSymlinks, $addedFiles, $overrideFilePermissions, $extDirAttr, $extFileAttr);
        } else {
            // Non-recursive: record the sub-directory itself as an empty entry.
            // This must use the child's archive path; using $zipPath here would
            // add the parent directory a second time under the child's attributes.
            $subDirAttr = $overrideFilePermissions ? $extDirAttr : self::getFileExtAttr($newRealPath);
            $this->addDirectory($newZipPath, 0, null, $subDirAttr);
        }
    }
}
306 | |||
/**
 * Add a file to the archive at the specified location and file name.
 *
 * Accepts either the path of an existing file or an open stream resource.
 * A stream is read to its end in chunks of $streamChunkSize bytes.
 *
 * @param string $dataFile    File name/path, or a stream resource.
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used.
 * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given.
 * @param int    $extFileAttr (Optional) External file attributes recorded for the entry.
 * @return bool TRUE when the data was added; FALSE if the archive is finalized, the input is
 *              neither an existing file nor a stream, or the entry could not be processed.
 */
public function addLargeFile($dataFile, $filePath, $timestamp = 0, $fileComment = NULL, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    if (is_string($dataFile) && is_file($dataFile)) {
        // Anything other than an explicit FALSE from processFile() counts as success.
        return $this->processFile($dataFile, $filePath, $timestamp, $fileComment, $extFileAttr) !== FALSE;
    }

    if (is_resource($dataFile) && get_resource_type($dataFile) == "stream") {
        if (!$this->openStream($filePath, $timestamp, $fileComment, $extFileAttr)) {
            return FALSE;
        }

        while (!feof($dataFile)) {
            $this->addStreamData(fread($dataFile, $this->streamChunkSize));
        }
        // closeStream() takes no parameters.
        $this->closeStream();

        return TRUE;
    }

    // Unsupported input: nothing was added, so do not claim success.
    return FALSE;
}
335 | |||
/**
 * Create a stream to be used for large entries.
 *
 * Data passed to addStreamData() is spooled into a temporary file until
 * closeStream() is called. A stream that is still open is closed first.
 *
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the added file, if omitted or set to 0, the current time will be used.
 * @param string $fileComment (Optional) Comment to be added to the archive for this file. To use fileComment, timestamp must be given.
 * @param int    $extFileAttr (Optional) External file attributes recorded for the entry.
 * @return bool FALSE if the archive is finalized, TRUE otherwise.
 */
public function openStream($filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if (!function_exists('sys_get_temp_dir')) {
        die ("ERROR: Zip " . self::VERSION . " requires PHP version 5.2.1 or above if large files are used.");
    }

    if ($this->isFinalized) {
        return FALSE;
    }

    $this->zipflush();

    $hasOpenStream = strlen($this->streamFilePath) > 0;
    if ($hasOpenStream) {
        $this->closeStream();
    }

    // Spool file that receives the stream data.
    $this->streamFile = tempnam(sys_get_temp_dir(), 'Zip');
    $this->streamData = fopen($this->streamFile, "wb");

    // Remember the entry metadata until closeStream() turns the spool into an entry.
    $this->streamFilePath = $filePath;
    $this->streamTimestamp = $timestamp;
    $this->streamFileComment = $fileComment;
    $this->streamFileLength = 0;
    $this->streamExtFileAttr = $extFileAttr;

    return TRUE;
}
370 | |||
/**
 * Add data to the open stream.
 *
 * Terminates the script if fewer bytes were written than were supplied.
 *
 * @param string $data
 * @return mixed length in bytes added or FALSE if the archive is finalized or there is no open stream.
 */
public function addStreamData($data) {
    $noOpenStream = (strlen($this->streamFilePath) == 0);
    if ($this->isFinalized || $noOpenStream) {
        return FALSE;
    }

    $expected = strlen($data);
    $length = fwrite($this->streamData, $data, $expected);
    if ($length != $expected) {
        die ("<p>Length mismatch</p>\n");
    }

    $this->streamFileLength += $length;

    return $length;
}
390 | |||
/**
 * Close the current stream.
 *
 * The spooled data is added to the archive as one entry, the stream
 * bookkeeping is reset and the spool file is deleted.
 *
 * @return bool FALSE if the archive is finalized or no stream is open, TRUE otherwise.
 */
public function closeStream() {
    $noOpenStream = (strlen($this->streamFilePath) == 0);
    if ($this->isFinalized || $noOpenStream) {
        return FALSE;
    }

    // Make sure everything is on disk before the spool file is read back.
    fflush($this->streamData);
    fclose($this->streamData);

    $this->processFile(
        $this->streamFile,
        $this->streamFilePath,
        $this->streamTimestamp,
        $this->streamFileComment,
        $this->streamExtFileAttr
    );

    $this->streamData = null;
    $this->streamFilePath = null;
    $this->streamTimestamp = null;
    $this->streamFileComment = null;
    $this->streamFileLength = 0;
    $this->streamExtFileAttr = null;

    // Windows is a little slow at times, so the spool file is removed only after the reset above.
    unlink($this->streamFile);
    $this->streamFile = null;

    return TRUE;
}
420 | |||
/**
 * Compress a file from disk via ZipArchive and splice the result into this archive.
 *
 * The file is packed into a temporary single-entry archive under the fixed
 * name "file". The flags, method, CRC and sizes are read from that archive's
 * local header, an entry of our own is written with buildZipEntry(), and the
 * compressed payload is copied across in chunks.
 *
 * @param string $dataFile    Path of the file to add.
 * @param string $filePath    Filepath and name to be used in the archive.
 * @param int    $timestamp   (Optional) Timestamp for the entry; 0 means the current time.
 * @param string $fileComment (Optional) Comment for the entry.
 * @param int    $extFileAttr (Optional) External file attributes recorded for the entry.
 * @return bool TRUE when the entry was written, FALSE if the archive is finalized or the
 *              temporary archive could not be created or read.
 */
private function processFile($dataFile, $filePath, $timestamp = 0, $fileComment = null, $extFileAttr = self::EXT_FILE_ATTR_FILE) {
    if ($this->isFinalized) {
        return FALSE;
    }

    $tempzip = tempnam(sys_get_temp_dir(), 'ZipStream');
    if ($tempzip === FALSE) {
        return FALSE;
    }

    // tempnam() leaves an empty file behind, which is not a valid archive.
    // CREATE | OVERWRITE tells ZipArchive to start a fresh archive in its place.
    $zip = new ZipArchive;
    $packed = ($zip->open($tempzip, ZipArchive::CREATE | ZipArchive::OVERWRITE) === TRUE);
    if ($packed) {
        $packed = $zip->addFile($dataFile, 'file');
        // Always close; the archive is only written to disk at this point.
        $packed = $zip->close() && $packed;
    }

    $file_handle = $packed ? fopen($tempzip, "rb") : FALSE;
    if ($file_handle === FALSE) {
        // Do not emit an entry built from a missing or half-written archive.
        if (is_file($tempzip)) {
            unlink($tempzip);
        }
        return FALSE;
    }

    $stats = fstat($file_handle);
    // Trailing bytes that are not payload: 46-byte central header + 4-byte name "file"
    // + 22-byte end-of-central-directory record (assumes no extra fields or comments).
    $eof = $stats['size']-72;

    // Local header layout: signature(4) version(2) | flags(2) method(2) time+date(4) crc(4) csize(4) usize(4).
    fseek($file_handle, 6);

    $gpFlags = fread($file_handle, 2);
    $gzType = fread($file_handle, 2);
    fread($file_handle, 4); // skip DOS time and date, we write our own
    $fileCRC32 = fread($file_handle, 4);
    $v = unpack("Vval", fread($file_handle, 4));
    $gzLength = $v['val'];
    $v = unpack("Vval", fread($file_handle, 4));
    $dataLength = $v['val'];

    $this->buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr);

    // Payload starts after the 30-byte local header and the 4-byte name "file".
    fseek($file_handle, 34);
    $pos = 34;

    while (!feof($file_handle) && $pos < $eof) {
        $datalen = $this->streamChunkSize;
        if ($pos + $this->streamChunkSize > $eof) {
            $datalen = $eof-$pos;
        }
        $data = fread($file_handle, $datalen);
        $pos += $datalen;

        $this->zipwrite($data);
    }

    fclose($file_handle);

    unlink($tempzip);

    return TRUE;
}
469 | |||
/**
 * Close the archive.
 *
 * Closes any open stream, then writes the central directory, the end of
 * central directory record and the archive comment.
 * A closed archive can no longer have new files added to it.
 *
 * @return bool TRUE if the archive was finalized by this call, FALSE if it already was.
 */
public function finalize() {
    if ($this->isFinalized) {
        return FALSE;
    }

    if (strlen($this->streamFilePath) > 0) {
        $this->closeStream();
    }

    $cd = implode("", $this->cdRec);
    $entryCount = pack("v", sizeof($this->cdRec));

    if (empty($this->zipComment)) {
        $commentPart = "\x00\x00";
    } else {
        $commentPart = pack("v", strlen($this->zipComment)) . $this->zipComment;
    }

    // The entry count is written twice, followed by the directory size and its offset.
    $tail = $cd
        . self::ZIP_END_OF_CENTRAL_DIRECTORY
        . $entryCount . $entryCount
        . pack("VV", strlen($cd), $this->offset)
        . $commentPart;

    $this->zipwrite($tail);

    $this->isFinalized = TRUE;
    $this->cdRec = NULL;

    return TRUE;
}
502 | |||
/**
 * Get the handle resource for the archive zip file.
 * If the zip hasn't been finalized yet, this will cause it to become finalized.
 *
 * @return resource Zip file handle, rewound to the start.
 */
public function getZipFile() {
    if ($this->isFinalized === FALSE || !$this->isFinalized) {
        $this->finalize();
    }

    $this->zipflush();
    rewind($this->zipFile);

    return $this->zipFile;
}
520 | |||
/**
 * Get the zip file contents.
 * If the zip hasn't been finalized yet, this will cause it to become finalized.
 *
 * @return string Zip data, taken from the in-memory buffer or read back from the backing file.
 */
public function getZipData() {
    if (!$this->isFinalized) {
        $this->finalize();
    }

    if (is_resource($this->zipFile)) {
        rewind($this->zipFile);
        $info = fstat($this->zipFile);

        return fread($this->zipFile, $info['size']);
    }

    return $this->zipData;
}
539 | |||
/**
 * Send the archive as a zip download.
 *
 * Finalizes the archive if needed. Terminates the script with an HTML error
 * message if headers were already sent or the output buffer is not empty.
 *
 * @param String $fileName     The name of the Zip archive, in ISO-8859-1 (or ASCII) encoding, ie. "archive.zip". Optional, defaults to NULL, which means that no ISO-8859-1 encoded file name will be specified.
 * @param String $contentType  Content mime type. Optional, defaults to "application/zip".
 * @param String $utf8FileName The name of the Zip archive, in UTF-8 encoding. Optional, defaults to NULL, which means that no UTF-8 encoded file name will be specified.
 * @param bool   $inline       Use Content-Disposition with "inline" instead of "attachment". Optional, defaults to FALSE.
 * @return bool TRUE once the archive has been sent.
 */
function sendZip($fileName = null, $contentType = "application/zip", $utf8FileName = null, $inline = false) {
    if (!$this->isFinalized) {
        $this->finalize();
    }

    $headerFile = null;
    $headerLine = null;
    if (headers_sent($headerFile, $headerLine)) {
        die("<p><strong>Error:</strong> Unable to send file $fileName. HTML Headers have already been sent from <strong>$headerFile</strong> in line <strong>$headerLine</strong></p>");
    }

    // Anything already buffered would be prepended to the archive bytes and corrupt the download.
    $buffered = ob_get_contents();
    if ($buffered !== FALSE && $buffered != '') {
        die("\n<p><strong>Error:</strong> Unable to send file <strong>$fileName</strong>. Output buffer contains the following text (typically warnings or errors):<br>" . htmlentities($buffered) . "</p>");
    }

    // Transparent output compression would invalidate the Content-Length sent below.
    if (ini_get('zlib.output_compression')) {
        ini_set('zlib.output_compression', 'Off');
    }

    header("Pragma: public");
    header("Last-Modified: " . gmdate("D, d M Y H:i:s T"));
    header("Expires: 0");
    header("Accept-Ranges: bytes");
    header("Connection: close");
    header("Content-Type: " . $contentType);

    // The disposition type for downloads is "attachment"; "attached" is not a valid token.
    $cd = "Content-Disposition: " . ($inline ? "inline" : "attachment");
    if ($fileName) {
        $cd .= '; filename="' . $fileName . '"';
    }
    if ($utf8FileName) {
        $cd .= "; filename*=UTF-8''" . rawurlencode($utf8FileName);
    }
    header($cd);
    header("Content-Length: ". $this->getArchiveSize());

    if (!is_resource($this->zipFile)) {
        echo $this->zipData;
    } else {
        rewind($this->zipFile);

        while (!feof($this->zipFile)) {
            echo fread($this->zipFile, $this->streamChunkSize);
        }
    }

    return TRUE;
}
597 | |||
/**
 * Return the current size of the archive.
 *
 * @return int Size in bytes: the size of the backing file when one is open,
 *             otherwise the length of the in-memory buffer.
 */
public function getArchiveSize() {
    if (is_resource($this->zipFile)) {
        $info = fstat($this->zipFile);

        return $info['size'];
    }

    return strlen($this->zipData);
}
611 | |||
/**
 * Calculate the packed DOS date and time used in the zip entries.
 *
 * The timestamp is broken down in UTC. The default timezone is switched
 * to UTC for the calculation and restored afterwards.
 *
 * @param int $timestamp Unix timestamp; 0 means the current time.
 * @return string 4 bytes (little-endian): DOS date in the high 16 bits, DOS time in the low 16 bits.
 *                All zero bytes for years before 1980.
 */
private function getDosTime($timestamp = 0) {
    $timestamp = (int)$timestamp;

    $previousTZ = @date_default_timezone_get();
    date_default_timezone_set('UTC');
    $parts = ($timestamp == 0 ? getdate() : getdate($timestamp));
    date_default_timezone_set($previousTZ);

    if ($parts["year"] < 1980) {
        return "\x00\x00\x00\x00";
    }

    // Date: day (5 bits), month (4 bits), years since 1980 (7 bits).
    $dosDate = $parts["mday"] + ($parts["mon"] << 5) + (($parts["year"]-1980) << 9);
    // Time: seconds/2 (5 bits), minutes (6 bits), hours (5 bits).
    $dosClock = ($parts["seconds"] >> 1) + ($parts["minutes"] << 5) + ($parts["hours"] << 11);

    return pack("V", ($dosDate << 16) | $dosClock);
}
630 | |||
/**
 * Write the local file header for one archive entry and queue the matching
 * central directory record.
 *
 * The local header goes out through zipwrite(); the central directory record
 * is appended to $this->cdRec, and $this->offset is advanced by the size of
 * the local header plus $gzLength.
 *
 * @param string $filePath    Entry name; backslashes are converted to forward slashes.
 * @param string $fileComment Entry comment, stored in the central directory record only.
 * @param string $gpFlags     General purpose flag field; anything that is not exactly 2 bytes is replaced by "\x00\x00".
 * @param string $gzType      Compression method field, concatenated into the header as-is.
 * @param int    $timestamp   Unix timestamp of the entry; 0 means the current time.
 * @param string $fileCRC32   CRC-32 field, concatenated into the header as-is.
 * @param int    $gzLength    Compressed size in bytes.
 * @param int    $dataLength  Uncompressed size in bytes.
 * @param int    $extFileAttr Use self::EXT_FILE_ATTR_FILE for files, self::EXT_FILE_ATTR_DIR for Directories.
 */
private function buildZipEntry($filePath, $fileComment, $gpFlags, $gzType, $timestamp, $fileCRC32, $gzLength, $dataLength, $extFileAttr) {
    $filePath = str_replace("\\", "/", $filePath);
    $hasComment = !empty($fileComment);
    $fileCommentLength = ($hasComment ? strlen($fileComment) : 0);

    $timestamp = (int)$timestamp;
    if ($timestamp == 0) {
        $timestamp = time();
    }
    $dosTime = $this->getDosTime($timestamp);
    $tsPack = pack("V", $timestamp);

    // "ux" extra field block, appended verbatim to both records.
    $ux = "\x75\x78\x0B\x00\x01\x04\xE8\x03\x00\x00\x04\x00\x00\x00\x00";

    if (!isset($gpFlags) || strlen($gpFlags) != 2) {
        $gpFlags = "\x00\x00";
    }

    // Set bit 11 of the flags when the name or the comment is UTF-8 that is not plain ASCII.
    $isFileUTF8 = mb_check_encoding($filePath, "UTF-8") && !mb_check_encoding($filePath, "ASCII");
    $isCommentUTF8 = $hasComment && mb_check_encoding($fileComment, "UTF-8") && !mb_check_encoding($fileComment, "ASCII");
    if ($isFileUTF8 || $isCommentUTF8) {
        $gpFlagsV = unpack("vflags", $gpFlags);
        $flag = (isset($gpFlagsV['flags']) ? $gpFlagsV['flags'] : 0);
        $gpFlags = pack("v", $flag | (1 << 11));
    }

    // Fields shared by the local header and the central directory record.
    $header = $gpFlags . $gzType . $dosTime . $fileCRC32
            . pack("VVv", $gzLength, $dataLength, strlen($filePath)); // File name length

    // Local file header.
    $zipEntry = self::ZIP_LOCAL_FILE_HEADER
              . self::ATTR_VERSION_TO_EXTRACT
              . $header
              . pack("v", ($this->addExtraField ? 28 : 0)) // Extra field length
              . $filePath; // FileName
    if ($this->addExtraField) {
        // "UT" extra field (timestamp stored twice) followed by the "ux" block: 13 + 15 = 28 bytes.
        $zipEntry .= "\x55\x54\x09\x00\x03" . $tsPack . $tsPack . $ux;
    }
    $this->zipwrite($zipEntry);

    // Central directory record.
    $cdEntry = self::ZIP_CENTRAL_FILE_HEADER
             . self::ATTR_MADE_BY_VERSION
             . ($dataLength === 0 ? "\x0A\x00" : self::ATTR_VERSION_TO_EXTRACT)
             . $header
             . pack("v", ($this->addExtraField ? 24 : 0)) // Extra field length
             . pack("v", $fileCommentLength) // File comment length
             . "\x00\x00" // Disk number start
             . "\x00\x00" // internal file attributes
             . pack("V", $extFileAttr) // External file attributes
             . pack("V", $this->offset) // Relative offset of local header
             . $filePath; // FileName
    if ($this->addExtraField) {
        // "UT" extra field (timestamp stored once) followed by the "ux" block: 9 + 15 = 24 bytes.
        $cdEntry .= "\x55\x54\x05\x00\x03" . $tsPack . $ux;
    }
    if ($hasComment) {
        $cdEntry .= $fileComment; // Comment
    }

    $this->cdRec[] = $cdEntry;
    $this->offset += strlen($zipEntry) + $gzLength;
}
706 | |||
/**
 * Append raw bytes to the archive output.
 *
 * When $this->zipFile is an open stream the bytes are written to it and the
 * stream is flushed; otherwise they are accumulated in $this->zipData.
 *
 * @param string $data Bytes to append.
 */
private function zipwrite($data) {
    if (is_resource($this->zipFile)) {
        fwrite($this->zipFile, $data);
        fflush($this->zipFile);
        return;
    }
    $this->zipData .= $data;
}
715 | |||
/**
 * Move the archive data buffered in memory into a temporary file.
 *
 * Does nothing when $this->zipFile is already an open stream. If the
 * temporary file cannot be created, the buffered data is left untouched in
 * $this->zipData (so zipwrite() keeps appending to it) and a warning is
 * raised, instead of discarding what has been written so far.
 */
private function zipflush() {
    if (is_resource($this->zipFile)) {
        return;
    }

    $handle = tmpfile();
    if (!is_resource($handle)) {
        // tmpfile() returns FALSE on failure; dropping the buffer here would corrupt the archive.
        trigger_error("Unable to create a temporary file for the Zip archive, keeping the data in memory.", E_USER_WARNING);
        return;
    }

    $this->zipFile = $handle;
    fwrite($this->zipFile, (string)$this->zipData);
    $this->zipData = NULL;
}
723 | |||
/**
 * Join $file to $dir path, and clean up any excess slashes.
 *
 * When either part is missing (NULL, FALSE or the empty string) the two are
 * concatenated without a separator; otherwise a "/" is inserted between
 * them. The result is normalized by getRelativePath().
 *
 * A part that is the string "0" is a legitimate name and is joined like any
 * other, which is why the check is not done with empty().
 *
 * @param string $dir
 * @param string $file
 * @return string the joined and cleaned path
 */
public static function pathJoin($dir, $file) {
    $dirMissing = ($dir === NULL || $dir === FALSE || $dir === '');
    $fileMissing = ($file === NULL || $file === FALSE || $file === '');

    if ($dirMissing || $fileMissing) {
        return self::getRelativePath($dir . $file);
    }
    return self::getRelativePath($dir . '/' . $file);
}
736 | |||
/**
 * Clean up a path, removing any unnecessary elements such as /./, // or redundant ../ segments.
 * If the path starts with a "/", it is deemed an absolute path and any /../ in the beginning is stripped off.
 * A first segment containing a letter followed by ":" is treated as a root as well and is upper-cased.
 * The returned path does not end in a "/", unless it consists only of the root or of "../" steps.
 *
 * Sometimes, when a path is generated from multiple fragments,
 * you can get something like "../data/html/../images/image.jpeg"
 * This will normalize that example path to "../data/images/image.jpeg"
 *
 * @param string $path The path to clean up
 * @return string the clean path
 */
public static function getRelativePath($path) {
    // Unify separators, then collapse "//" and "/./" runs into a single "/".
    $path = preg_replace("#/+\.?/+#", "/", str_replace("\\", "/", $path));
    // Drop leading "./" and any trailing "/" before splitting into segments.
    $dirs = explode("/", rtrim(preg_replace('#^(?:\./)+#', '', $path), '/'));

    $offset = 0;    // number of segments currently kept in $newDirs
    $sub = 0;       // number of "../" steps that climb above the starting point
    $subOffset = 0; // running balance of ".." segments against ordinary segments
    $root = "";

    // An empty first segment means the path started with "/". The comparison is
    // strict on purpose: empty() would also match a directory named "0".
    if ($dirs[0] === "") {
        $root = "/";
        $dirs = array_splice($dirs, 1);
    } else if (preg_match("#[A-Za-z]:#", $dirs[0])) {
        $root = strtoupper($dirs[0]) . "/";
        $dirs = array_splice($dirs, 1);
    }

    $newDirs = array();
    foreach ($dirs as $dir) {
        if ($dir !== "..") {
            $subOffset--;
            $newDirs[++$offset] = $dir;
        } else {
            $subOffset++;
            if (--$offset < 0) {
                // Nothing left to pop: this ".." climbs above the starting point.
                $offset = 0;
                if ($subOffset > $sub) {
                    $sub++;
                }
            }
        }
    }

    // Only rootless paths can keep leading "../" steps.
    if (empty($root)) {
        $root = str_repeat("../", $sub);
    }
    return $root . implode("/", array_slice($newDirs, 0, $offset));
}
787 | |||
/**
 * Create the file permissions for a file or directory, for use in the extFileAttr parameters.
 *
 * The file type bits (self::S_IFREG or self::S_IFDIR) combined with the three
 * permission groups go into the upper 16 bits of the result; the lower bits
 * carry self::S_DOS_A for files or self::S_DOS_D for directories.
 *
 * @param int $owner Unix permissions for owner (octal from 00 to 07)
 * @param int $group Unix permissions for group (octal from 00 to 07)
 * @param int $other Unix permissions for others (octal from 00 to 07)
 * @param bool $isFile TRUE for a file, FALSE for a directory
 * @return int the external file attributes field
 */
public static function generateExtAttr($owner = 07, $group = 05, $other = 05, $isFile = true) {
    if ($isFile) {
        $typeBits = self::S_IFREG;
        $dosAttr = self::S_DOS_A;
    } else {
        $typeBits = self::S_IFDIR;
        $dosAttr = self::S_DOS_D;
    }

    // Each group is masked to its three permission bits before being shifted into place.
    $permBits = (($owner & 07) << 6) | (($group & 07) << 3) | ($other & 07);

    return (($typeBits | $permBits) << 16) | $dosAttr;
}
803 | |||
/**
 * Get the file permissions for a file or directory, for use in the extFileAttr parameters.
 *
 * The value of fileperms() is placed in the upper 16 bits; the lower bits
 * carry self::S_DOS_D for a directory and self::S_DOS_A for anything else.
 *
 * @param string $filename
 * @return int|bool the external file attributes field, or FALSE if the file is not found.
 */
public static function getFileExtAttr($filename) {
    if (!file_exists($filename)) {
        return FALSE;
    }

    $fp = fileperms($filename) << 16;
    $dosAttr = (is_dir($filename) ? self::S_DOS_D : self::S_DOS_A);
    return $fp | $dosAttr;
}
817 | } | ||
818 | ?> | ||
diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt new file mode 100644 index 00000000..9424a83e --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.LICENCE.txt | |||
@@ -0,0 +1,31 @@ | |||
1 | DrUUID RFC4122 library for PHP5 | ||
2 | by J. King (http://jkingweb.ca/) | ||
3 | Licensed under MIT license | ||
4 | |||
5 | See http://jkingweb.ca/code/php/lib.uuid/ | ||
6 | for documentation | ||
7 | |||
8 | Last revised 2010-02-15 | ||
9 | |||
10 | Copyright (c) 2009 J. King | ||
11 | |||
12 | Permission is hereby granted, free of charge, to any person | ||
13 | obtaining a copy of this software and associated documentation | ||
14 | files (the "Software"), to deal in the Software without | ||
15 | restriction, including without limitation the rights to use, | ||
16 | copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
17 | copies of the Software, and to permit persons to whom the | ||
18 | Software is furnished to do so, subject to the following | ||
19 | conditions: | ||
20 | |||
21 | The above copyright notice and this permission notice shall be | ||
22 | included in all copies or substantial portions of the Software. | ||
23 | |||
24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
26 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
27 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | ||
28 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||
29 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
30 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
31 | OTHER DEALINGS IN THE SOFTWARE. | ||
diff --git a/inc/3rdparty/libraries/PHPePub/lib.uuid.php b/inc/3rdparty/libraries/PHPePub/lib.uuid.php new file mode 100644 index 00000000..c6a8de52 --- /dev/null +++ b/inc/3rdparty/libraries/PHPePub/lib.uuid.php | |||
@@ -0,0 +1,314 @@ | |||
1 | <?php | ||
2 | /* | ||
3 | DrUUID RFC4122 library for PHP5 | ||
4 | by J. King (http://jkingweb.ca/) | ||
5 | Licensed under MIT license | ||
6 | |||
7 | See http://jkingweb.ca/code/php/lib.uuid/ | ||
8 | for documentation | ||
9 | |||
10 | Last revised 2010-02-15 | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | Copyright (c) 2009 J. King | ||
15 | |||
16 | Permission is hereby granted, free of charge, to any person | ||
17 | obtaining a copy of this software and associated documentation | ||
18 | files (the "Software"), to deal in the Software without | ||
19 | restriction, including without limitation the rights to use, | ||
20 | copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
21 | copies of the Software, and to permit persons to whom the | ||
22 | Software is furnished to do so, subject to the following | ||
23 | conditions: | ||
24 | |||
25 | The above copyright notice and this permission notice shall be | ||
26 | included in all copies or substantial portions of the Software. | ||
27 | |||
28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
29 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
30 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
31 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | ||
32 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||
33 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
34 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||
35 | OTHER DEALINGS IN THE SOFTWARE. | ||
36 | */ | ||
37 | |||
38 | |||
/**
 * RFC 4122 UUID value object with factories for version 1, 3, 4 and 5 UUIDs.
 *
 * Instances are created through UUID::mint() or UUID::import(); the
 * constructor is protected. The read-only virtual properties bytes, hex,
 * string, urn, version, variant, node and time are served by __get().
 */
class UUID {
    const MD5 = 3;        // hash selector for mintName(): version 3 UUIDs
    const SHA1 = 5;       // hash selector for mintName(): version 5 UUIDs
    const clearVer = 15;  // 00001111 Clears all bits of version byte with AND
    const clearVar = 63;  // 00111111 Clears all relevant bits of variant byte with AND
    const varRes = 224;   // 11100000 Variant reserved for future use
    const varMS = 192;    // 11000000 Microsoft GUID variant
    const varRFC = 128;   // 10000000 The RFC 4122 variant (this variant)
    const varNCS = 0;     // 00000000 The NCS compatibility variant
    const version1 = 16;  // 00010000
    const version3 = 48;  // 00110000
    const version4 = 64;  // 01000000
    const version5 = 80;  // 01010000
    const interval = 0x01b21dd213814000; // Time (in 100ns steps) between the UUID (Gregorian reform) and Unix epochs
    const nsDNS = '6ba7b810-9dad-11d1-80b4-00c04fd430c8';
    const nsURL = '6ba7b811-9dad-11d1-80b4-00c04fd430c8';
    const nsOID = '6ba7b812-9dad-11d1-80b4-00c04fd430c8';
    const nsX500 = '6ba7b814-9dad-11d1-80b4-00c04fd430c8';
    // Name of the static method randomBytes() dispatches to, and the handle/object it reads from.
    protected static $randomFunc = 'randomTwister';
    protected static $randomSource = NULL;
    //instance properties
    protected $bytes;
    protected $hex;
    protected $string;
    protected $urn;
    protected $version;
    protected $variant;
    protected $node;
    protected $time;

    /**
     * Create a new UUID based on provided data.
     *
     * @param int $ver UUID version: 1 (time), 3 (MD5 name), 4 (random) or 5 (SHA-1 name).
     * @param mixed $node Version 1: optional node (MAC address); versions 3 and 5: the name to hash.
     * @param mixed $ns Versions 3 and 5: namespace UUID (UUID instance, binary or hex/string form).
     * @return UUID
     * @throws UUIDException if the version is invalid or unsupported.
     */
    public static function mint($ver = 1, $node = NULL, $ns = NULL) {
        switch((int) $ver) {
            case 1:
                return new self(self::mintTime($node));
            case 2:
                // Version 2 is not supported
                throw new UUIDException("Version 2 is unsupported.");
            case 3:
                return new self(self::mintName(self::MD5, $node, $ns));
            case 4:
                return new self(self::mintRand());
            case 5:
                return new self(self::mintName(self::SHA1, $node, $ns));
            default:
                throw new UUIDException("Selected version is invalid or unsupported.");
        }
    }

    /**
     * Import an existing UUID.
     *
     * @param mixed $uuid UUID instance, 16-byte binary string, or hex/URN string form.
     * @return UUID
     * @throws UUIDException if the input cannot be turned into 16 bytes.
     */
    public static function import($uuid) {
        return new self(self::makeBin($uuid, 16));
    }

    /**
     * Compares the binary representations of two UUIDs.
     * The comparison will return true if they are bit-exact,
     * or if neither is valid.
     *
     * @param mixed $a
     * @param mixed $b
     * @return bool
     */
    public static function compare($a, $b) {
        // Strict comparison: with "==" two different binary strings that both look
        // like numbers (e.g. all ASCII digits) would be compared numerically.
        return self::makeBin($a, 16) === self::makeBin($b, 16);
    }

    /**
     * @return string the canonical hyphenated hexadecimal form.
     */
    public function __toString() {
        return $this->string;
    }

    /**
     * Serve the read-only virtual properties.
     *
     * @param string $var One of bytes, hex, string, urn, version, variant, node, time.
     * @return mixed the requested value; NULL for unknown names, and for node/time on non version 1 UUIDs.
     */
    public function __get($var) {
        switch($var) {
            case "bytes":
                return $this->bytes;
            case "hex":
                return bin2hex($this->bytes);
            case "string":
                return $this->__toString();
            case "urn":
                return "urn:uuid:".$this->__toString();
            case "version":
                return ord($this->bytes[6]) >> 4;
            case "variant":
                $byte = ord($this->bytes[8]);
                if ($byte >= self::varRes) {
                    return 3;
                }
                if ($byte >= self::varMS) {
                    return 2;
                }
                if ($byte >= self::varRFC) {
                    return 1;
                }
                return 0;
            case "node":
                if (ord($this->bytes[6])>>4==1) {
                    return bin2hex(substr($this->bytes,10));
                } else {
                    return NULL;
                }
            case "time":
                if (ord($this->bytes[6])>>4==1) {
                    // Restore contiguous big-endian byte order
                    $time = bin2hex($this->bytes[6].$this->bytes[7].$this->bytes[4].$this->bytes[5].$this->bytes[0].$this->bytes[1].$this->bytes[2].$this->bytes[3]);
                    // Clear version flag
                    $time[0] = "0";
                    // Do some reverse arithmetic to get a Unix timestamp
                    $time = (hexdec($time) - self::interval) / 10000000;
                    return $time;
                } else {
                    return NULL;
                }
            default:
                return NULL;
        }
    }

    /**
     * @param string $uuid The UUID as a 16-byte binary string.
     * @throws UUIDException if the input is not exactly 16 bytes long.
     */
    protected function __construct($uuid) {
        if (strlen($uuid) != 16) {
            throw new UUIDException("Input must be a 128-bit integer.");
        }
        $this->bytes = $uuid;
        // Optimize the most common use
        $this->string =
            bin2hex(substr($uuid,0,4))."-".
            bin2hex(substr($uuid,4,2))."-".
            bin2hex(substr($uuid,6,2))."-".
            bin2hex(substr($uuid,8,2))."-".
            bin2hex(substr($uuid,10,6));
    }

    /**
     * Generates a Version 1 UUID.
     * These are derived from the time at which they were generated.
     *
     * @param mixed $node Optional 6-byte node (binary or hex); a random one is generated when missing or invalid.
     * @return string 16-byte binary UUID.
     */
    protected static function mintTime($node = NULL) {
        // Get time since Gregorian calendar reform in 100ns intervals
        // This is exceedingly difficult because of PHP's (and pack()'s)
        // integer size limits.
        // Note that this will never be more accurate than to the microsecond.
        $time = microtime(TRUE) * 10000000 + self::interval;
        // Convert to a string representation
        $time = sprintf("%F", $time);
        preg_match("/^\d+/", $time, $time); //strip decimal point
        // And now to a 64-bit binary representation
        $time = base_convert($time[0], 10, 16);
        $time = pack("H*", str_pad($time, 16, "0", STR_PAD_LEFT));
        // Reorder bytes to their proper locations in the UUID
        $uuid = $time[4].$time[5].$time[6].$time[7].$time[2].$time[3].$time[0].$time[1];
        // Generate a random clock sequence
        $uuid .= self::randomBytes(2);
        // set variant
        $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC);
        // set version
        $uuid[6] = chr(ord($uuid[6]) & self::clearVer | self::version1);
        // Set the final 'node' parameter, a MAC address
        if ($node) {
            $node = self::makeBin($node, 6);
        }
        if (!$node) {
            // If no node was provided or if the node was invalid,
            // generate a random MAC address and set the multicast bit
            $node = self::randomBytes(6);
            $node[0] = pack("C", ord($node[0]) | 1);
        }
        $uuid .= $node;
        return $uuid;
    }

    /**
     * Generate a Version 4 UUID.
     * These are derived solely from random numbers.
     *
     * @return string 16-byte binary UUID.
     */
    protected static function mintRand() {
        // generate random fields
        $uuid = self::randomBytes(16);
        // set variant
        $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC);
        // set version
        $uuid[6] = chr(ord($uuid[6]) & self::clearVer | self::version4);
        return $uuid;
    }

    /**
     * Generates a Version 3 or Version 5 UUID.
     * These are derived from a hash of a name and its namespace, in binary form.
     *
     * @param int $ver self::MD5 or self::SHA1.
     * @param string $node The name to hash.
     * @param mixed $ns The namespace UUID.
     * @return string 16-byte binary UUID.
     * @throws UUIDException if the name or namespace is missing, or $ver is not a known hash selector.
     */
    protected static function mintName($ver, $node, $ns) {
        // Only a truly missing name is rejected; "0" is a valid name and must not be caught by a falsy check.
        if ($node === NULL || $node === FALSE || $node === '') {
            throw new UUIDException("A name-string is required for Version 3 or 5 UUIDs.");
        }
        // if the namespace UUID isn't binary, make it so
        $ns = self::makeBin($ns, 16);
        if (!$ns) {
            throw new UUIDException("A binary namespace is required for Version 3 or 5 UUIDs.");
        }
        switch($ver) {
            case self::MD5:
                $version = self::version3;
                $uuid = md5($ns.$node, TRUE);
                break;
            case self::SHA1:
                $version = self::version5;
                $uuid = substr(sha1($ns.$node, TRUE), 0, 16);
                break;
            default:
                // Without a hash there are no bytes to stamp the variant and version on.
                throw new UUIDException("Selected version is invalid or unsupported.");
        }
        // set variant
        $uuid[8] = chr(ord($uuid[8]) & self::clearVar | self::varRFC);
        // set version
        $uuid[6] = chr(ord($uuid[6]) & self::clearVer | $version);
        return ($uuid);
    }

    /**
     * Ensure that an input string is either binary or hexadecimal.
     *
     * @param mixed $str UUID instance, binary string of $len bytes, or a hexadecimal string (optionally an urn:uuid: URN).
     * @param int $len Expected length of the binary form in bytes.
     * @return string|bool binary representation, or FALSE on failure.
     */
    protected static function makeBin($str, $len) {
        if ($str instanceof self) {
            return $str->bytes;
        }
        if ($str === NULL) {
            // Avoid handing NULL to the string functions below; it is simply invalid input.
            return FALSE;
        }
        if (strlen($str)==$len) {
            // Already the right size: taken to be the binary form.
            return $str;
        }
        $str = preg_replace("/^urn:uuid:/is", "", $str); // strip URN scheme and namespace
        $str = preg_replace("/[^a-f0-9]/is", "", $str); // strip non-hex characters
        if (strlen($str) != ($len * 2)) {
            return FALSE;
        }
        return pack("H*", $str);
    }

    /**
     * Look for a system-provided source of randomness, which is usually cryptographically secure.
     * /dev/urandom is tried first simply out of bias for Linux systems.
     *
     * Calling this again after a source has been set up is a no-op, so that
     * no additional handle is opened.
     *
     * @return string name of the method randomBytes() will dispatch to.
     */
    public static function initRandom() {
        if (self::$randomSource !== NULL) {
            return self::$randomFunc;
        }
        if (is_readable('/dev/urandom')) {
            $handle = fopen('/dev/urandom', 'rb');
            // Only switch over when the stream really opened; otherwise keep the current generator.
            if (is_resource($handle)) {
                self::$randomSource = $handle;
                self::$randomFunc = 'randomFRead';
            }
        }
        else if (class_exists('COM', 0)) {
            try {
                self::$randomSource = new COM('CAPICOM.Utilities.1'); // See http://msdn.microsoft.com/en-us/library/aa388182(VS.85).aspx
                self::$randomFunc = 'randomCOM';
            }
            catch(Exception $e) {
                // COM object unavailable: deliberately keep the current generator.
            }
        }
        return self::$randomFunc;
    }

    /**
     * Get random bytes from the currently selected generator.
     *
     * @param int $bytes Number of bytes wanted.
     * @return string
     */
    public static function randomBytes($bytes) {
        $method = self::$randomFunc;
        return self::$method($bytes);
    }

    /**
     * Get the specified number of random bytes, using mt_rand().
     * Randomness is returned as a string of bytes.
     */
    protected static function randomTwister($bytes) {
        $rand = "";
        for ($a = 0; $a < $bytes; $a++) {
            $rand .= chr(mt_rand(0, 255));
        }
        return $rand;
    }

    /**
     * Get the specified number of random bytes using a file handle
     * previously opened with UUID::initRandom().
     * Randomness is returned as a string of bytes.
     *
     * @throws UUIDException if the source stops delivering data.
     */
    protected static function randomFRead($bytes) {
        $rand = "";
        // fread() may return fewer bytes than asked for, so read until the request is satisfied.
        while (strlen($rand) < $bytes) {
            $chunk = fread(self::$randomSource, $bytes - strlen($rand));
            if ($chunk === FALSE || $chunk === "") {
                throw new UUIDException("Unable to read from the system randomness source.");
            }
            $rand .= $chunk;
        }
        return $rand;
    }

    /**
     * Get the specified number of random bytes using Windows'
     * randomness source via a COM object previously created by UUID::initRandom().
     * Randomness is returned as a string of bytes.
     */
    protected static function randomCOM($bytes) {
        return base64_decode(self::$randomSource->GetRandom($bytes,0)); // straight binary mysteriously doesn't work, hence the base64
    }
}
312 | |||
/**
 * Exception type thrown by the UUID class.
 */
class UUIDException extends Exception
{
}
diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php index ddd33bb5..21e693e7 100644 --- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php +++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php | |||
@@ -1,728 +1,727 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Content Extractor | 3 | * Content Extractor |
4 | * | 4 | * |
5 | * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) | 5 | * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) |
6 | * to extract content from HTML files. | 6 | * to extract content from HTML files. |
7 | * | 7 | * |
8 | * @version 1.0 | 8 | * @version 1.0 |
9 | * @date 2013-02-05 | 9 | * @date 2013-02-05 |
10 | * @author Keyvan Minoukadeh | 10 | * @author Keyvan Minoukadeh |
11 | * @copyright 2013 Keyvan Minoukadeh | 11 | * @copyright 2013 Keyvan Minoukadeh |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
13 | */ | 13 | */ |
14 | 14 | ||
15 | class ContentExtractor | 15 | class ContentExtractor |
16 | { | 16 | { |
17 | protected static $tidy_config = array( | 17 | protected static $tidy_config = array( |
18 | 'clean' => true, | 18 | 'clean' => true, |
19 | 'output-xhtml' => true, | 19 | 'output-xhtml' => true, |
20 | 'logical-emphasis' => true, | 20 | 'logical-emphasis' => true, |
21 | 'show-body-only' => false, | 21 | 'show-body-only' => false, |
22 | 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', | 22 | 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', |
23 | 'new-inline-tags' => 'mark, time, meter, progress, data', | 23 | 'new-inline-tags' => 'mark, time, meter, progress, data', |
24 | 'wrap' => 0, | 24 | 'wrap' => 0, |
25 | 'drop-empty-paras' => true, | 25 | 'drop-empty-paras' => true, |
26 | 'drop-proprietary-attributes' => false, | 26 | 'drop-proprietary-attributes' => false, |
27 | 'enclose-text' => true, | 27 | 'enclose-text' => true, |
28 | 'enclose-block-text' => true, | 28 | 'enclose-block-text' => true, |
29 | 'merge-divs' => true, | 29 | 'merge-divs' => true, |
30 | 'merge-spans' => true, | 30 | 'merge-spans' => true, |
31 | 'char-encoding' => 'utf8', | 31 | 'char-encoding' => 'utf8', |
32 | 'hide-comments' => true | 32 | 'hide-comments' => true |
33 | ); | 33 | ); |
34 | protected $html; | 34 | protected $html; |
35 | protected $config; | 35 | protected $config; |
36 | protected $title; | 36 | protected $title; |
37 | protected $author = array(); | 37 | protected $author = array(); |
38 | protected $language; | 38 | protected $language; |
39 | protected $date; | 39 | protected $date; |
40 | protected $body; | 40 | protected $body; |
41 | protected $success = false; | 41 | protected $success = false; |
42 | protected $nextPageUrl; | 42 | protected $nextPageUrl; |
43 | public $allowedParsers = array('libxml', 'html5lib'); | 43 | public $allowedParsers = array('libxml', 'html5lib'); |
44 | public $fingerprints = array(); | 44 | public $fingerprints = array(); |
45 | public $readability; | 45 | public $readability; |
46 | public $debug = false; | 46 | public $debug = false; |
47 | public $debugVerbose = false; | 47 | public $debugVerbose = false; |
48 | 48 | ||
/**
 * Hand the site config path settings over to SiteConfig::set_config_path().
 *
 * @param mixed $path     first argument for SiteConfig::set_config_path()
 * @param mixed $fallback second argument for SiteConfig::set_config_path()
 */
public function __construct($path, $fallback=null)
{
    SiteConfig::set_config_path($path, $fallback);
}
52 | 52 | ||
/**
 * Print a debug line and push it out immediately; does nothing unless $this->debug is set.
 *
 * With $this->debugVerbose the current and peak memory usage (in KB) are appended.
 *
 * @param string $msg message to print
 */
protected function debug($msg) {
    if (!$this->debug) {
        return;
    }

    echo '* ',$msg;
    if ($this->debugVerbose) {
        // Only measured when it is actually going to be printed.
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo ' - mem used: ',$mem," (peak: $memPeak)";
    }
    echo "\n";

    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}
64 | 64 | ||
/**
 * Clear everything left over from a previous extraction run.
 */
public function reset() {
    // Source material and configuration.
    $this->html = $this->readability = $this->config = null;

    // Extracted results.
    $this->title = $this->body = null;
    $this->author = array();
    $this->language = $this->date = $this->nextPageUrl = null;

    $this->success = false;
}
77 | 77 | ||
/**
 * Look for one of the configured fingerprint strings in the HTML.
 *
 * Each entry of $this->fingerprints maps a fingerprint string either to a
 * host name, or to an array with a 'hostname' key and an optional 'head'
 * flag. When the flag is set, only the first 8000 bytes of the HTML are
 * searched.
 *
 * @param string $html
 * @return string|bool host name of the first matching fingerprint, false if none matches
 */
public function findHostUsingFingerprints($html) {
    $this->debug('Checking fingerprints...');
    $head = substr($html, 0, 8000);

    foreach ($this->fingerprints as $_fp => $_fphost) {
        $haystack = $html;
        if (is_array($_fphost)) {
            if (!empty($_fphost['head'])) {
                $haystack = $head;
            }
            $_fphost = $_fphost['hostname'];
        }

        if (strpos($haystack, $_fp) === false) {
            continue;
        }
        $this->debug("Found match: $_fphost");
        return $_fphost;
    }

    $this->debug('No fingerprint matches');
    return false;
}
97 | 97 | ||
/**
 * Build the site config to use for a URL.
 *
 * Returns a SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default).
 *
 * @param string $url          URL whose host name selects the config
 * @param string $html         HTML that is checked for fingerprints
 * @param bool   $add_to_cache whether the configs involved, and the merged result, are added to the SiteConfig cache
 * @return SiteConfig
 */
public function buildSiteConfig($url, $html='', $add_to_cache=true) {
    // extract host name, lower-cased and without a leading "www."
    $host = strtolower(@parse_url($url, PHP_URL_HOST));
    if (substr($host, 0, 4) == 'www.') {
        $host = substr($host, 4);
    }

    // is merged version already cached?
    $mergedKey = "$host.merged";
    if (SiteConfig::is_cached($mergedKey)) {
        $this->debug("Returning cached and merged site config for $host");
        return SiteConfig::build($mergedKey);
    }

    // let's build from site_config/custom/ and standard/
    $config = SiteConfig::build($host);
    if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
        SiteConfig::add_to_cache($host, $config);
    }
    // if no match, use defaults
    if (!$config) {
        $config = new SiteConfig();
    }

    // load fingerprint config?
    if ($config->autodetect_on_failure() && !empty($this->fingerprints)) {
        // check HTML for fingerprints
        $_fphost = $this->findHostUsingFingerprints($html);
        $config_fingerprint = ($_fphost ? SiteConfig::build($_fphost) : false);
        if ($config_fingerprint) {
            $this->debug("Appending site config settings from $_fphost (fingerprint match)");
            $config->append($config_fingerprint);
            if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
                SiteConfig::add_to_cache($_fphost, $config_fingerprint);
            }
        }
    }

    // load global config? (asked again, after the fingerprint config may have been appended)
    if ($config->autodetect_on_failure()) {
        $config_global = SiteConfig::build('global', true);
        if ($config_global) {
            $this->debug('Appending site config settings from global.txt');
            $config->append($config_global);
            if ($add_to_cache && !SiteConfig::is_cached('global')) {
                SiteConfig::add_to_cache('global', $config_global);
            }
        }
    }

    // store copy of merged config
    if ($add_to_cache) {
        // do not store in APC if wildcard match
        $use_apc = ($host == $config->cache_key);
        $config->cache_key = null;
        SiteConfig::add_to_cache($mergedKey, $config, $use_apc);
    }
    return $config;
}
150 | 150 | ||
151 | // returns true on success, false on failure | 151 | // returns true on success, false on failure |
152 | // $smart_tidy indicates that if tidy is used and no results are produced, we will | 152 | // $smart_tidy indicates that if tidy is used and no results are produced, we will |
153 | // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time | 153 | // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time |
154 | // but it has problems of its own which we try to avoid with this option. | 154 | // but it has problems of its own which we try to avoid with this option. |
155 | public function process($html, $url, $smart_tidy=true) { | 155 | public function process($html, $url, $smart_tidy=true) { |
156 | $this->reset(); | 156 | $this->reset(); |
157 | $this->config = $this->buildSiteConfig($url, $html); | 157 | $this->config = $this->buildSiteConfig($url, $html); |
158 | 158 | ||
159 | // do string replacements | 159 | // do string replacements |
160 | if (!empty($this->config->find_string)) { | 160 | if (!empty($this->config->find_string)) { |
161 | if (count($this->config->find_string) == count($this->config->replace_string)) { | 161 | if (count($this->config->find_string) == count($this->config->replace_string)) { |
162 | $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); | 162 | $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count); |
163 | $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); | 163 | $this->debug("Strings replaced: $_count (find_string and/or replace_string)"); |
164 | } else { | 164 | } else { |
165 | $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); | 165 | $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); |
166 | } | 166 | } |
167 | unset($_count); | 167 | unset($_count); |
168 | } | 168 | } |
169 | 169 | ||
170 | // use tidy (if it exists)? | 170 | // use tidy (if it exists)? |
171 | // This fixes problems with some sites which would otherwise | 171 | // This fixes problems with some sites which would otherwise |
172 | // trouble DOMDocument's HTML parsing. (Although sometimes it | 172 | // trouble DOMDocument's HTML parsing. (Although sometimes it |
173 | // makes matters worse, which is why you can override it in site config files.) | 173 | // makes matters worse, which is why you can override it in site config files.) |
174 | $tidied = false; | 174 | $tidied = false; |
175 | if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { | 175 | if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) { |
176 | $this->debug('Using Tidy'); | 176 | $this->debug('Using Tidy'); |
177 | $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); | 177 | $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8'); |
178 | if (tidy_clean_repair($tidy)) { | 178 | if (tidy_clean_repair($tidy)) { |
179 | $original_html = $html; | 179 | $original_html = $html; |
180 | $tidied = true; | 180 | $tidied = true; |
181 | $html = $tidy->value; | 181 | $html = $tidy->value; |
182 | } | 182 | } |
183 | unset($tidy); | 183 | unset($tidy); |
184 | } | 184 | } |
185 | 185 | ||
186 | // load and parse html | 186 | // load and parse html |
187 | $_parser = $this->config->parser(); | 187 | $_parser = $this->config->parser(); |
188 | if (!in_array($_parser, $this->allowedParsers)) { | 188 | if (!in_array($_parser, $this->allowedParsers)) { |
189 | $this->debug("HTML parser $_parser not listed, using libxml instead"); | 189 | $this->debug("HTML parser $_parser not listed, using libxml instead"); |
190 | $_parser = 'libxml'; | 190 | $_parser = 'libxml'; |
191 | } | 191 | } |
192 | $this->debug("Attempting to parse HTML with $_parser"); | 192 | $this->debug("Attempting to parse HTML with $_parser"); |
193 | $this->readability = new Readability($html, $url, $_parser); | 193 | $this->readability = new Readability($html, $url, $_parser); |
194 | 194 | ||
195 | // we use xpath to find elements in the given HTML document | 195 | // we use xpath to find elements in the given HTML document |
196 | // see http://en.wikipedia.org/wiki/XPath_1.0 | 196 | // see http://en.wikipedia.org/wiki/XPath_1.0 |
197 | $xpath = new DOMXPath($this->readability->dom); | 197 | $xpath = new DOMXPath($this->readability->dom); |
198 | 198 | ||
199 | // try to get next page link | 199 | // try to get next page link |
200 | foreach ($this->config->next_page_link as $pattern) { | 200 | foreach ($this->config->next_page_link as $pattern) { |
201 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 201 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
202 | if (is_string($elems)) { | 202 | if (is_string($elems)) { |
203 | $this->nextPageUrl = trim($elems); | 203 | $this->nextPageUrl = trim($elems); |
204 | break; | 204 | break; |
205 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 205 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
206 | foreach ($elems as $item) { | 206 | foreach ($elems as $item) { |
207 | if ($item instanceof DOMElement && $item->hasAttribute('href')) { | 207 | if ($item instanceof DOMElement && $item->hasAttribute('href')) { |
208 | $this->nextPageUrl = $item->getAttribute('href'); | 208 | $this->nextPageUrl = $item->getAttribute('href'); |
209 | break 2; | 209 | break 2; |
210 | } elseif ($item instanceof DOMAttr && $item->value) { | 210 | } elseif ($item instanceof DOMAttr && $item->value) { |
211 | $this->nextPageUrl = $item->value; | 211 | $this->nextPageUrl = $item->value; |
212 | break 2; | 212 | break 2; |
213 | } | 213 | } |
214 | } | 214 | } |
215 | } | 215 | } |
216 | } | 216 | } |
217 | 217 | ||
218 | // try to get title | 218 | // try to get title |
219 | foreach ($this->config->title as $pattern) { | 219 | foreach ($this->config->title as $pattern) { |
220 | // $this->debug("Trying $pattern"); | 220 | // $this->debug("Trying $pattern"); |
221 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 221 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
222 | if (is_string($elems)) { | 222 | if (is_string($elems)) { |
223 | $this->title = trim($elems); | 223 | $this->title = trim($elems); |
224 | $this->debug('Title expression evaluated as string: '.$this->title); | 224 | $this->debug('Title expression evaluated as string: '.$this->title); |
225 | $this->debug("...XPath match: $pattern"); | 225 | $this->debug("...XPath match: $pattern"); |
226 | break; | 226 | break; |
227 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 227 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
228 | $this->title = $elems->item(0)->textContent; | 228 | $this->title = $elems->item(0)->textContent; |
229 | $this->debug('Title matched: '.$this->title); | 229 | $this->debug('Title matched: '.$this->title); |
230 | $this->debug("...XPath match: $pattern"); | 230 | $this->debug("...XPath match: $pattern"); |
231 | // remove title from document | 231 | // remove title from document |
232 | try { | 232 | try { |
233 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 233 | @$elems->item(0)->parentNode->removeChild($elems->item(0)); |
234 | } catch (DOMException $e) { | 234 | } catch (DOMException $e) { |
235 | // do nothing | 235 | // do nothing |
236 | } | 236 | } |
237 | break; | 237 | break; |
238 | } | 238 | } |
239 | } | 239 | } |
240 | 240 | ||
241 | // try to get author (if it hasn't already been set) | 241 | // try to get author (if it hasn't already been set) |
242 | if (empty($this->author)) { | 242 | if (empty($this->author)) { |
243 | foreach ($this->config->author as $pattern) { | 243 | foreach ($this->config->author as $pattern) { |
244 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 244 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
245 | if (is_string($elems)) { | 245 | if (is_string($elems)) { |
246 | if (trim($elems) != '') { | 246 | if (trim($elems) != '') { |
247 | $this->author[] = trim($elems); | 247 | $this->author[] = trim($elems); |
248 | $this->debug('Author expression evaluated as string: '.trim($elems)); | 248 | $this->debug('Author expression evaluated as string: '.trim($elems)); |
249 | $this->debug("...XPath match: $pattern"); | 249 | $this->debug("...XPath match: $pattern"); |
250 | break; | 250 | break; |
251 | } | 251 | } |
252 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 252 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
253 | foreach ($elems as $elem) { | 253 | foreach ($elems as $elem) { |
254 | if (!isset($elem->parentNode)) continue; | 254 | if (!isset($elem->parentNode)) continue; |
255 | $this->author[] = trim($elem->textContent); | 255 | $this->author[] = trim($elem->textContent); |
256 | $this->debug('Author matched: '.trim($elem->textContent)); | 256 | $this->debug('Author matched: '.trim($elem->textContent)); |
257 | } | 257 | } |
258 | if (!empty($this->author)) { | 258 | if (!empty($this->author)) { |
259 | $this->debug("...XPath match: $pattern"); | 259 | $this->debug("...XPath match: $pattern"); |
260 | break; | 260 | break; |
261 | } | 261 | } |
262 | } | 262 | } |
263 | } | 263 | } |
264 | } | 264 | } |
265 | 265 | ||
266 | // try to get language | 266 | // try to get language |
267 | $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); | 267 | $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content'); |
268 | foreach ($_lang_xpath as $pattern) { | 268 | foreach ($_lang_xpath as $pattern) { |
269 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 269 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
270 | if (is_string($elems)) { | 270 | if (is_string($elems)) { |
271 | if (trim($elems) != '') { | 271 | if (trim($elems) != '') { |
272 | $this->language = trim($elems); | 272 | $this->language = trim($elems); |
273 | $this->debug('Language matched: '.$this->language); | 273 | $this->debug('Language matched: '.$this->language); |
274 | break; | 274 | break; |
275 | } | 275 | } |
276 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 276 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
277 | foreach ($elems as $elem) { | 277 | foreach ($elems as $elem) { |
278 | if (!isset($elem->parentNode)) continue; | 278 | if (!isset($elem->parentNode)) continue; |
279 | $this->language = trim($elem->textContent); | 279 | $this->language = trim($elem->textContent); |
280 | $this->debug('Language matched: '.$this->language); | 280 | $this->debug('Language matched: '.$this->language); |
281 | } | 281 | } |
282 | if ($this->language) break; | 282 | if ($this->language) break; |
283 | } | 283 | } |
284 | } | 284 | } |
285 | 285 | ||
286 | // try to get date | 286 | // try to get date |
287 | foreach ($this->config->date as $pattern) { | 287 | foreach ($this->config->date as $pattern) { |
288 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); | 288 | $elems = @$xpath->evaluate($pattern, $this->readability->dom); |
289 | if (is_string($elems)) { | 289 | if (is_string($elems)) { |
290 | $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); | 290 | $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B")); |
291 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { | 291 | } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { |
292 | $this->date = $elems->item(0)->textContent; | 292 | $this->date = $elems->item(0)->textContent; |
293 | $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); | 293 | $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B")); |
294 | // remove date from document | 294 | // remove date from document |
295 | // $elems->item(0)->parentNode->removeChild($elems->item(0)); | 295 | // $elems->item(0)->parentNode->removeChild($elems->item(0)); |
296 | } | 296 | } |
297 | if (!$this->date) { | 297 | if (!$this->date) { |
298 | $this->date = null; | 298 | $this->date = null; |
299 | } else { | 299 | } else { |
300 | $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); | 300 | $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date)); |
301 | $this->debug("...XPath match: $pattern"); | 301 | $this->debug("...XPath match: $pattern"); |
302 | break; | 302 | break; |
303 | } | 303 | } |
304 | } | 304 | } |
305 | 305 | ||
306 | // strip elements (using xpath expressions) | 306 | // strip elements (using xpath expressions) |
307 | foreach ($this->config->strip as $pattern) { | 307 | foreach ($this->config->strip as $pattern) { |
308 | $elems = @$xpath->query($pattern, $this->readability->dom); | 308 | $elems = @$xpath->query($pattern, $this->readability->dom); |
309 | // check for matches | 309 | // check for matches |
310 | if ($elems && $elems->length > 0) { | 310 | if ($elems && $elems->length > 0) { |
311 | $this->debug('Stripping '.$elems->length.' elements (strip)'); | 311 | $this->debug('Stripping '.$elems->length.' elements (strip)'); |
312 | for ($i=$elems->length-1; $i >= 0; $i--) { | 312 | for ($i=$elems->length-1; $i >= 0; $i--) { |
313 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 313 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
314 | } | 314 | } |
315 | } | 315 | } |
316 | } | 316 | } |
317 | 317 | ||
318 | // strip elements (using id and class attribute values) | 318 | // strip elements (using id and class attribute values) |
319 | foreach ($this->config->strip_id_or_class as $string) { | 319 | foreach ($this->config->strip_id_or_class as $string) { |
320 | $string = strtr($string, array("'"=>'', '"'=>'')); | 320 | $string = strtr($string, array("'"=>'', '"'=>'')); |
321 | $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); | 321 | $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); |
322 | // check for matches | 322 | // check for matches |
323 | if ($elems && $elems->length > 0) { | 323 | if ($elems && $elems->length > 0) { |
324 | $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); | 324 | $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); |
325 | for ($i=$elems->length-1; $i >= 0; $i--) { | 325 | for ($i=$elems->length-1; $i >= 0; $i--) { |
326 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 326 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
327 | } | 327 | } |
328 | } | 328 | } |
329 | } | 329 | } |
330 | 330 | ||
331 | // strip images (using src attribute values) | 331 | // strip images (using src attribute values) |
332 | foreach ($this->config->strip_image_src as $string) { | 332 | foreach ($this->config->strip_image_src as $string) { |
333 | $string = strtr($string, array("'"=>'', '"'=>'')); | 333 | $string = strtr($string, array("'"=>'', '"'=>'')); |
334 | $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); | 334 | $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); |
335 | // check for matches | 335 | // check for matches |
336 | if ($elems && $elems->length > 0) { | 336 | if ($elems && $elems->length > 0) { |
337 | $this->debug('Stripping '.$elems->length.' image elements'); | 337 | $this->debug('Stripping '.$elems->length.' image elements'); |
338 | for ($i=$elems->length-1; $i >= 0; $i--) { | 338 | for ($i=$elems->length-1; $i >= 0; $i--) { |
339 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 339 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
340 | } | 340 | } |
341 | } | 341 | } |
342 | } | 342 | } |
343 | // strip elements using Readability.com and Instapaper.com ignore class names | 343 | // strip elements using Readability.com and Instapaper.com ignore class names |
344 | // .entry-unrelated and .instapaper_ignore | 344 | // .entry-unrelated and .instapaper_ignore |
345 | // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines | 345 | // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines |
346 | // and http://blog.instapaper.com/post/730281947 | 346 | // and http://blog.instapaper.com/post/730281947 |
347 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); | 347 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom); |
348 | // check for matches | 348 | // check for matches |
349 | if ($elems && $elems->length > 0) { | 349 | if ($elems && $elems->length > 0) { |
350 | $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); | 350 | $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements'); |
351 | for ($i=$elems->length-1; $i >= 0; $i--) { | 351 | for ($i=$elems->length-1; $i >= 0; $i--) { |
352 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 352 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
353 | } | 353 | } |
354 | } | 354 | } |
355 | 355 | ||
356 | // strip elements that contain style="display: none;" | 356 | // strip elements that contain style="display: none;" |
357 | $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); | 357 | $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom); |
358 | // check for matches | 358 | // check for matches |
359 | if ($elems && $elems->length > 0) { | 359 | if ($elems && $elems->length > 0) { |
360 | $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); | 360 | $this->debug('Stripping '.$elems->length.' elements with inline display:none style'); |
361 | for ($i=$elems->length-1; $i >= 0; $i--) { | 361 | for ($i=$elems->length-1; $i >= 0; $i--) { |
362 | $elems->item($i)->parentNode->removeChild($elems->item($i)); | 362 | $elems->item($i)->parentNode->removeChild($elems->item($i)); |
363 | } | 363 | } |
364 | } | 364 | } |
365 | 365 | ||
366 | // try to get body | 366 | // try to get body |
367 | foreach ($this->config->body as $pattern) { | 367 | foreach ($this->config->body as $pattern) { |
368 | $elems = @$xpath->query($pattern, $this->readability->dom); | 368 | $elems = @$xpath->query($pattern, $this->readability->dom); |
369 | // check for matches | 369 | // check for matches |
370 | if ($elems && $elems->length > 0) { | 370 | if ($elems && $elems->length > 0) { |
371 | $this->debug('Body matched'); | 371 | $this->debug('Body matched'); |
372 | $this->debug("...XPath match: $pattern"); | 372 | $this->debug("...XPath match: $pattern"); |
373 | if ($elems->length == 1) { | 373 | if ($elems->length == 1) { |
374 | $this->body = $elems->item(0); | 374 | $this->body = $elems->item(0); |
375 | // prune (clean up elements that may not be content) | 375 | // prune (clean up elements that may not be content) |
376 | if ($this->config->prune()) { | 376 | if ($this->config->prune()) { |
377 | $this->debug('...pruning content'); | 377 | $this->debug('...pruning content'); |
378 | $this->readability->prepArticle($this->body); | 378 | $this->readability->prepArticle($this->body); |
379 | } | 379 | } |
380 | break; | 380 | break; |
381 | } else { | 381 | } else { |
382 | $this->body = $this->readability->dom->createElement('div'); | 382 | $this->body = $this->readability->dom->createElement('div'); |
383 | $this->debug($elems->length.' body elems found'); | 383 | $this->debug($elems->length.' body elems found'); |
384 | foreach ($elems as $elem) { | 384 | foreach ($elems as $elem) { |
385 | if (!isset($elem->parentNode)) continue; | 385 | if (!isset($elem->parentNode)) continue; |
386 | $isDescendant = false; | 386 | $isDescendant = false; |
387 | foreach ($this->body->childNodes as $parent) { | 387 | foreach ($this->body->childNodes as $parent) { |
388 | if ($this->isDescendant($parent, $elem)) { | 388 | if ($this->isDescendant($parent, $elem)) { |
389 | $isDescendant = true; | 389 | $isDescendant = true; |
390 | break; | 390 | break; |
391 | } | 391 | } |
392 | } | 392 | } |
393 | if ($isDescendant) { | 393 | if ($isDescendant) { |
394 | $this->debug('...element is child of another body element, skipping.'); | 394 | $this->debug('...element is child of another body element, skipping.'); |
395 | } else { | 395 | } else { |
396 | // prune (clean up elements that may not be content) | 396 | // prune (clean up elements that may not be content) |
397 | if ($this->config->prune()) { | 397 | if ($this->config->prune()) { |
398 | $this->debug('Pruning content'); | 398 | $this->debug('Pruning content'); |
399 | $this->readability->prepArticle($elem); | 399 | $this->readability->prepArticle($elem); |
400 | } | 400 | } |
401 | $this->debug('...element added to body'); | 401 | $this->debug('...element added to body'); |
402 | $this->body->appendChild($elem); | 402 | $this->body->appendChild($elem); |
403 | } | 403 | } |
404 | } | 404 | } |
405 | if ($this->body->hasChildNodes()) break; | 405 | if ($this->body->hasChildNodes()) break; |
406 | } | 406 | } |
407 | } | 407 | } |
408 | } | 408 | } |
409 | 409 | ||
410 | // auto detect? | 410 | // auto detect? |
411 | $detect_title = $detect_body = $detect_author = $detect_date = false; | 411 | $detect_title = $detect_body = $detect_author = $detect_date = false; |
412 | // detect title? | 412 | // detect title? |
413 | if (!isset($this->title)) { | 413 | if (!isset($this->title)) { |
414 | if (empty($this->config->title) || $this->config->autodetect_on_failure()) { | 414 | if (empty($this->config->title) || $this->config->autodetect_on_failure()) { |
415 | $detect_title = true; | 415 | $detect_title = true; |
416 | } | 416 | } |
417 | } | 417 | } |
418 | // detect body? | 418 | // detect body? |
419 | if (!isset($this->body)) { | 419 | if (!isset($this->body)) { |
420 | if (empty($this->config->body) || $this->config->autodetect_on_failure()) { | 420 | if (empty($this->config->body) || $this->config->autodetect_on_failure()) { |
421 | $detect_body = true; | 421 | $detect_body = true; |
422 | } | 422 | } |
423 | } | 423 | } |
424 | // detect author? | 424 | // detect author? |
425 | if (empty($this->author)) { | 425 | if (empty($this->author)) { |
426 | if (empty($this->config->author) || $this->config->autodetect_on_failure()) { | 426 | if (empty($this->config->author) || $this->config->autodetect_on_failure()) { |
427 | $detect_author = true; | 427 | $detect_author = true; |
428 | } | 428 | } |
429 | } | 429 | } |
430 | // detect date? | 430 | // detect date? |
431 | if (!isset($this->date)) { | 431 | if (!isset($this->date)) { |
432 | if (empty($this->config->date) || $this->config->autodetect_on_failure()) { | 432 | if (empty($this->config->date) || $this->config->autodetect_on_failure()) { |
433 | $detect_date = true; | 433 | $detect_date = true; |
434 | } | 434 | } |
435 | } | 435 | } |
436 | 436 | ||
437 | // check for hNews | 437 | // check for hNews |
438 | if ($detect_title || $detect_body) { | 438 | if ($detect_title || $detect_body) { |
439 | // check for hentry | 439 | // check for hentry |
440 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); | 440 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom); |
441 | if ($elems && $elems->length > 0) { | 441 | if ($elems && $elems->length > 0) { |
442 | $this->debug('hNews: found hentry'); | 442 | $this->debug('hNews: found hentry'); |
443 | $hentry = $elems->item(0); | 443 | $hentry = $elems->item(0); |
444 | 444 | ||
445 | if ($detect_title) { | 445 | if ($detect_title) { |
446 | // check for entry-title | 446 | // check for entry-title |
447 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); | 447 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry); |
448 | if ($elems && $elems->length > 0) { | 448 | if ($elems && $elems->length > 0) { |
449 | $this->title = $elems->item(0)->textContent; | 449 | $this->title = $elems->item(0)->textContent; |
450 | $this->debug('hNews: found entry-title: '.$this->title); | 450 | $this->debug('hNews: found entry-title: '.$this->title); |
451 | // remove title from document | 451 | // remove title from document |
452 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 452 | $elems->item(0)->parentNode->removeChild($elems->item(0)); |
453 | $detect_title = false; | 453 | $detect_title = false; |
454 | } | 454 | } |
455 | } | 455 | } |
456 | 456 | ||
457 | if ($detect_date) { | 457 | if ($detect_date) { |
458 | // check for time element with pubdate attribute | 458 | // check for time element with pubdate attribute |
459 | $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); | 459 | $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry); |
460 | if ($elems && $elems->length > 0) { | 460 | if ($elems && $elems->length > 0) { |
461 | $this->date = strtotime(trim($elems->item(0)->textContent)); | 461 | $this->date = strtotime(trim($elems->item(0)->textContent)); |
462 | // remove date from document | 462 | // remove date from document |
463 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); | 463 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); |
464 | if ($this->date) { | 464 | if ($this->date) { |
465 | $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); | 465 | $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date)); |
466 | $detect_date = false; | 466 | $detect_date = false; |
467 | } else { | 467 | } else { |
468 | $this->date = null; | 468 | $this->date = null; |
469 | } | 469 | } |
470 | } | 470 | } |
471 | } | 471 | } |
472 | 472 | ||
473 | if ($detect_author) { | 473 | if ($detect_author) { |
474 | // check for vcard element marked as author or byline | 474 | // check for vcard element marked as author or byline |
475 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); | 475 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry); |
476 | if ($elems && $elems->length > 0) { | 476 | if ($elems && $elems->length > 0) { |
477 | $author = $elems->item(0); | 477 | $author = $elems->item(0); |
478 | $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); | 478 | $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author); |
479 | if ($fn && $fn->length > 0) { | 479 | if ($fn && $fn->length > 0) { |
480 | foreach ($fn as $_fn) { | 480 | foreach ($fn as $_fn) { |
481 | if (trim($_fn->textContent) != '') { | 481 | if (trim($_fn->textContent) != '') { |
482 | $this->author[] = trim($_fn->textContent); | 482 | $this->author[] = trim($_fn->textContent); |
483 | $this->debug('hNews: found author: '.trim($_fn->textContent)); | 483 | $this->debug('hNews: found author: '.trim($_fn->textContent)); |
484 | } | 484 | } |
485 | } | 485 | } |
486 | } else { | 486 | } else { |
487 | if (trim($author->textContent) != '') { | 487 | if (trim($author->textContent) != '') { |
488 | $this->author[] = trim($author->textContent); | 488 | $this->author[] = trim($author->textContent); |
489 | $this->debug('hNews: found author: '.trim($author->textContent)); | 489 | $this->debug('hNews: found author: '.trim($author->textContent)); |
490 | } | 490 | } |
491 | } | 491 | } |
492 | $detect_author = empty($this->author); | 492 | $detect_author = empty($this->author); |
493 | } | 493 | } |
494 | } | 494 | } |
495 | 495 | ||
496 | // check for entry-content. | 496 | // check for entry-content. |
497 | // according to hAtom spec, if there are multiple elements marked entry-content, | 497 | // according to hAtom spec, if there are multiple elements marked entry-content, |
498 | // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content | 498 | // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content |
499 | if ($detect_body) { | 499 | if ($detect_body) { |
500 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); | 500 | $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry); |
501 | if ($elems && $elems->length > 0) { | 501 | if ($elems && $elems->length > 0) { |
502 | $this->debug('hNews: found entry-content'); | 502 | $this->debug('hNews: found entry-content'); |
503 | if ($elems->length == 1) { | 503 | if ($elems->length == 1) { |
504 | // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) | 504 | // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element) |
505 | $e = $elems->item(0); | 505 | $e = $elems->item(0); |
506 | if (($e->tagName == 'img') || (trim($e->textContent) != '')) { | 506 | if (($e->tagName == 'img') || (trim($e->textContent) != '')) { |
507 | $this->body = $elems->item(0); | 507 | $this->body = $elems->item(0); |
508 | // prune (clean up elements that may not be content) | 508 | // prune (clean up elements that may not be content) |
509 | if ($this->config->prune()) { | 509 | if ($this->config->prune()) { |
510 | $this->debug('Pruning content'); | 510 | $this->debug('Pruning content'); |
511 | $this->readability->prepArticle($this->body); | 511 | $this->readability->prepArticle($this->body); |
512 | } | 512 | } |
513 | $detect_body = false; | 513 | $detect_body = false; |
514 | } else { | 514 | } else { |
515 | $this->debug('hNews: skipping entry-content - appears not to contain content'); | 515 | $this->debug('hNews: skipping entry-content - appears not to contain content'); |
516 | } | 516 | } |
517 | unset($e); | 517 | unset($e); |
518 | } else { | 518 | } else { |
519 | $this->body = $this->readability->dom->createElement('div'); | 519 | $this->body = $this->readability->dom->createElement('div'); |
520 | $this->debug($elems->length.' entry-content elems found'); | 520 | $this->debug($elems->length.' entry-content elems found'); |
521 | foreach ($elems as $elem) { | 521 | foreach ($elems as $elem) { |
522 | if (!isset($elem->parentNode)) continue; | 522 | if (!isset($elem->parentNode)) continue; |
523 | $isDescendant = false; | 523 | $isDescendant = false; |
524 | foreach ($this->body->childNodes as $parent) { | 524 | foreach ($this->body->childNodes as $parent) { |
525 | if ($this->isDescendant($parent, $elem)) { | 525 | if ($this->isDescendant($parent, $elem)) { |
526 | $isDescendant = true; | 526 | $isDescendant = true; |
527 | break; | 527 | break; |
528 | } | 528 | } |
529 | } | 529 | } |
530 | if ($isDescendant) { | 530 | if ($isDescendant) { |
531 | $this->debug('Element is child of another body element, skipping.'); | 531 | $this->debug('Element is child of another body element, skipping.'); |
532 | } else { | 532 | } else { |
533 | // prune (clean up elements that may not be content) | 533 | // prune (clean up elements that may not be content) |
534 | if ($this->config->prune()) { | 534 | if ($this->config->prune()) { |
535 | $this->debug('Pruning content'); | 535 | $this->debug('Pruning content'); |
536 | $this->readability->prepArticle($elem); | 536 | $this->readability->prepArticle($elem); |
537 | } | 537 | } |
538 | $this->debug('Element added to body'); | 538 | $this->debug('Element added to body'); |
539 | $this->body->appendChild($elem); | 539 | $this->body->appendChild($elem); |
540 | } | 540 | } |
541 | } | 541 | } |
542 | $detect_body = false; | 542 | $detect_body = false; |
543 | } | 543 | } |
544 | } | 544 | } |
545 | } | 545 | } |
546 | } | 546 | } |
547 | } | 547 | } |
548 | 548 | ||
549 | // check for elements marked with instapaper_title | 549 | // check for elements marked with instapaper_title |
550 | if ($detect_title) { | 550 | if ($detect_title) { |
551 | // check for instapaper_title | 551 | // check for instapaper_title |
552 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); | 552 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom); |
553 | if ($elems && $elems->length > 0) { | 553 | if ($elems && $elems->length > 0) { |
554 | $this->title = $elems->item(0)->textContent; | 554 | $this->title = $elems->item(0)->textContent; |
555 | $this->debug('Title found (.instapaper_title): '.$this->title); | 555 | $this->debug('Title found (.instapaper_title): '.$this->title); |
556 | // remove title from document | 556 | // remove title from document |
557 | $elems->item(0)->parentNode->removeChild($elems->item(0)); | 557 | $elems->item(0)->parentNode->removeChild($elems->item(0)); |
558 | $detect_title = false; | 558 | $detect_title = false; |
559 | } | 559 | } |
560 | } | 560 | } |
561 | // check for elements marked with instapaper_body | 561 | // check for elements marked with instapaper_body |
562 | if ($detect_body) { | 562 | if ($detect_body) { |
563 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); | 563 | $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom); |
564 | if ($elems && $elems->length > 0) { | 564 | if ($elems && $elems->length > 0) { |
565 | $this->debug('body found (.instapaper_body)'); | 565 | $this->debug('body found (.instapaper_body)'); |
566 | $this->body = $elems->item(0); | 566 | $this->body = $elems->item(0); |
567 | // prune (clean up elements that may not be content) | 567 | // prune (clean up elements that may not be content) |
568 | if ($this->config->prune()) { | 568 | if ($this->config->prune()) { |
569 | $this->debug('Pruning content'); | 569 | $this->debug('Pruning content'); |
570 | $this->readability->prepArticle($this->body); | 570 | $this->readability->prepArticle($this->body); |
571 | } | 571 | } |
572 | $detect_body = false; | 572 | $detect_body = false; |
573 | } | 573 | } |
574 | } | 574 | } |
575 | 575 | ||
576 | // Find author in rel="author" marked element | 576 | // Find author in rel="author" marked element |
577 | // We only use this if there's exactly one. | 577 | // We only use this if there's exactly one. |
578 | // If there's more than one, it could indicate more than | 578 | // If there's more than one, it could indicate more than |
579 | // one author, but it could also indicate that we're processing | 579 | // one author, but it could also indicate that we're processing |
580 | // a page listing different articles with different authors. | 580 | // a page listing different articles with different authors. |
581 | if ($detect_author) { | 581 | if ($detect_author) { |
582 | $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); | 582 | $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom); |
583 | if ($elems && $elems->length == 1) { | 583 | if ($elems && $elems->length == 1) { |
584 | $author = trim($elems->item(0)->textContent); | 584 | $author = trim($elems->item(0)->textContent); |
585 | if ($author != '') { | 585 | if ($author != '') { |
586 | $this->debug("Author found (rel=\"author\"): $author"); | 586 | $this->debug("Author found (rel=\"author\"): $author"); |
587 | $this->author[] = $author; | 587 | $this->author[] = $author; |
588 | $detect_author = false; | 588 | $detect_author = false; |
589 | } | 589 | } |
590 | } | 590 | } |
591 | } | 591 | } |
592 | 592 | ||
593 | // Find date in pubdate marked time element | 593 | // Find date in pubdate marked time element |
594 | // For the same reason given above, we only use this | 594 | // For the same reason given above, we only use this |
595 | // if there's exactly one element. | 595 | // if there's exactly one element. |
596 | if ($detect_date) { | 596 | if ($detect_date) { |
597 | $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); | 597 | $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom); |
598 | if ($elems && $elems->length == 1) { | 598 | if ($elems && $elems->length == 1) { |
599 | $this->date = strtotime(trim($elems->item(0)->textContent)); | 599 | $this->date = strtotime(trim($elems->item(0)->textContent)); |
600 | // remove date from document | 600 | // remove date from document |
601 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); | 601 | //$elems->item(0)->parentNode->removeChild($elems->item(0)); |
602 | if ($this->date) { | 602 | if ($this->date) { |
603 | $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); | 603 | $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date)); |
604 | $detect_date = false; | 604 | $detect_date = false; |
605 | } else { | 605 | } else { |
606 | $this->date = null; | 606 | $this->date = null; |
607 | } | 607 | } |
608 | } | 608 | } |
609 | } | 609 | } |
610 | 610 | ||
611 | // still missing title or body, so we detect using Readability | 611 | // still missing title or body, so we detect using Readability |
612 | if ($detect_title || $detect_body) { | 612 | if ($detect_title || $detect_body) { |
613 | $this->debug('Using Readability'); | 613 | $this->debug('Using Readability'); |
614 | // clone body if we're only using Readability for title (otherwise it may interfere with body element) | 614 | // clone body if we're only using Readability for title (otherwise it may interfere with body element) |
615 | if (isset($this->body)) $this->body = $this->body->cloneNode(true); | 615 | if (isset($this->body)) $this->body = $this->body->cloneNode(true); |
616 | $success = $this->readability->init(); | 616 | $success = $this->readability->init(); |
617 | } | 617 | } |
618 | if ($detect_title) { | 618 | if ($detect_title) { |
619 | $this->debug('Detecting title'); | 619 | $this->debug('Detecting title'); |
620 | $this->title = $this->readability->getTitle()->textContent; | 620 | $this->title = $this->readability->getTitle()->textContent; |
621 | } | 621 | } |
622 | if ($detect_body && $success) { | 622 | if ($detect_body && $success) { |
623 | $this->debug('Detecting body'); | 623 | $this->debug('Detecting body'); |
624 | $this->body = $this->readability->getContent(); | 624 | $this->body = $this->readability->getContent(); |
625 | if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { | 625 | if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) { |
626 | $this->body = $this->body->firstChild; | 626 | $this->body = $this->body->firstChild; |
627 | } | 627 | } |
628 | // prune (clean up elements that may not be content) | 628 | // prune (clean up elements that may not be content) |
629 | if ($this->config->prune()) { | 629 | if ($this->config->prune()) { |
630 | $this->debug('Pruning content'); | 630 | $this->debug('Pruning content'); |
631 | $this->readability->prepArticle($this->body); | 631 | $this->readability->prepArticle($this->body); |
632 | } | 632 | } |
633 | } | 633 | } |
634 | if (isset($this->body)) { | 634 | if (isset($this->body)) { |
635 | // remove scripts | 635 | // remove scripts |
636 | $this->readability->removeScripts($this->body); | 636 | $this->readability->removeScripts($this->body); |
637 | // remove any h1-h6 elements that appear as first thing in the body | 637 | // remove any h1-h6 elements that appear as first thing in the body |
638 | // and which match our title | 638 | // and which match our title |
639 | if (isset($this->title) && ($this->title != '')) { | 639 | if (isset($this->title) && ($this->title != '')) { |
640 | $firstChild = $this->body->firstChild; | 640 | $firstChild = $this->body->firstChild; |
641 | while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { | 641 | while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) { |
642 | $firstChild = $firstChild->nextSibling; | 642 | $firstChild = $firstChild->nextSibling; |
643 | } | 643 | } |
644 | if (($firstChild->nodeType === XML_ELEMENT_NODE) | 644 | if (($firstChild->nodeType === XML_ELEMENT_NODE) |
645 | && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) | 645 | && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) |
646 | && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { | 646 | && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) { |
647 | $this->body->removeChild($firstChild); | 647 | $this->body->removeChild($firstChild); |
648 | } | 648 | } |
649 | } | 649 | } |
650 | // prevent self-closing iframes | 650 | // prevent self-closing iframes |
651 | $elems = $this->body->getElementsByTagName('iframe'); | 651 | $elems = $this->body->getElementsByTagName('iframe'); |
652 | for ($i = $elems->length-1; $i >= 0; $i--) { | 652 | for ($i = $elems->length-1; $i >= 0; $i--) { |
653 | $e = $elems->item($i); | 653 | $e = $elems->item($i); |
654 | if (!$e->hasChildNodes()) { | 654 | if (!$e->hasChildNodes()) { |
655 | $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); | 655 | $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); |
656 | } | 656 | } |
657 | } | 657 | } |
658 | // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ | 658 | // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ |
659 | // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src | 659 | // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src |
660 | // inside the data-lazy-src attribute. It also places the original image inside a noscript element | 660 | // inside the data-lazy-src attribute. It also places the original image inside a noscript element |
661 | // next to the amended one. | 661 | // next to the amended one. |
662 | $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); | 662 | $elems = @$xpath->query("//img[@data-lazy-src]", $this->body); |
663 | for ($i = $elems->length-1; $i >= 0; $i--) { | 663 | for ($i = $elems->length-1; $i >= 0; $i--) { |
664 | $e = $elems->item($i); | 664 | $e = $elems->item($i); |
665 | // let's see if we can grab image from noscript | 665 | // let's see if we can grab image from noscript |
666 | if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { | 666 | if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') { |
667 | $_new_elem = $e->ownerDocument->createDocumentFragment(); | 667 | $_new_elem = $e->ownerDocument->createDocumentFragment(); |
668 | @$_new_elem->appendXML($e->nextSibling->innerHTML); | 668 | @$_new_elem->appendXML($e->nextSibling->innerHTML); |
669 | $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); | 669 | $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling); |
670 | $e->parentNode->removeChild($e); | 670 | $e->parentNode->removeChild($e); |
671 | } else { | 671 | } else { |
672 | // Use data-lazy-src as src value | 672 | // Use data-lazy-src as src value |
673 | $e->setAttribute('src', $e->getAttribute('data-lazy-src')); | 673 | $e->setAttribute('src', $e->getAttribute('data-lazy-src')); |
674 | $e->removeAttribute('data-lazy-src'); | 674 | $e->removeAttribute('data-lazy-src'); |
675 | } | 675 | } |
676 | } | 676 | } |
677 | 677 | ||
678 | $this->success = true; | 678 | $this->success = true; |
679 | } | 679 | } |
680 | 680 | ||
681 | // if we've had no success and we've used tidy, there's a chance | 681 | // if we've had no success and we've used tidy, there's a chance |
682 | // that tidy has messed up. So let's try again without tidy... | 682 | // that tidy has messed up. So let's try again without tidy... |
683 | if (!$this->success && $tidied && $smart_tidy) { | 683 | if (!$this->success && $tidied && $smart_tidy) { |
684 | $this->debug('Trying again without tidy'); | 684 | $this->debug('Trying again without tidy'); |
685 | $this->process($original_html, $url, false); | 685 | $this->process($original_html, $url, false); |
686 | } | 686 | } |
687 | 687 | ||
688 | return $this->success; | 688 | return $this->success; |
689 | } | 689 | } |
690 | 690 | ||
691 | private function isDescendant(DOMElement $parent, DOMElement $child) { | 691 | private function isDescendant(DOMElement $parent, DOMElement $child) { |
692 | $node = $child->parentNode; | 692 | $node = $child->parentNode; |
693 | while ($node != null) { | 693 | while ($node != null) { |
694 | if ($node->isSameNode($parent)) return true; | 694 | if ($node->isSameNode($parent)) return true; |
695 | $node = $node->parentNode; | 695 | $node = $node->parentNode; |
696 | } | 696 | } |
697 | return false; | 697 | return false; |
698 | } | 698 | } |
699 | 699 | ||
700 | public function getContent() { | 700 | public function getContent() { |
701 | return $this->body; | 701 | return $this->body; |
702 | } | 702 | } |
703 | 703 | ||
704 | public function getTitle() { | 704 | public function getTitle() { |
705 | return $this->title; | 705 | return $this->title; |
706 | } | 706 | } |
707 | 707 | ||
708 | public function getAuthors() { | 708 | public function getAuthors() { |
709 | return $this->author; | 709 | return $this->author; |
710 | } | 710 | } |
711 | 711 | ||
712 | public function getLanguage() { | 712 | public function getLanguage() { |
713 | return $this->language; | 713 | return $this->language; |
714 | } | 714 | } |
715 | 715 | ||
716 | public function getDate() { | 716 | public function getDate() { |
717 | return $this->date; | 717 | return $this->date; |
718 | } | 718 | } |
719 | 719 | ||
720 | public function getSiteConfig() { | 720 | public function getSiteConfig() { |
721 | return $this->config; | 721 | return $this->config; |
722 | } | 722 | } |
723 | 723 | ||
724 | public function getNextPageUrl() { | 724 | public function getNextPageUrl() { |
725 | return $this->nextPageUrl; | 725 | return $this->nextPageUrl; |
726 | } | 726 | } |
727 | } | 727 | } \ No newline at end of file |
728 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php index c5e300d7..1f6a7603 100644 --- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php +++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php | |||
@@ -1,338 +1,343 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Site Config | 3 | * Site Config |
4 | * | 4 | * |
5 | * Each instance of this class should hold extraction patterns and other directives | 5 | * Each instance of this class should hold extraction patterns and other directives |
6 | * for a website. See ContentExtractor class to see how it's used. | 6 | * for a website. See ContentExtractor class to see how it's used. |
7 | * | 7 | * |
8 | * @version 0.7 | 8 | * @version 0.8 |
9 | * @date 2012-08-27 | 9 | * @date 2013-04-16 |
10 | * @author Keyvan Minoukadeh | 10 | * @author Keyvan Minoukadeh |
11 | * @copyright 2012 Keyvan Minoukadeh | 11 | * @copyright 2013 Keyvan Minoukadeh |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
13 | */ | 13 | */ |
14 | 14 | ||
15 | class SiteConfig | 15 | class SiteConfig |
16 | { | 16 | { |
17 | // Use first matching element as title (0 or more xpath expressions) | 17 | // Use first matching element as title (0 or more xpath expressions) |
18 | public $title = array(); | 18 | public $title = array(); |
19 | 19 | ||
20 | // Use first matching element as body (0 or more xpath expressions) | 20 | // Use first matching element as body (0 or more xpath expressions) |
21 | public $body = array(); | 21 | public $body = array(); |
22 | 22 | ||
23 | // Use first matching element as author (0 or more xpath expressions) | 23 | // Use first matching element as author (0 or more xpath expressions) |
24 | public $author = array(); | 24 | public $author = array(); |
25 | 25 | ||
26 | // Use first matching element as date (0 or more xpath expressions) | 26 | // Use first matching element as date (0 or more xpath expressions) |
27 | public $date = array(); | 27 | public $date = array(); |
28 | 28 | ||
29 | // Strip elements matching these xpath expressions (0 or more) | 29 | // Strip elements matching these xpath expressions (0 or more) |
30 | public $strip = array(); | 30 | public $strip = array(); |
31 | 31 | ||
32 | // Strip elements which contain these strings (0 or more) in the id or class attribute | 32 | // Strip elements which contain these strings (0 or more) in the id or class attribute |
33 | public $strip_id_or_class = array(); | 33 | public $strip_id_or_class = array(); |
34 | 34 | ||
35 | // Strip images which contain these strings (0 or more) in the src attribute | 35 | // Strip images which contain these strings (0 or more) in the src attribute |
36 | public $strip_image_src = array(); | 36 | public $strip_image_src = array(); |
37 | 37 | ||
38 | // Additional HTTP headers to send | 38 | // Additional HTTP headers to send |
39 | // NOT YET USED | 39 | // NOT YET USED |
40 | public $http_header = array(); | 40 | public $http_header = array(); |
41 | 41 | ||
42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) | 42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) |
43 | public $tidy = null; | 43 | public $tidy = null; |
44 | 44 | ||
45 | protected $default_tidy = true; // used if undeclared | 45 | protected $default_tidy = true; // used if undeclared |
46 | 46 | ||
47 | // Autodetect title/body if xpath expressions fail to produce results. | 47 | // Autodetect title/body if xpath expressions fail to produce results. |
48 | // Note that this applies to title and body separately, ie. | 48 | // Note that this applies to title and body separately, ie. |
49 | // * if we get a body match but no title match, this option will determine whether we autodetect title | 49 | // * if we get a body match but no title match, this option will determine whether we autodetect title |
50 | // * if neither match, this determines whether we autodetect title and body. | 50 | // * if neither match, this determines whether we autodetect title and body. |
51 | // Also note that this only applies when there is at least one xpath expression in title or body, ie. | 51 | // Also note that this only applies when there is at least one xpath expression in title or body, ie. |
52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) | 52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) |
53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. | 53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. |
54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). | 54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). |
55 | // bool or null if undeclared | 55 | // bool or null if undeclared |
56 | public $autodetect_on_failure = null; | 56 | public $autodetect_on_failure = null; |
57 | protected $default_autodetect_on_failure = true; // used if undeclared | 57 | protected $default_autodetect_on_failure = true; // used if undeclared |
58 | 58 | ||
59 | // Clean up content block - attempt to remove elements that appear to be superfluous | 59 | // Clean up content block - attempt to remove elements that appear to be superfluous |
60 | // bool or null if undeclared | 60 | // bool or null if undeclared |
61 | public $prune = null; | 61 | public $prune = null; |
62 | protected $default_prune = true; // used if undeclared | 62 | protected $default_prune = true; // used if undeclared |
63 | 63 | ||
64 | // Test URL - if present, can be used to test the config above | 64 | // Test URL - if present, can be used to test the config above |
65 | public $test_url = array(); | 65 | public $test_url = array(); |
66 | 66 | ||
67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article | 67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article |
68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to | 68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to |
69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page | 69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page |
70 | // which displays the entire article on one page (e.g. 'print view'). | 70 | // which displays the entire article on one page (e.g. 'print view'). |
71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, | 71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, |
72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. | 72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. |
73 | public $single_page_link = array(); | 73 | public $single_page_link = array(); |
74 | 74 | ||
75 | public $next_page_link = array(); | 75 | public $next_page_link = array(); |
76 | 76 | ||
77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed | 77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed |
78 | public $single_page_link_in_feed = array(); | 78 | public $single_page_link_in_feed = array(); |
79 | 79 | ||
80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') | 80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') |
81 | // string or null if undeclared | 81 | // string or null if undeclared |
82 | public $parser = null; | 82 | public $parser = null; |
83 | protected $default_parser = 'libxml'; // used if undeclared | 83 | protected $default_parser = 'libxml'; // used if undeclared |
84 | 84 | ||
85 | // Strings to search for in HTML before processing begins (used with $replace_string) | 85 | // Strings to search for in HTML before processing begins (used with $replace_string) |
86 | public $find_string = array(); | 86 | public $find_string = array(); |
87 | // Strings to replace those found in $find_string before HTML processing begins | 87 | // Strings to replace those found in $find_string before HTML processing begins |
88 | public $replace_string = array(); | 88 | public $replace_string = array(); |
89 | 89 | ||
90 | // the options below cannot be set in the config files which this class represents | 90 | // the options below cannot be set in the config files which this class represents |
91 | 91 | ||
92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not | 92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not |
93 | public $cache_key = null; | 93 | public $cache_key = null; |
94 | public static $debug = false; | 94 | public static $debug = false; |
95 | protected static $apc = false; | 95 | protected static $apc = false; |
96 | protected static $config_path; | 96 | protected static $config_path; |
97 | protected static $config_path_fallback; | 97 | protected static $config_path_fallback; |
98 | protected static $config_cache = array(); | 98 | protected static $config_cache = array(); |
99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; | 99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; |
100 | 100 | ||
101 | protected static function debug($msg) { | 101 | protected static function debug($msg) { |
102 | if (self::$debug) { | 102 | if (self::$debug) { |
103 | //$mem = round(memory_get_usage()/1024, 2); | 103 | //$mem = round(memory_get_usage()/1024, 2); |
104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); | 104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); |
105 | echo '* ',$msg; | 105 | echo '* ',$msg; |
106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; | 106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; |
107 | echo "\n"; | 107 | echo "\n"; |
108 | ob_flush(); | 108 | ob_flush(); |
109 | flush(); | 109 | flush(); |
110 | } | 110 | } |
111 | } | 111 | } |
112 | 112 | ||
113 | // enable APC caching of certain site config files? | 113 | // enable APC caching of certain site config files? |
114 | // If enabled the following site config files will be | 114 | // If enabled the following site config files will be |
115 | // cached in APC cache (when requested for first time): | 115 | // cached in APC cache (when requested for first time): |
116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ | 116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ |
117 | // * the site config files associated with HTML fingerprints | 117 | // * the site config files associated with HTML fingerprints |
118 | // * the global site config file | 118 | // * the global site config file |
119 | // returns true if enabled, false otherwise | 119 | // returns true if enabled, false otherwise |
120 | public static function use_apc($apc=true) { | 120 | public static function use_apc($apc=true) { |
121 | if (!function_exists('apc_add')) { | 121 | if (!function_exists('apc_add')) { |
122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); | 122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); |
123 | return false; | 123 | return false; |
124 | } | 124 | } |
125 | self::$apc = $apc; | 125 | self::$apc = $apc; |
126 | return $apc; | 126 | return $apc; |
127 | } | 127 | } |
128 | 128 | ||
129 | // return bool or null | 129 | // return bool or null |
130 | public function tidy($use_default=true) { | 130 | public function tidy($use_default=true) { |
131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; | 131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; |
132 | return $this->tidy; | 132 | return $this->tidy; |
133 | } | 133 | } |
134 | 134 | ||
135 | // return bool or null | 135 | // return bool or null |
136 | public function prune($use_default=true) { | 136 | public function prune($use_default=true) { |
137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; | 137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; |
138 | return $this->prune; | 138 | return $this->prune; |
139 | } | 139 | } |
140 | 140 | ||
141 | // return string or null | 141 | // return string or null |
142 | public function parser($use_default=true) { | 142 | public function parser($use_default=true) { |
143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; | 143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; |
144 | return $this->parser; | 144 | return $this->parser; |
145 | } | 145 | } |
146 | 146 | ||
147 | // return bool or null | 147 | // return bool or null |
148 | public function autodetect_on_failure($use_default=true) { | 148 | public function autodetect_on_failure($use_default=true) { |
149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; | 149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; |
150 | return $this->autodetect_on_failure; | 150 | return $this->autodetect_on_failure; |
151 | } | 151 | } |
152 | 152 | ||
153 | public static function set_config_path($path, $fallback=null) { | 153 | public static function set_config_path($path, $fallback=null) { |
154 | self::$config_path = $path; | 154 | self::$config_path = $path; |
155 | self::$config_path_fallback = $fallback; | 155 | self::$config_path_fallback = $fallback; |
156 | } | 156 | } |
157 | 157 | ||
158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { | 158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { |
159 | $key = strtolower($key); | 159 | $key = strtolower($key); |
160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
161 | if ($config->cache_key) $key = $config->cache_key; | 161 | if ($config->cache_key) $key = $config->cache_key; |
162 | self::$config_cache[$key] = $config; | 162 | self::$config_cache[$key] = $config; |
163 | if (self::$apc && $use_apc) { | 163 | if (self::$apc && $use_apc) { |
164 | self::debug("Adding site config to APC cache with key sc.$key"); | 164 | self::debug("Adding site config to APC cache with key sc.$key"); |
165 | apc_add("sc.$key", $config); | 165 | apc_add("sc.$key", $config); |
166 | } | 166 | } |
167 | self::debug("Cached site config with key $key"); | 167 | self::debug("Cached site config with key $key"); |
168 | } | 168 | } |
169 | 169 | ||
170 | public static function is_cached($key) { | 170 | public static function is_cached($key) { |
171 | $key = strtolower($key); | 171 | $key = strtolower($key); |
172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
173 | if (array_key_exists($key, self::$config_cache)) { | 173 | if (array_key_exists($key, self::$config_cache)) { |
174 | return true; | 174 | return true; |
175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { | 175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { |
176 | return true; | 176 | return true; |
177 | } | 177 | } |
178 | return false; | 178 | return false; |
179 | } | 179 | } |
180 | 180 | ||
181 | public function append(SiteConfig $newconfig) { | 181 | public function append(SiteConfig $newconfig) { |
182 | // check for commands where we accept multiple statements (no test_url) | 182 | // check for commands where we accept multiple statements (no test_url) |
183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { | 183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { |
184 | // append array elements for this config variable from $newconfig to this config | 184 | // append array elements for this config variable from $newconfig to this config |
185 | //$this->$var = $this->$var + $newconfig->$var; | 185 | //$this->$var = $this->$var + $newconfig->$var; |
186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); | 186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); |
187 | } | 187 | } |
188 | // check for single statement commands | 188 | // check for single statement commands |
189 | // we do not overwrite existing non null values | 189 | // we do not overwrite existing non null values |
190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { | 190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { |
191 | if ($this->$var === null) $this->$var = $newconfig->$var; | 191 | if ($this->$var === null) $this->$var = $newconfig->$var; |
192 | } | 192 | } |
193 | } | 193 | // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) |
194 | 194 | foreach (array('find_string', 'replace_string') as $var) { | |
195 | // returns SiteConfig instance if an appropriate one is found, false otherwise | 195 | // append array elements for this config variable from $newconfig to this config |
196 | // if $exact_host_match is true, we will not look for wildcard config matches | 196 | //$this->$var = $this->$var + $newconfig->$var; |
197 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists | 197 | $this->$var = array_merge($this->$var, $newconfig->$var); |
198 | public static function build($host, $exact_host_match=false) { | 198 | } |
199 | $host = strtolower($host); | 199 | } |
200 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); | 200 | |
201 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; | 201 | // returns SiteConfig instance if an appropriate one is found, false otherwise |
202 | // check for site configuration | 202 | // if $exact_host_match is true, we will not look for wildcard config matches |
203 | $try = array($host); | 203 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists |
204 | // should we look for wildcard matches | 204 | public static function build($host, $exact_host_match=false) { |
205 | if (!$exact_host_match) { | 205 | $host = strtolower($host); |
206 | $split = explode('.', $host); | 206 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); |
207 | if (count($split) > 1) { | 207 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; |
208 | array_shift($split); | 208 | // check for site configuration |
209 | $try[] = '.'.implode('.', $split); | 209 | $try = array($host); |
210 | } | 210 | // should we look for wildcard matches |
211 | } | 211 | if (!$exact_host_match) { |
212 | 212 | $split = explode('.', $host); | |
213 | // look for site config file in primary folder | 213 | if (count($split) > 1) { |
214 | self::debug(". looking for site config for $host in primary folder"); | 214 | array_shift($split); |
215 | foreach ($try as $h) { | 215 | $try[] = '.'.implode('.', $split); |
216 | if (array_key_exists($h, self::$config_cache)) { | 216 | } |
217 | self::debug("... site config for $h already loaded in this request"); | 217 | } |
218 | return self::$config_cache[$h]; | 218 | |
219 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { | 219 | // look for site config file in primary folder |
220 | self::debug("... site config for $h in APC cache"); | 220 | self::debug(". looking for site config for $host in primary folder"); |
221 | return $sconfig; | 221 | foreach ($try as $h) { |
222 | } elseif (file_exists(self::$config_path."/$h.txt")) { | 222 | if (array_key_exists($h, self::$config_cache)) { |
223 | self::debug("... found site config ($h.txt)"); | 223 | self::debug("... site config for $h already loaded in this request"); |
224 | $file_primary = self::$config_path."/$h.txt"; | 224 | return self::$config_cache[$h]; |
225 | $matched_name = $h; | 225 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { |
226 | break; | 226 | self::debug("... site config for $h in APC cache"); |
227 | } | 227 | return $sconfig; |
228 | } | 228 | } elseif (file_exists(self::$config_path."/$h.txt")) { |
229 | 229 | self::debug("... found site config ($h.txt)"); | |
230 | // if we found site config, process it | 230 | $file_primary = self::$config_path."/$h.txt"; |
231 | if (isset($file_primary)) { | 231 | $matched_name = $h; |
232 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 232 | break; |
233 | if (!$config_lines || !is_array($config_lines)) return false; | 233 | } |
234 | $config = self::build_from_array($config_lines); | 234 | } |
235 | // if APC caching is available and enabled, mark this for cache | 235 | |
236 | //$config->cache_in_apc = true; | 236 | // if we found site config, process it |
237 | $config->cache_key = $matched_name; | 237 | if (isset($file_primary)) { |
238 | 238 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | |
239 | // if autodetec on failure is off (on by default) we do not need to look | 239 | if (!$config_lines || !is_array($config_lines)) return false; |
240 | // in secondary folder | 240 | $config = self::build_from_array($config_lines); |
241 | if (!$config->autodetect_on_failure()) { | 241 | // if APC caching is available and enabled, mark this for cache |
242 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); | 242 | //$config->cache_in_apc = true; |
243 | return $config; | 243 | $config->cache_key = $matched_name; |
244 | } | 244 | |
245 | } | 245 | // if autodetec on failure is off (on by default) we do not need to look |
246 | 246 | // in secondary folder | |
247 | // look for site config file in secondary folder | 247 | if (!$config->autodetect_on_failure()) { |
248 | if (isset(self::$config_path_fallback)) { | 248 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); |
249 | self::debug(". looking for site config for $host in secondary folder"); | 249 | return $config; |
250 | foreach ($try as $h) { | 250 | } |
251 | if (file_exists(self::$config_path_fallback."/$h.txt")) { | 251 | } |
252 | self::debug("... found site config in secondary folder ($h.txt)"); | 252 | |
253 | $file_secondary = self::$config_path_fallback."/$h.txt"; | 253 | // look for site config file in secondary folder |
254 | $matched_name = $h; | 254 | if (isset(self::$config_path_fallback)) { |
255 | break; | 255 | self::debug(". looking for site config for $host in secondary folder"); |
256 | } | 256 | foreach ($try as $h) { |
257 | } | 257 | if (file_exists(self::$config_path_fallback."/$h.txt")) { |
258 | if (!isset($file_secondary)) { | 258 | self::debug("... found site config in secondary folder ($h.txt)"); |
259 | self::debug("... no site config match in secondary folder"); | 259 | $file_secondary = self::$config_path_fallback."/$h.txt"; |
260 | } | 260 | $matched_name = $h; |
261 | } | 261 | break; |
262 | 262 | } | |
263 | // return false if no config file found | 263 | } |
264 | if (!isset($file_primary) && !isset($file_secondary)) { | 264 | if (!isset($file_secondary)) { |
265 | self::debug("... no site config match for $host"); | 265 | self::debug("... no site config match in secondary folder"); |
266 | return false; | 266 | } |
267 | } | 267 | } |
268 | 268 | ||
269 | // return primary config if secondary not found | 269 | // return false if no config file found |
270 | if (!isset($file_secondary) && isset($config)) { | 270 | if (!isset($file_primary) && !isset($file_secondary)) { |
271 | return $config; | 271 | self::debug("... no site config match for $host"); |
272 | } | 272 | return false; |
273 | 273 | } | |
274 | // process secondary config file | 274 | |
275 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 275 | // return primary config if secondary not found |
276 | if (!$config_lines || !is_array($config_lines)) { | 276 | if (!isset($file_secondary) && isset($config)) { |
277 | // failed to process secondary | 277 | return $config; |
278 | if (isset($config)) { | 278 | } |
279 | // return primary config | 279 | |
280 | return $config; | 280 | // process secondary config file |
281 | } else { | 281 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
282 | return false; | 282 | if (!$config_lines || !is_array($config_lines)) { |
283 | } | 283 | // failed to process secondary |
284 | } | 284 | if (isset($config)) { |
285 | 285 | // return primary config | |
286 | // merge with primary and return | 286 | return $config; |
287 | if (isset($config)) { | 287 | } else { |
288 | self::debug('. merging config files'); | 288 | return false; |
289 | $config->append(self::build_from_array($config_lines)); | 289 | } |
290 | return $config; | 290 | } |
291 | } else { | 291 | |
292 | // return just secondary | 292 | // merge with primary and return |
293 | $config = self::build_from_array($config_lines); | 293 | if (isset($config)) { |
294 | // if APC caching is available and enabled, mark this for cache | 294 | self::debug('. merging config files'); |
295 | //$config->cache_in_apc = true; | 295 | $config->append(self::build_from_array($config_lines)); |
296 | $config->cache_key = $matched_name; | 296 | return $config; |
297 | return $config; | 297 | } else { |
298 | } | 298 | // return just secondary |
299 | } | 299 | $config = self::build_from_array($config_lines); |
300 | 300 | // if APC caching is available and enabled, mark this for cache | |
301 | public static function build_from_array(array $lines) { | 301 | //$config->cache_in_apc = true; |
302 | $config = new SiteConfig(); | 302 | $config->cache_key = $matched_name; |
303 | foreach ($lines as $line) { | 303 | return $config; |
304 | $line = trim($line); | 304 | } |
305 | 305 | } | |
306 | // skip comments, empty lines | 306 | |
307 | if ($line == '' || $line[0] == '#') continue; | 307 | public static function build_from_array(array $lines) { |
308 | 308 | $config = new SiteConfig(); | |
309 | // get command | 309 | foreach ($lines as $line) { |
310 | $command = explode(':', $line, 2); | 310 | $line = trim($line); |
311 | // if there's no colon ':', skip this line | 311 | |
312 | if (count($command) != 2) continue; | 312 | // skip comments, empty lines |
313 | $val = trim($command[1]); | 313 | if ($line == '' || $line[0] == '#') continue; |
314 | $command = trim($command[0]); | 314 | |
315 | if ($command == '' || $val == '') continue; | 315 | // get command |
316 | 316 | $command = explode(':', $line, 2); | |
317 | // check for commands where we accept multiple statements | 317 | // if there's no colon ':', skip this line |
318 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { | 318 | if (count($command) != 2) continue; |
319 | array_push($config->$command, $val); | 319 | $val = trim($command[1]); |
320 | // check for single statement commands that evaluate to true or false | 320 | $command = trim($command[0]); |
321 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { | 321 | if ($command == '' || $val == '') continue; |
322 | $config->$command = ($val == 'yes'); | 322 | |
323 | // check for single statement commands stored as strings | 323 | // check for commands where we accept multiple statements |
324 | } elseif (in_array($command, array('parser'))) { | 324 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { |
325 | $config->$command = $val; | 325 | array_push($config->$command, $val); |
326 | // check for replace_string(find): replace | 326 | // check for single statement commands that evaluate to true or false |
327 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { | 327 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { |
328 | if (in_array($match[1], array('replace_string'))) { | 328 | $config->$command = ($val == 'yes'); |
329 | $command = $match[1]; | 329 | // check for single statement commands stored as strings |
330 | array_push($config->find_string, $match[2]); | 330 | } elseif (in_array($command, array('parser'))) { |
331 | array_push($config->$command, $val); | 331 | $config->$command = $val; |
332 | } | 332 | // check for replace_string(find): replace |
333 | } | 333 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { |
334 | } | 334 | if (in_array($match[1], array('replace_string'))) { |
335 | return $config; | 335 | $command = $match[1]; |
336 | } | 336 | array_push($config->find_string, $match[2]); |
337 | } | 337 | array_push($config->$command, $val); |
338 | ?> \ No newline at end of file | 338 | } |
339 | } | ||
340 | } | ||
341 | return $config; | ||
342 | } | ||
343 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php index 3487423f..40786598 100644..100755 --- a/inc/3rdparty/libraries/feedwriter/FeedItem.php +++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php | |||
@@ -1,7 +1,7 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Univarsel Feed Writer | 3 | * Univarsel Feed Writer |
4 | * | 4 | * |
5 | * FeedItem class - Used as feed element in FeedWriter class | 5 | * FeedItem class - Used as feed element in FeedWriter class |
6 | * | 6 | * |
7 | * @package UnivarselFeedWriter | 7 | * @package UnivarselFeedWriter |
@@ -12,20 +12,20 @@ | |||
12 | { | 12 | { |
13 | private $elements = array(); //Collection of feed elements | 13 | private $elements = array(); //Collection of feed elements |
14 | private $version; | 14 | private $version; |
15 | 15 | ||
16 | /** | 16 | /** |
17 | * Constructor | 17 | * Constructor |
18 | * | 18 | * |
19 | * @param contant (RSS1/RSS2/ATOM) RSS2 is default. | 19 | * @param contant (RSS1/RSS2/ATOM) RSS2 is default. |
20 | */ | 20 | */ |
21 | function __construct($version = RSS2) | 21 | function __construct($version = RSS2) |
22 | { | 22 | { |
23 | $this->version = $version; | 23 | $this->version = $version; |
24 | } | 24 | } |
25 | 25 | ||
26 | /** | 26 | /** |
27 | * Set element (overwrites existing elements with $elementName) | 27 | * Set element (overwrites existing elements with $elementName) |
28 | * | 28 | * |
29 | * @access public | 29 | * @access public |
30 | * @param srting The tag name of an element | 30 | * @param srting The tag name of an element |
31 | * @param srting The content of tag | 31 | * @param srting The content of tag |
@@ -38,11 +38,11 @@ | |||
38 | unset($this->elements[$elementName]); | 38 | unset($this->elements[$elementName]); |
39 | } | 39 | } |
40 | $this->addElement($elementName, $content, $attributes); | 40 | $this->addElement($elementName, $content, $attributes); |
41 | } | 41 | } |
42 | 42 | ||
43 | /** | 43 | /** |
44 | * Add an element to elements array | 44 | * Add an element to elements array |
45 | * | 45 | * |
46 | * @access public | 46 | * @access public |
47 | * @param srting The tag name of an element | 47 | * @param srting The tag name of an element |
48 | * @param srting The content of tag | 48 | * @param srting The content of tag |
@@ -61,11 +61,11 @@ | |||
61 | $this->elements[$elementName][$i]['content'] = $content; | 61 | $this->elements[$elementName][$i]['content'] = $content; |
62 | $this->elements[$elementName][$i]['attributes'] = $attributes; | 62 | $this->elements[$elementName][$i]['attributes'] = $attributes; |
63 | } | 63 | } |
64 | 64 | ||
65 | /** | 65 | /** |
66 | * Set multiple feed elements from an array. | 66 | * Set multiple feed elements from an array. |
67 | * Elements which have attributes cannot be added by this method | 67 | * Elements which have attributes cannot be added by this method |
68 | * | 68 | * |
69 | * @access public | 69 | * @access public |
70 | * @param array array of elements in 'tagName' => 'tagContent' format. | 70 | * @param array array of elements in 'tagName' => 'tagContent' format. |
71 | * @return void | 71 | * @return void |
@@ -73,15 +73,15 @@ | |||
73 | public function addElementArray($elementArray) | 73 | public function addElementArray($elementArray) |
74 | { | 74 | { |
75 | if(! is_array($elementArray)) return; | 75 | if(! is_array($elementArray)) return; |
76 | foreach ($elementArray as $elementName => $content) | 76 | foreach ($elementArray as $elementName => $content) |
77 | { | 77 | { |
78 | $this->addElement($elementName, $content); | 78 | $this->addElement($elementName, $content); |
79 | } | 79 | } |
80 | } | 80 | } |
81 | 81 | ||
82 | /** | 82 | /** |
83 | * Return the collection of elements in this feed item | 83 | * Return the collection of elements in this feed item |
84 | * | 84 | * |
85 | * @access public | 85 | * @access public |
86 | * @return array | 86 | * @return array |
87 | */ | 87 | */ |
@@ -89,68 +89,74 @@ | |||
89 | { | 89 | { |
90 | return $this->elements; | 90 | return $this->elements; |
91 | } | 91 | } |
92 | 92 | ||
93 | // Wrapper functions ------------------------------------------------------ | 93 | // Wrapper functions ------------------------------------------------------ |
94 | 94 | ||
95 | /** | 95 | /** |
96 | * Set the 'dscription' element of feed item | 96 | * Set the 'dscription' element of feed item |
97 | * | 97 | * |
98 | * @access public | 98 | * @access public |
99 | * @param string The content of 'description' element | 99 | * @param string The content of 'description' element |
100 | * @return void | 100 | * @return void |
101 | */ | 101 | */ |
102 | public function setDescription($description) | 102 | public function setDescription($description) |
103 | { | 103 | { |
104 | $this->setElement('description', $description); | 104 | $tag = ($this->version == ATOM)? 'summary' : 'description'; |
105 | $this->setElement($tag, $description); | ||
105 | } | 106 | } |
106 | 107 | ||
107 | /** | 108 | /** |
108 | * @desc Set the 'title' element of feed item | 109 | * @desc Set the 'title' element of feed item |
109 | * @access public | 110 | * @access public |
110 | * @param string The content of 'title' element | 111 | * @param string The content of 'title' element |
111 | * @return void | 112 | * @return void |
112 | */ | 113 | */ |
113 | public function setTitle($title) | 114 | public function setTitle($title) |
114 | { | 115 | { |
115 | $this->setElement('title', $title); | 116 | $this->setElement('title', $title); |
116 | } | 117 | } |
117 | 118 | ||
118 | /** | 119 | /** |
119 | * Set the 'date' element of feed item | 120 | * Set the 'date' element of feed item |
120 | * | 121 | * |
121 | * @access public | 122 | * @access public |
122 | * @param string The content of 'date' element | 123 | * @param string The content of 'date' element |
123 | * @return void | 124 | * @return void |
124 | */ | 125 | */ |
125 | public function setDate($date) | 126 | public function setDate($date) |
126 | { | 127 | { |
127 | if(! is_numeric($date)) | 128 | if(! is_numeric($date)) |
128 | { | 129 | { |
129 | $date = strtotime($date); | 130 | $date = strtotime($date); |
130 | } | 131 | } |
131 | 132 | ||
132 | if($this->version == RSS2) | 133 | if($this->version == ATOM) |
134 | { | ||
135 | $tag = 'updated'; | ||
136 | $value = date(DATE_ATOM, $date); | ||
137 | } | ||
138 | elseif($this->version == RSS2) | ||
133 | { | 139 | { |
134 | $tag = 'pubDate'; | 140 | $tag = 'pubDate'; |
135 | $value = date(DATE_RSS, $date); | 141 | $value = date(DATE_RSS, $date); |
136 | } | 142 | } |
137 | else | 143 | else |
138 | { | 144 | { |
139 | $tag = 'dc:date'; | 145 | $tag = 'dc:date'; |
140 | $value = date("Y-m-d", $date); | 146 | $value = date("Y-m-d", $date); |
141 | } | 147 | } |
142 | 148 | ||
143 | $this->setElement($tag, $value); | 149 | $this->setElement($tag, $value); |
144 | } | 150 | } |
145 | 151 | ||
146 | /** | 152 | /** |
147 | * Set the 'link' element of feed item | 153 | * Set the 'link' element of feed item |
148 | * | 154 | * |
149 | * @access public | 155 | * @access public |
150 | * @param string The content of 'link' element | 156 | * @param string The content of 'link' element |
151 | * @return void | 157 | * @return void |
152 | */ | 158 | */ |
153 | public function setLink($link) | 159 | public function setLink($link) |
154 | { | 160 | { |
155 | if($this->version == RSS2 || $this->version == RSS1) | 161 | if($this->version == RSS2 || $this->version == RSS1) |
156 | { | 162 | { |
@@ -161,26 +167,27 @@ | |||
161 | { | 167 | { |
162 | $this->setElement('link','',array('href'=>$link)); | 168 | $this->setElement('link','',array('href'=>$link)); |
163 | $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); | 169 | $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:')); |
164 | } | 170 | } |
165 | 171 | ||
166 | } | 172 | } |
167 | 173 | ||
168 | /** | 174 | /** |
169 | * Set the 'source' element of feed item | 175 | * Set the 'source' element of feed item |
170 | * | 176 | * |
171 | * @access public | 177 | * @access public |
172 | * @param string The content of 'source' element | 178 | * @param string The content of 'source' element |
173 | * @return void | 179 | * @return void |
174 | */ | 180 | */ |
175 | public function setSource($link) | 181 | public function setSource($link) |
176 | { | 182 | { |
177 | $this->setElement('source', $link); | 183 | $attributes = array('url'=>$link); |
184 | $this->setElement('source', "wallabag",$attributes); | ||
178 | } | 185 | } |
179 | 186 | ||
180 | /** | 187 | /** |
181 | * Set the 'encloser' element of feed item | 188 | * Set the 'encloser' element of feed item |
182 | * For RSS 2.0 only | 189 | * For RSS 2.0 only |
183 | * | 190 | * |
184 | * @access public | 191 | * @access public |
185 | * @param string The url attribute of encloser tag | 192 | * @param string The url attribute of encloser tag |
186 | * @param string The length attribute of encloser tag | 193 | * @param string The length attribute of encloser tag |
@@ -192,6 +199,6 @@ | |||
192 | $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); | 199 | $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type); |
193 | $this->setElement('enclosure','',$attributes); | 200 | $this->setElement('enclosure','',$attributes); |
194 | } | 201 | } |
195 | 202 | ||
196 | } // end of class FeedItem | 203 | } // end of class FeedItem |
197 | ?> \ No newline at end of file | 204 | ?> \ No newline at end of file |
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index df4c8b4b..aa064afb 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php | |||
@@ -87,20 +87,26 @@ define('JSONP', 3, true); | |||
87 | * @access public | 87 | * @access public |
88 | * @return void | 88 | * @return void |
89 | */ | 89 | */ |
90 | public function genarateFeed() | 90 | public function genarateFeed($withHeaders = true) |
91 | { | 91 | { |
92 | if ($this->version == RSS2) { | 92 | if ($withHeaders) { |
93 | // header('Content-type: text/xml; charset=UTF-8'); | 93 | if ($this->version == RSS2) { |
94 | // this line prevents Chrome 20 from prompting download | 94 | header('Content-type: text/xml; charset=UTF-8'); |
95 | // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss | 95 | // this line prevents Chrome 20 from prompting download |
96 | // header('X-content-type-options: nosniff'); | 96 | // used by Google: https://news.google.com/news/feeds?ned=us&topic=b&output=rss |
97 | } elseif ($this->version == JSON) { | 97 | header('X-content-type-options: nosniff'); |
98 | // header('Content-type: application/json; charset=UTF-8'); | 98 | } elseif ($this->version == JSON) { |
99 | $this->json = new stdClass(); | 99 | header('Content-type: application/json; charset=UTF-8'); |
100 | } elseif ($this->version == JSONP) { | 100 | } elseif ($this->version == JSONP) { |
101 | // header('Content-type: application/javascript; charset=UTF-8'); | 101 | header('Content-type: application/javascript; charset=UTF-8'); |
102 | $this->json = new stdClass(); | 102 | } |
103 | } | 103 | } |
104 | |||
105 | if ($this->version == JSON || $this->version == JSONP) { | ||
106 | $this->json = new stdClass(); | ||
107 | } | ||
108 | |||
109 | |||
104 | $this->printHead(); | 110 | $this->printHead(); |
105 | $this->printChannels(); | 111 | $this->printChannels(); |
106 | $this->printItems(); | 112 | $this->printItems(); |
@@ -110,6 +116,11 @@ define('JSONP', 3, true); | |||
110 | } | 116 | } |
111 | } | 117 | } |
112 | 118 | ||
119 | public function &getItems() | ||
120 | { | ||
121 | return $this->items; | ||
122 | } | ||
123 | |||
113 | /** | 124 | /** |
114 | * Create a new FeedItem. | 125 | * Create a new FeedItem. |
115 | * | 126 | * |
@@ -193,7 +204,8 @@ define('JSONP', 3, true); | |||
193 | */ | 204 | */ |
194 | public function setDescription($description) | 205 | public function setDescription($description) |
195 | { | 206 | { |
196 | $this->setChannelElement('description', $description); | 207 | $tag = ($this->version == ATOM)? 'subtitle' : 'description'; |
208 | $this->setChannelElement($tag, $description); | ||
197 | } | 209 | } |
198 | 210 | ||
199 | /** | 211 | /** |
@@ -238,7 +250,7 @@ define('JSONP', 3, true); | |||
238 | { | 250 | { |
239 | $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; | 251 | $out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; |
240 | if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; | 252 | if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; |
241 | $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; | 253 | $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; |
242 | echo $out; | 254 | echo $out; |
243 | } | 255 | } |
244 | elseif ($this->version == JSON || $this->version == JSONP) | 256 | elseif ($this->version == JSON || $this->version == JSONP) |
diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php index 2f5244f9..c4a48b21 100644 --- a/inc/3rdparty/libraries/html5/TreeBuilder.php +++ b/inc/3rdparty/libraries/html5/TreeBuilder.php | |||
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder { | |||
134 | 134 | ||
135 | // Namespaces for foreign content | 135 | // Namespaces for foreign content |
136 | const NS_HTML = null; // to prevent DOM from requiring NS on everything | 136 | const NS_HTML = null; // to prevent DOM from requiring NS on everything |
137 | const NS_XHTML = 'http://www.w3.org/1999/xhtml'; | ||
137 | const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; | 138 | const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; |
138 | const NS_SVG = 'http://www.w3.org/2000/svg'; | 139 | const NS_SVG = 'http://www.w3.org/2000/svg'; |
139 | const NS_XLINK = 'http://www.w3.org/1999/xlink'; | 140 | const NS_XLINK = 'http://www.w3.org/1999/xlink'; |
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder { | |||
3157 | } | 3158 | } |
3158 | 3159 | ||
3159 | private function insertElement($token, $append = true) { | 3160 | private function insertElement($token, $append = true) { |
3160 | $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); | 3161 | //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']); |
3162 | $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML; | ||
3163 | $el = $this->dom->createElementNS($namespaceURI, $token['name']); | ||
3161 | 3164 | ||
3162 | if (!empty($token['attr'])) { | 3165 | if (!empty($token['attr'])) { |
3163 | foreach($token['attr'] as $attr) { | 3166 | foreach($token['attr'] as $attr) { |
3164 | if(!$el->hasAttribute($attr['name'])) { | 3167 | |
3168 | // mike@macgirvin.com 2011-11-17, check attribute name for | ||
3169 | // validity (ignoring extenders and combiners) as illegal chars in names | ||
3170 | // causes everything to abort | ||
3171 | |||
3172 | $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']); | ||
3173 | if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) { | ||
3165 | $el->setAttribute($attr['name'], $attr['value']); | 3174 | $el->setAttribute($attr['name'], $attr['value']); |
3166 | } | 3175 | } |
3167 | } | 3176 | } |
diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php index 83e94f14..e4d5f495 100644 --- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php +++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php | |||
@@ -1,404 +1,403 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Cookie Jar | 3 | * Cookie Jar |
4 | * | 4 | * |
5 | * PHP class for handling cookies, as defined by the Netscape spec: | 5 | * PHP class for handling cookies, as defined by the Netscape spec: |
6 | * <http://curl.haxx.se/rfc/cookie_spec.html> | 6 | * <http://curl.haxx.se/rfc/cookie_spec.html> |
7 | * | 7 | * |
8 | * This class should be used to handle cookies (storing cookies from HTTP response messages, and | 8 | * This class should be used to handle cookies (storing cookies from HTTP response messages, and |
9 | * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org | 9 | * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org |
10 | * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ | 10 | * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/ |
11 | * | 11 | * |
12 | * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ | 12 | * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/ |
13 | * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. | 13 | * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>. |
14 | * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. | 14 | * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965. |
15 | * | 15 | * |
16 | * @version 0.5 | 16 | * @version 0.5 |
17 | * @date 2011-03-15 | 17 | * @date 2011-03-15 |
18 | * @see http://php.net/HttpRequestPool | 18 | * @see http://php.net/HttpRequestPool |
19 | * @author Keyvan Minoukadeh | 19 | * @author Keyvan Minoukadeh |
20 | * @copyright 2011 Keyvan Minoukadeh | 20 | * @copyright 2011 Keyvan Minoukadeh |
21 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 21 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
22 | */ | 22 | */ |
23 | 23 | ||
24 | class CookieJar | 24 | class CookieJar |
25 | { | 25 | { |
26 | /** | 26 | /** |
27 | * Cookies - array containing all cookies. | 27 | * Cookies - array containing all cookies. |
28 | * | 28 | * |
29 | * <pre> | 29 | * <pre> |
30 | * Cookies are stored like this: | 30 | * Cookies are stored like this: |
31 | * [domain][path][name] = array | 31 | * [domain][path][name] = array |
32 | * where array is: | 32 | * where array is: |
33 | * 0 => value, 1 => secure, 2 => expires | 33 | * 0 => value, 1 => secure, 2 => expires |
34 | * </pre> | 34 | * </pre> |
35 | * @var array | 35 | * @var array |
36 | * @access private | 36 | * @access private |
37 | */ | 37 | */ |
38 | public $cookies = array(); | 38 | public $cookies = array(); |
39 | public $debug = false; | 39 | public $debug = false; |
40 | 40 | ||
41 | /** | 41 | /** |
42 | * Constructor | 42 | * Constructor |
43 | */ | 43 | */ |
44 | function __construct() { | 44 | function __construct() { |
45 | } | 45 | } |
46 | 46 | ||
47 | protected function debug($msg, $file=null, $line=null) { | 47 | protected function debug($msg, $file=null, $line=null) { |
48 | if ($this->debug) { | 48 | if ($this->debug) { |
49 | $mem = round(memory_get_usage()/1024, 2); | 49 | $mem = round(memory_get_usage()/1024, 2); |
50 | $memPeak = round(memory_get_peak_usage()/1024, 2); | 50 | $memPeak = round(memory_get_peak_usage()/1024, 2); |
51 | echo '* ',$msg; | 51 | echo '* ',$msg; |
52 | if (isset($file, $line)) echo " ($file line $line)"; | 52 | if (isset($file, $line)) echo " ($file line $line)"; |
53 | echo ' - mem used: ',$mem," (peak: $memPeak)\n"; | 53 | echo ' - mem used: ',$mem," (peak: $memPeak)\n"; |
54 | ob_flush(); | 54 | ob_flush(); |
55 | flush(); | 55 | flush(); |
56 | } | 56 | } |
57 | } | 57 | } |
58 | 58 | ||
59 | /** | 59 | /** |
60 | * Get matching cookies | 60 | * Get matching cookies |
61 | * | 61 | * |
62 | * Only use this method if you cannot use add_cookie_header(), for example, if you want to use | 62 | * Only use this method if you cannot use add_cookie_header(), for example, if you want to use |
63 | * this cookie jar class without using the request class. | 63 | * this cookie jar class without using the request class. |
64 | * | 64 | * |
65 | * @param array $param associative array containing 'domain', 'path', 'secure' keys | 65 | * @param array $param associative array containing 'domain', 'path', 'secure' keys |
66 | * @return string | 66 | * @return string |
67 | * @see add_cookie_header() | 67 | * @see add_cookie_header() |
68 | */ | 68 | */ |
69 | public function getMatchingCookies($url) | 69 | public function getMatchingCookies($url) |
70 | { | 70 | { |
71 | if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { | 71 | if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) { |
72 | $param['domain'] = $parts['host']; | 72 | $param['domain'] = $parts['host']; |
73 | $param['path'] = $parts['path']; | 73 | $param['path'] = $parts['path']; |
74 | $param['secure'] = (strtolower($parts['scheme']) == 'https'); | 74 | $param['secure'] = (strtolower($parts['scheme']) == 'https'); |
75 | unset($parts); | 75 | unset($parts); |
76 | } else { | 76 | } else { |
77 | return false; | 77 | return false; |
78 | } | 78 | } |
79 | // RFC 2965 notes: | 79 | // RFC 2965 notes: |
80 | // If multiple cookies satisfy the criteria above, they are ordered in | 80 | // If multiple cookies satisfy the criteria above, they are ordered in |
81 | // the Cookie header such that those with more specific Path attributes | 81 | // the Cookie header such that those with more specific Path attributes |
82 | // precede those with less specific. Ordering with respect to other | 82 | // precede those with less specific. Ordering with respect to other |
83 | // attributes (e.g., Domain) is unspecified. | 83 | // attributes (e.g., Domain) is unspecified. |
84 | $domain = $param['domain']; | 84 | $domain = $param['domain']; |
85 | if (strpos($domain, '.') === false) $domain .= '.local'; | 85 | if (strpos($domain, '.') === false) $domain .= '.local'; |
86 | $request_path = $param['path']; | 86 | $request_path = $param['path']; |
87 | if ($request_path == '') $request_path = '/'; | 87 | if ($request_path == '') $request_path = '/'; |
88 | $request_secure = $param['secure']; | 88 | $request_secure = $param['secure']; |
89 | $now = time(); | 89 | $now = time(); |
90 | $matched_cookies = array(); | 90 | $matched_cookies = array(); |
91 | // domain - find matching domains | 91 | // domain - find matching domains |
92 | $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); | 92 | $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__); |
93 | while (strpos($domain, '.') !== false) { | 93 | while (strpos($domain, '.') !== false) { |
94 | if (isset($this->cookies[$domain])) { | 94 | if (isset($this->cookies[$domain])) { |
95 | $this->debug(' domain match found: '.$domain); | 95 | $this->debug(' domain match found: '.$domain); |
96 | $cookies =& $this->cookies[$domain]; | 96 | $cookies =& $this->cookies[$domain]; |
97 | } else { | 97 | } else { |
98 | $domain = $this->_reduce_domain($domain); | 98 | $domain = $this->_reduce_domain($domain); |
99 | continue; | 99 | continue; |
100 | } | 100 | } |
101 | // paths - find matching paths starting from most specific | 101 | // paths - find matching paths starting from most specific |
102 | $this->debug(' - Finding matching paths for '.$request_path); | 102 | $this->debug(' - Finding matching paths for '.$request_path); |
103 | $paths = array_keys($cookies); | 103 | $paths = array_keys($cookies); |
104 | usort($paths, array($this, '_cmp_length')); | 104 | usort($paths, array($this, '_cmp_length')); |
105 | foreach ($paths as $path) { | 105 | foreach ($paths as $path) { |
106 | // continue to next cookie if request path does not path-match cookie path | 106 | // continue to next cookie if request path does not path-match cookie path |
107 | if (!$this->_path_match($request_path, $path)) continue; | 107 | if (!$this->_path_match($request_path, $path)) continue; |
108 | // loop through cookie names | 108 | // loop through cookie names |
109 | $this->debug(' path match found: '.$path); | 109 | $this->debug(' path match found: '.$path); |
110 | foreach ($cookies[$path] as $name => $values) { | 110 | foreach ($cookies[$path] as $name => $values) { |
111 | // if this cookie is secure but request isn't, continue to next cookie | 111 | // if this cookie is secure but request isn't, continue to next cookie |
112 | if ($values[1] && !$request_secure) continue; | 112 | if ($values[1] && !$request_secure) continue; |
113 | // if cookie is not a session cookie and has expired, continue to next cookie | 113 | // if cookie is not a session cookie and has expired, continue to next cookie |
114 | if (is_int($values[2]) && ($values[2] < $now)) continue; | 114 | if (is_int($values[2]) && ($values[2] < $now)) continue; |
115 | // cookie matches request | 115 | // cookie matches request |
116 | $this->debug(' cookie match: '.$name.'='.$values[0]); | 116 | $this->debug(' cookie match: '.$name.'='.$values[0]); |
117 | $matched_cookies[] = $name.'='.$values[0]; | 117 | $matched_cookies[] = $name.'='.$values[0]; |
118 | } | 118 | } |
119 | } | 119 | } |
120 | $domain = $this->_reduce_domain($domain); | 120 | $domain = $this->_reduce_domain($domain); |
121 | } | 121 | } |
122 | // return cookies | 122 | // return cookies |
123 | return implode('; ', $matched_cookies); | 123 | return implode('; ', $matched_cookies); |
124 | } | 124 | } |
125 | 125 | ||
126 | /** | 126 | /** |
127 | * Parse Set-Cookie values. | 127 | * Parse Set-Cookie values. |
128 | * | 128 | * |
129 | * Only use this method if you cannot use extract_cookies(), for example, if you want to use | 129 | * Only use this method if you cannot use extract_cookies(), for example, if you want to use |
130 | * this cookie jar class without using the response class. | 130 | * this cookie jar class without using the response class. |
131 | * | 131 | * |
132 | * @param array $set_cookies array holding 1 or more "Set-Cookie" header values | 132 | * @param array $set_cookies array holding 1 or more "Set-Cookie" header values |
133 | * @param array $param associative array containing 'host', 'path' keys | 133 | * @param array $param associative array containing 'host', 'path' keys |
134 | * @return void | 134 | * @return void |
135 | * @see extract_cookies() | 135 | * @see extract_cookies() |
136 | */ | 136 | */ |
137 | public function storeCookies($url, $set_cookies) | 137 | public function storeCookies($url, $set_cookies) |
138 | { | 138 | { |
139 | if (count($set_cookies) == 0) return; | 139 | if (count($set_cookies) == 0) return; |
140 | $param = @parse_url($url); | 140 | $param = @parse_url($url); |
141 | if (!is_array($param) || !isset($param['host'])) return; | 141 | if (!is_array($param) || !isset($param['host'])) return; |
142 | $request_host = $param['host']; | 142 | $request_host = $param['host']; |
143 | if (strpos($request_host, '.') === false) $request_host .= '.local'; | 143 | if (strpos($request_host, '.') === false) $request_host .= '.local'; |
144 | $request_path = @$param['path']; | 144 | $request_path = @$param['path']; |
145 | if ($request_path == '') $request_path = '/'; | 145 | if ($request_path == '') $request_path = '/'; |
146 | // | 146 | // |
147 | // loop through set-cookie headers | 147 | // loop through set-cookie headers |
148 | // | 148 | // |
149 | foreach ($set_cookies as $set_cookie) { | 149 | foreach ($set_cookies as $set_cookie) { |
150 | $this->debug('Parsing: '.$set_cookie); | 150 | $this->debug('Parsing: '.$set_cookie); |
151 | // temporary cookie store (before adding to jar) | 151 | // temporary cookie store (before adding to jar) |
152 | $tmp_cookie = array(); | 152 | $tmp_cookie = array(); |
153 | $param = explode(';', $set_cookie); | 153 | $param = explode(';', $set_cookie); |
154 | // loop through params | 154 | // loop through params |
155 | for ($x=0; $x<count($param); $x++) { | 155 | for ($x=0; $x<count($param); $x++) { |
156 | $key_val = explode('=', $param[$x], 2); | 156 | $key_val = explode('=', $param[$x], 2); |
157 | if (count($key_val) != 2) { | 157 | if (count($key_val) != 2) { |
158 | // if the first param isn't a name=value pair, continue to the next set-cookie | 158 | // if the first param isn't a name=value pair, continue to the next set-cookie |
159 | // header | 159 | // header |
160 | if ($x == 0) continue 2; | 160 | if ($x == 0) continue 2; |
161 | // check for secure flag | 161 | // check for secure flag |
162 | if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true; | 162 | if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true; |
163 | // continue to next param | 163 | // continue to next param |
164 | continue; | 164 | continue; |
165 | } | 165 | } |
166 | list($key, $val) = array_map('trim', $key_val); | 166 | list($key, $val) = array_map('trim', $key_val); |
167 | // first name=value pair is the cookie name and value | 167 | // first name=value pair is the cookie name and value |
168 | // the name and value are stored under 'name' and 'value' to avoid conflicts | 168 | // the name and value are stored under 'name' and 'value' to avoid conflicts |
169 | // with later parameters. | 169 | // with later parameters. |
170 | if ($x == 0) { | 170 | if ($x == 0) { |
171 | $tmp_cookie = array('name'=>$key, 'value'=>$val); | 171 | $tmp_cookie = array('name'=>$key, 'value'=>$val); |
172 | continue; | 172 | continue; |
173 | } | 173 | } |
174 | $key = strtolower($key); | 174 | $key = strtolower($key); |
175 | if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { | 175 | if (in_array($key, array('expires', 'path', 'domain', 'secure'))) { |
176 | $tmp_cookie[$key] = $val; | 176 | $tmp_cookie[$key] = $val; |
177 | } | 177 | } |
178 | } | 178 | } |
179 | // | 179 | // |
180 | // set cookie | 180 | // set cookie |
181 | // | 181 | // |
182 | // check domain | 182 | // check domain |
183 | if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && | 183 | if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) && |
184 | ($tmp_cookie['domain'] != ".$request_host")) { | 184 | ($tmp_cookie['domain'] != ".$request_host")) { |
185 | $domain = $tmp_cookie['domain']; | 185 | $domain = $tmp_cookie['domain']; |
186 | if ((strpos($domain, '.') === false) && ($domain != 'local')) { | 186 | if ((strpos($domain, '.') === false) && ($domain != 'local')) { |
187 | $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); | 187 | $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain'); |
188 | continue; | 188 | continue; |
189 | } | 189 | } |
190 | if (preg_match('/\.[0-9]+$/', $domain)) { | 190 | if (preg_match('/\.[0-9]+$/', $domain)) { |
191 | $this->debug(' - domain "'.$domain.'" appears to be an ip address'); | 191 | $this->debug(' - domain "'.$domain.'" appears to be an ip address'); |
192 | continue; | 192 | continue; |
193 | } | 193 | } |
194 | if (substr($domain, 0, 1) != '.') $domain = ".$domain"; | 194 | if (substr($domain, 0, 1) != '.') $domain = ".$domain"; |
195 | if (!$this->_domain_match($request_host, $domain)) { | 195 | if (!$this->_domain_match($request_host, $domain)) { |
196 | $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); | 196 | $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"'); |
197 | continue; | 197 | continue; |
198 | } | 198 | } |
199 | } else { | 199 | } else { |
200 | // if domain is not specified in the set-cookie header, domain will default to | 200 | // if domain is not specified in the set-cookie header, domain will default to |
201 | // the request host | 201 | // the request host |
202 | $domain = $request_host; | 202 | $domain = $request_host; |
203 | } | 203 | } |
204 | // check path | 204 | // check path |
205 | if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { | 205 | if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) { |
206 | $path = urldecode($tmp_cookie['path']); | 206 | $path = urldecode($tmp_cookie['path']); |
207 | if (!$this->_path_match($request_path, $path)) { | 207 | if (!$this->_path_match($request_path, $path)) { |
208 | $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); | 208 | $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"'); |
209 | continue; | 209 | continue; |
210 | } | 210 | } |
211 | } else { | 211 | } else { |
212 | $path = $request_path; | 212 | $path = $request_path; |
213 | $path = substr($path, 0, strrpos($path, '/')); | 213 | $path = substr($path, 0, strrpos($path, '/')); |
214 | if ($path == '') $path = '/'; | 214 | if ($path == '') $path = '/'; |
215 | } | 215 | } |
216 | // check if secure | 216 | // check if secure |
217 | $secure = (isset($tmp_cookie['secure'])) ? true : false; | 217 | $secure = (isset($tmp_cookie['secure'])) ? true : false; |
218 | // check expiry | 218 | // check expiry |
219 | if (isset($tmp_cookie['expires'])) { | 219 | if (isset($tmp_cookie['expires'])) { |
220 | if (($expires = strtotime($tmp_cookie['expires'])) < 0) { | 220 | if (($expires = strtotime($tmp_cookie['expires'])) < 0) { |
221 | $expires = null; | 221 | $expires = null; |
222 | } | 222 | } |
223 | } else { | 223 | } else { |
224 | $expires = null; | 224 | $expires = null; |
225 | } | 225 | } |
226 | // set cookie | 226 | // set cookie |
227 | $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); | 227 | $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires); |
228 | } | 228 | } |
229 | } | 229 | } |
230 | 230 | ||
231 | // return array of set-cookie values extracted from HTTP response headers (string $h) | 231 | // return array of set-cookie values extracted from HTTP response headers (string $h) |
232 | public function extractCookies($h) { | 232 | public function extractCookies($h) { |
233 | $x = 0; | 233 | $x = 0; |
234 | $lines = 0; | 234 | $lines = 0; |
235 | $headers = array(); | 235 | $headers = array(); |
236 | $last_match = false; | 236 | $last_match = false; |
237 | $h = explode("\n", $h); | 237 | $h = explode("\n", $h); |
238 | foreach ($h as $line) { | 238 | foreach ($h as $line) { |
239 | $line = rtrim($line); | 239 | $line = rtrim($line); |
240 | $lines++; | 240 | $lines++; |
241 | 241 | ||
242 | $trimmed_line = trim($line); | 242 | $trimmed_line = trim($line); |
243 | if (isset($line_last)) { | 243 | if (isset($line_last)) { |
244 | // check if we have \r\n\r\n (indicating the end of headers) | 244 | // check if we have \r\n\r\n (indicating the end of headers) |
245 | // some servers will not use CRLF (\r\n), so we make CR (\r) optional. | 245 | // some servers will not use CRLF (\r\n), so we make CR (\r) optional. |
246 | // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { | 246 | // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) { |
247 | // break; | 247 | // break; |
248 | // } | 248 | // } |
249 | // As an alternative, we can check if the current trimmed line is empty | 249 | // As an alternative, we can check if the current trimmed line is empty |
250 | if ($trimmed_line == '') { | 250 | if ($trimmed_line == '') { |
251 | break; | 251 | break; |
252 | } | 252 | } |
253 | 253 | ||
254 | // check for continuation line... | 254 | // check for continuation line... |
255 | // RFC 2616 Section 2.2 "Basic Rules": | 255 | // RFC 2616 Section 2.2 "Basic Rules": |
256 | // HTTP/1.1 header field values can be folded onto multiple lines if the | 256 | // HTTP/1.1 header field values can be folded onto multiple lines if the |
257 | // continuation line begins with a space or horizontal tab. All linear | 257 | // continuation line begins with a space or horizontal tab. All linear |
258 | // white space, including folding, has the same semantics as SP. A | 258 | // white space, including folding, has the same semantics as SP. A |
259 | // recipient MAY replace any linear white space with a single SP before | 259 | // recipient MAY replace any linear white space with a single SP before |
260 | // interpreting the field value or forwarding the message downstream. | 260 | // interpreting the field value or forwarding the message downstream. |
261 | if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { | 261 | if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) { |
262 | // append to previous header value | 262 | // append to previous header value |
263 | $headers[$x-1] .= ' '.rtrim($match[1]); | 263 | $headers[$x-1] .= ' '.rtrim($match[1]); |
264 | continue; | 264 | continue; |
265 | } | 265 | } |
266 | } | 266 | } |
267 | $line_last = $line; | 267 | $line_last = $line; |
268 | 268 | ||
269 | // split header name and value | 269 | // split header name and value |
270 | if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { | 270 | if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) { |
271 | $headers[$x++] = rtrim($match[1]); | 271 | $headers[$x++] = rtrim($match[1]); |
272 | $last_match = true; | 272 | $last_match = true; |
273 | } else { | 273 | } else { |
274 | $last_match = false; | 274 | $last_match = false; |
275 | } | 275 | } |
276 | } | 276 | } |
277 | return $headers; | 277 | return $headers; |
278 | } | 278 | } |
279 | 279 | ||
280 | /** | 280 | /** |
281 | * Set Cookie | 281 | * Set Cookie |
282 | * @param string $domain | 282 | * @param string $domain |
283 | * @param string $path | 283 | * @param string $path |
284 | * @param string $name cookie name | 284 | * @param string $name cookie name |
285 | * @param string $value cookie value | 285 | * @param string $value cookie value |
286 | * @param bool $secure | 286 | * @param bool $secure |
287 | * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) | 287 | * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie) |
288 | * @return void | 288 | * @return void |
289 | */ | 289 | */ |
290 | function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) | 290 | function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null) |
291 | { | 291 | { |
292 | if ($domain == '') return; | 292 | if ($domain == '') return; |
293 | if ($path == '') return; | 293 | if ($path == '') return; |
294 | if ($name == '') return; | 294 | if ($name == '') return; |
295 | // check if cookie needs to go | 295 | // check if cookie needs to go |
296 | if (isset($expires) && ($expires <= 0)) { | 296 | if (isset($expires) && ($expires <= 0)) { |
297 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); | 297 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); |
298 | return; | 298 | return; |
299 | } | 299 | } |
300 | if ($value == '') return; | 300 | if ($value == '') return; |
301 | $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); | 301 | $this->cookies[$domain][$path][$name] = array($value, $secure, $expires); |
302 | return; | 302 | return; |
303 | } | 303 | } |
304 | 304 | ||
305 | /** | 305 | /** |
306 | * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. | 306 | * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies. |
307 | * @param string $domain | 307 | * @param string $domain |
308 | * @param string $path | 308 | * @param string $path |
309 | * @param string $name | 309 | * @param string $name |
310 | * @return void | 310 | * @return void |
311 | */ | 311 | */ |
312 | function clear($domain=null, $path=null, $name=null) | 312 | function clear($domain=null, $path=null, $name=null) |
313 | { | 313 | { |
314 | if (!isset($domain)) { | 314 | if (!isset($domain)) { |
315 | $this->cookies = array(); | 315 | $this->cookies = array(); |
316 | } elseif (!isset($path)) { | 316 | } elseif (!isset($path)) { |
317 | if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); | 317 | if (isset($this->cookies[$domain])) unset($this->cookies[$domain]); |
318 | } elseif (!isset($name)) { | 318 | } elseif (!isset($name)) { |
319 | if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); | 319 | if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]); |
320 | } elseif (isset($name)) { | 320 | } elseif (isset($name)) { |
321 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); | 321 | if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]); |
322 | } | 322 | } |
323 | } | 323 | } |
324 | 324 | ||
325 | /** | 325 | /** |
326 | * Compare string length - used for sorting | 326 | * Compare string length - used for sorting |
327 | * @access private | 327 | * @access private |
328 | * @return int | 328 | * @return int |
329 | */ | 329 | */ |
330 | function _cmp_length($a, $b) | 330 | function _cmp_length($a, $b) |
331 | { | 331 | { |
332 | $la = strlen($a); $lb = strlen($b); | 332 | $la = strlen($a); $lb = strlen($b); |
333 | if ($la == $lb) return 0; | 333 | if ($la == $lb) return 0; |
334 | return ($la > $lb) ? -1 : 1; | 334 | return ($la > $lb) ? -1 : 1; |
335 | } | 335 | } |
336 | 336 | ||
337 | /** | 337 | /** |
338 | * Reduce domain | 338 | * Reduce domain |
339 | * @param string $domain | 339 | * @param string $domain |
340 | * @return string | 340 | * @return string |
341 | * @access private | 341 | * @access private |
342 | */ | 342 | */ |
343 | function _reduce_domain($domain) | 343 | function _reduce_domain($domain) |
344 | { | 344 | { |
345 | if ($domain == '') return ''; | 345 | if ($domain == '') return ''; |
346 | if (substr($domain, 0, 1) == '.') return substr($domain, 1); | 346 | if (substr($domain, 0, 1) == '.') return substr($domain, 1); |
347 | return substr($domain, strpos($domain, '.')); | 347 | return substr($domain, strpos($domain, '.')); |
348 | } | 348 | } |
349 | 349 | ||
350 | /** | 350 | /** |
351 | * Path match - check if path1 path-matches path2 | 351 | * Path match - check if path1 path-matches path2 |
352 | * | 352 | * |
353 | * From RFC 2965: | 353 | * From RFC 2965: |
354 | * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2 | 354 | * <i>For two strings that represent paths, P1 and P2, P1 path-matches P2 |
355 | * if P2 is a prefix of P1 (including the case where P1 and P2 string- | 355 | * if P2 is a prefix of P1 (including the case where P1 and P2 string- |
356 | * compare equal). Thus, the string /tec/waldo path-matches /tec.</i> | 356 | * compare equal). Thus, the string /tec/waldo path-matches /tec.</i> |
357 | * @param string $path1 | 357 | * @param string $path1 |
358 | * @param string $path2 | 358 | * @param string $path2 |
359 | * @return bool | 359 | * @return bool |
360 | * @access private | 360 | * @access private |
361 | */ | 361 | */ |
362 | function _path_match($path1, $path2) | 362 | function _path_match($path1, $path2) |
363 | { | 363 | { |
364 | return (substr($path1, 0, strlen($path2)) == $path2); | 364 | return (substr($path1, 0, strlen($path2)) == $path2); |
365 | } | 365 | } |
366 | 366 | ||
367 | /** | 367 | /** |
368 | * Domain match - check if domain1 domain-matches domain2 | 368 | * Domain match - check if domain1 domain-matches domain2 |
369 | * | 369 | * |
370 | * A few extracts from RFC 2965: | 370 | * A few extracts from RFC 2965: |
371 | * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com | 371 | * - A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com |
372 | * would be rejected, because H is y.x and contains a dot. | 372 | * would be rejected, because H is y.x and contains a dot. |
373 | * | 373 | * |
374 | * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com | 374 | * - A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com |
375 | * would be accepted. | 375 | * would be accepted. |
376 | * | 376 | * |
377 | * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be | 377 | * - A Set-Cookie2 with Domain=.com or Domain=.com., will always be |
378 | * rejected, because there is no embedded dot. | 378 | * rejected, because there is no embedded dot. |
379 | * | 379 | * |
380 | * - A Set-Cookie2 from request-host example for Domain=.local will | 380 | * - A Set-Cookie2 from request-host example for Domain=.local will |
381 | * be accepted, because the effective host name for the request- | 381 | * be accepted, because the effective host name for the request- |
382 | * host is example.local, and example.local domain-matches .local. | 382 | * host is example.local, and example.local domain-matches .local. |
383 | * | 383 | * |
384 | * I'm ignoring the first point for now (must check to see how other browsers handle | 384 | * I'm ignoring the first point for now (must check to see how other browsers handle |
385 | * this rule for Set-Cookie headers) | 385 | * this rule for Set-Cookie headers) |
386 | * | 386 | * |
387 | * @param string $domain1 | 387 | * @param string $domain1 |
388 | * @param string $domain2 | 388 | * @param string $domain2 |
389 | * @return bool | 389 | * @return bool |
390 | * @access private | 390 | * @access private |
391 | */ | 391 | */ |
392 | function _domain_match($domain1, $domain2) | 392 | function _domain_match($domain1, $domain2) |
393 | { | 393 | { |
394 | $domain1 = strtolower($domain1); | 394 | $domain1 = strtolower($domain1); |
395 | $domain2 = strtolower($domain2); | 395 | $domain2 = strtolower($domain2); |
396 | while (strpos($domain1, '.') !== false) { | 396 | while (strpos($domain1, '.') !== false) { |
397 | if ($domain1 == $domain2) return true; | 397 | if ($domain1 == $domain2) return true; |
398 | $domain1 = $this->_reduce_domain($domain1); | 398 | $domain1 = $this->_reduce_domain($domain1); |
399 | continue; | 399 | continue; |
400 | } | 400 | } |
401 | return false; | 401 | return false; |
402 | } | 402 | } |
403 | } | 403 | } \ No newline at end of file |
404 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php index e4f1b3b3..963f0c05 100644 --- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php | |||
@@ -1,779 +1,810 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Humble HTTP Agent | 3 | * Humble HTTP Agent |
4 | * | 4 | * |
5 | * This class is designed to take advantage of parallel HTTP requests | 5 | * This class is designed to take advantage of parallel HTTP requests |
6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. | 6 | * offered by PHP's PECL HTTP extension or the curl_multi_* functions. |
7 | * For environments which do not have these options, it reverts to standard sequential | 7 | * For environments which do not have these options, it reverts to standard sequential |
8 | * requests (using file_get_contents()) | 8 | * requests (using file_get_contents()) |
9 | * | 9 | * |
10 | * @version 1.1 | 10 | * @version 1.4 |
11 | * @date 2012-08-20 | 11 | * @date 2013-05-10 |
12 | * @see http://php.net/HttpRequestPool | 12 | * @see http://php.net/HttpRequestPool |
13 | * @author Keyvan Minoukadeh | 13 | * @author Keyvan Minoukadeh |
14 | * @copyright 2011-2012 Keyvan Minoukadeh | 14 | * @copyright 2011-2013 Keyvan Minoukadeh |
15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 15 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
16 | */ | 16 | */ |
17 | 17 | ||
18 | class HumbleHttpAgent | 18 | class HumbleHttpAgent |
19 | { | 19 | { |
20 | const METHOD_REQUEST_POOL = 1; | 20 | const METHOD_REQUEST_POOL = 1; |
21 | const METHOD_CURL_MULTI = 2; | 21 | const METHOD_CURL_MULTI = 2; |
22 | const METHOD_FILE_GET_CONTENTS = 4; | 22 | const METHOD_FILE_GET_CONTENTS = 4; |
23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; | 23 | //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; |
24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; | 24 | const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; |
25 | const UA_PHP = 'PHP/5.2'; | 25 | const UA_PHP = 'PHP/5.4'; |
26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; | 26 | const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; |
27 | 27 | ||
28 | protected $requests = array(); | 28 | protected $requests = array(); |
29 | protected $redirectQueue = array(); | 29 | protected $redirectQueue = array(); |
30 | protected $requestOptions; | 30 | protected $requestOptions; |
31 | protected $maxParallelRequests = 5; | 31 | protected $maxParallelRequests = 5; |
32 | protected $cache = null; //TODO | 32 | protected $cache = null; //TODO |
33 | protected $httpContext; | 33 | protected $httpContext; |
34 | protected $minimiseMemoryUse = false; //TODO | 34 | protected $minimiseMemoryUse = false; //TODO |
35 | protected $method; | 35 | protected $method; |
36 | protected $cookieJar; | 36 | protected $cookieJar; |
37 | public $debug = false; | 37 | public $debug = false; |
38 | public $debugVerbose = false; | 38 | public $debugVerbose = false; |
39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html | 39 | public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html |
40 | public $maxRedirects = 5; | 40 | public $maxRedirects = 5; |
41 | public $userAgentMap = array(); | 41 | public $userAgentMap = array(); |
42 | public $rewriteUrls = array(); | 42 | public $rewriteUrls = array(); |
43 | public $userAgentDefault; | 43 | public $userAgentDefault; |
44 | public $referer; | 44 | public $referer; |
45 | //public $userAgent = 'Mozilla/5.0'; | 45 | //public $userAgent = 'Mozilla/5.0'; |
46 | 46 | ||
47 | // Prevent certain file/mime types | 47 | // Prevent certain file/mime types |
48 | // HTTP responses which match these content types will | 48 | // HTTP responses which match these content types will |
49 | // be returned without body. | 49 | // be returned without body. |
50 | public $headerOnlyTypes = array(); | 50 | public $headerOnlyTypes = array(); |
51 | // URLs ending with one of these extensions will | 51 | // URLs ending with one of these extensions will |
52 | // prompt Humble HTTP Agent to send a HEAD request first | 52 | // prompt Humble HTTP Agent to send a HEAD request first |
53 | // to see if returned content type matches $headerOnlyTypes. | 53 | // to see if returned content type matches $headerOnlyTypes. |
54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); | 54 | public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov'); |
55 | // AJAX triggers to search for. | 55 | // AJAX triggers to search for. |
56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 56 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); | 57 | public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"'); |
58 | 58 | ||
59 | //TODO: set max file size | 59 | //TODO: set max file size |
60 | //TODO: normalise headers | 60 | //TODO: normalise headers |
61 | 61 | ||
62 | function __construct($requestOptions=null, $method=null) { | 62 | function __construct($requestOptions=null, $method=null) { |
63 | $this->userAgentDefault = self::UA_BROWSER; | 63 | $this->userAgentDefault = self::UA_BROWSER; |
64 | $this->referer = self::REF_GOOGLE; | 64 | $this->referer = self::REF_GOOGLE; |
65 | // set the request method | 65 | // set the request method |
66 | if (in_array($method, array(1,2,4))) { | 66 | if (in_array($method, array(1,2,4))) { |
67 | $this->method = $method; | 67 | $this->method = $method; |
68 | } else { | 68 | } else { |
69 | if (class_exists('HttpRequestPool')) { | 69 | if (class_exists('HttpRequestPool')) { |
70 | $this->method = self::METHOD_REQUEST_POOL; | 70 | $this->method = self::METHOD_REQUEST_POOL; |
71 | } elseif (function_exists('curl_multi_init')) { | 71 | } elseif (function_exists('curl_multi_init')) { |
72 | $this->method = self::METHOD_CURL_MULTI; | 72 | $this->method = self::METHOD_CURL_MULTI; |
73 | } else { | 73 | } else { |
74 | $this->method = self::METHOD_FILE_GET_CONTENTS; | 74 | $this->method = self::METHOD_FILE_GET_CONTENTS; |
75 | } | 75 | } |
76 | } | 76 | } |
77 | if ($this->method == self::METHOD_CURL_MULTI) { | 77 | if ($this->method == self::METHOD_CURL_MULTI) { |
78 | require_once(dirname(__FILE__).'/RollingCurl.php'); | 78 | require_once(dirname(__FILE__).'/RollingCurl.php'); |
79 | } | 79 | } |
80 | // create cookie jar | 80 | // create cookie jar |
81 | $this->cookieJar = new CookieJar(); | 81 | $this->cookieJar = new CookieJar(); |
82 | // set request options (redirect must be 0) | 82 | // set request options (redirect must be 0) |
83 | $this->requestOptions = array( | 83 | $this->requestOptions = array( |
84 | 'timeout' => 15, | 84 | 'timeout' => 15, |
85 | 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web | 85 | 'connecttimeout' => 15, |
86 | // TODO: test onprogress? | 86 | 'dns_cache_timeout' => 300, |
87 | ); | 87 | 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web |
88 | if (is_array($requestOptions)) { | 88 | // TODO: test onprogress? |
89 | $this->requestOptions = array_merge($this->requestOptions, $requestOptions); | 89 | ); |
90 | } | 90 | if (is_array($requestOptions)) { |
91 | $this->httpContext = array( | 91 | $this->requestOptions = array_merge($this->requestOptions, $requestOptions); |
92 | 'http' => array( | 92 | } |
93 | 'ignore_errors' => true, | 93 | $this->httpContext = array( |
94 | 'timeout' => $this->requestOptions['timeout'], | 94 | 'http' => array( |
95 | 'max_redirects' => $this->requestOptions['redirect'], | 95 | 'ignore_errors' => true, |
96 | 'header' => "Accept: */*\r\n" | 96 | 'timeout' => $this->requestOptions['timeout'], |
97 | ) | 97 | 'max_redirects' => $this->requestOptions['redirect'], |
98 | ); | 98 | 'header' => "Accept: */*\r\n" |
99 | } | 99 | ) |
100 | 100 | ); | |
101 | protected function debug($msg) { | 101 | } |
102 | if ($this->debug) { | 102 | |
103 | $mem = round(memory_get_usage()/1024, 2); | 103 | protected function debug($msg) { |
104 | $memPeak = round(memory_get_peak_usage()/1024, 2); | 104 | if ($this->debug) { |
105 | echo '* ',$msg; | 105 | $mem = round(memory_get_usage()/1024, 2); |
106 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; | 106 | $memPeak = round(memory_get_peak_usage()/1024, 2); |
107 | echo "\n"; | 107 | echo '* ',$msg; |
108 | ob_flush(); | 108 | if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)"; |
109 | flush(); | 109 | echo "\n"; |
110 | } | 110 | ob_flush(); |
111 | } | 111 | flush(); |
112 | 112 | } | |
113 | protected function getUserAgent($url, $asArray=false) { | 113 | } |
114 | $host = @parse_url($url, PHP_URL_HOST); | 114 | |
115 | if (strtolower(substr($host, 0, 4)) == 'www.') { | 115 | protected function getUserAgent($url, $asArray=false) { |
116 | $host = substr($host, 4); | 116 | $host = @parse_url($url, PHP_URL_HOST); |
117 | } | 117 | if (strtolower(substr($host, 0, 4)) == 'www.') { |
118 | if ($host) { | 118 | $host = substr($host, 4); |
119 | $try = array($host); | 119 | } |
120 | $split = explode('.', $host); | 120 | if ($host) { |
121 | if (count($split) > 1) { | 121 | $try = array($host); |
122 | array_shift($split); | 122 | $split = explode('.', $host); |
123 | $try[] = '.'.implode('.', $split); | 123 | if (count($split) > 1) { |
124 | } | 124 | array_shift($split); |
125 | foreach ($try as $h) { | 125 | $try[] = '.'.implode('.', $split); |
126 | if (isset($this->userAgentMap[$h])) { | 126 | } |
127 | $ua = $this->userAgentMap[$h]; | 127 | foreach ($try as $h) { |
128 | break; | 128 | if (isset($this->userAgentMap[$h])) { |
129 | } | 129 | $ua = $this->userAgentMap[$h]; |
130 | } | 130 | break; |
131 | } | 131 | } |
132 | if (!isset($ua)) $ua = $this->userAgentDefault; | 132 | } |
133 | if ($asArray) { | 133 | } |
134 | return array('User-Agent' => $ua); | 134 | if (!isset($ua)) $ua = $this->userAgentDefault; |
135 | } else { | 135 | if ($asArray) { |
136 | return 'User-Agent: '.$ua; | 136 | return array('User-Agent' => $ua); |
137 | } | 137 | } else { |
138 | } | 138 | return 'User-Agent: '.$ua; |
139 | 139 | } | |
140 | public function rewriteHashbangFragment($url) { | 140 | } |
141 | // return $url if there's no '#!' | 141 | |
142 | if (strpos($url, '#!') === false) return $url; | 142 | public function rewriteHashbangFragment($url) { |
143 | // split $url and rewrite | 143 | // return $url if there's no '#!' |
144 | // TODO: is SimplePie_IRI included? | 144 | if (strpos($url, '#!') === false) return $url; |
145 | $iri = new SimplePie_IRI($url); | 145 | // split $url and rewrite |
146 | $fragment = substr($iri->fragment, 1); // strip '!' | 146 | // TODO: is SimplePie_IRI included? |
147 | $iri->fragment = null; | 147 | $iri = new SimplePie_IRI($url); |
148 | if (isset($iri->query)) { | 148 | $fragment = substr($iri->fragment, 1); // strip '!' |
149 | parse_str($iri->query, $query); | 149 | $iri->fragment = null; |
150 | } else { | 150 | if (isset($iri->query)) { |
151 | $query = array(); | 151 | parse_str($iri->query, $query); |
152 | } | 152 | } else { |
153 | $query['_escaped_fragment_'] = (string)$fragment; | 153 | $query = array(); |
154 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites | 154 | } |
155 | return $iri->get_iri(); | 155 | $query['_escaped_fragment_'] = (string)$fragment; |
156 | } | 156 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites |
157 | 157 | return $iri->get_iri(); | |
158 | public function getUglyURL($url, $html) { | 158 | } |
159 | if ($html == '') return false; | 159 | |
160 | $found = false; | 160 | public function getRedirectURLfromHTML($url, $html) { |
161 | foreach ($this->ajaxTriggers as $string) { | 161 | $redirect_url = $this->getMetaRefreshURL($url, $html); |
162 | if (stripos($html, $string)) { | 162 | if (!$redirect_url) { |
163 | $found = true; | 163 | $redirect_url = $this->getUglyURL($url, $html); |
164 | break; | 164 | } |
165 | } | 165 | return $redirect_url; |
166 | } | 166 | } |
167 | if (!$found) return false; | 167 | |
168 | $iri = new SimplePie_IRI($url); | 168 | public function getMetaRefreshURL($url, $html) { |
169 | if (isset($iri->query)) { | 169 | if ($html == '') return false; |
170 | parse_str($iri->query, $query); | 170 | // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513"> |
171 | } else { | 171 | if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) { |
172 | $query = array(); | 172 | return false; |
173 | } | 173 | } |
174 | $query['_escaped_fragment_'] = ''; | 174 | $redirect_url = $match[1]; |
175 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites | 175 | if (preg_match('!^https?://!i', $redirect_url)) { |
176 | return $iri->get_iri(); | 176 | // already absolute |
177 | } | 177 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); |
178 | 178 | return $redirect_url; | |
179 | public function removeFragment($url) { | 179 | } |
180 | $pos = strpos($url, '#'); | 180 | // absolutize redirect URL |
181 | if ($pos === false) { | 181 | $base = new SimplePie_IRI($url); |
182 | return $url; | 182 | // remove '//' in URL path (causes URLs not to resolve properly) |
183 | } else { | 183 | if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); |
184 | return substr($url, 0, $pos); | 184 | if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { |
185 | } | 185 | $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); |
186 | } | 186 | return $absolute; |
187 | 187 | } | |
188 | public function rewriteUrls($url) { | 188 | return false; |
189 | foreach ($this->rewriteUrls as $find => $action) { | 189 | } |
190 | if (strpos($url, $find) !== false) { | 190 | |
191 | if (is_array($action)) { | 191 | public function getUglyURL($url, $html) { |
192 | return strtr($url, $action); | 192 | if ($html == '') return false; |
193 | } | 193 | $found = false; |
194 | } | 194 | foreach ($this->ajaxTriggers as $string) { |
195 | } | 195 | if (stripos($html, $string)) { |
196 | return $url; | 196 | $found = true; |
197 | } | 197 | break; |
198 | 198 | } | |
199 | public function enableDebug($bool=true) { | 199 | } |
200 | $this->debug = (bool)$bool; | 200 | if (!$found) return false; |
201 | } | 201 | $iri = new SimplePie_IRI($url); |
202 | 202 | if (isset($iri->query)) { | |
203 | public function minimiseMemoryUse($bool = true) { | 203 | parse_str($iri->query, $query); |
204 | $this->minimiseMemoryUse = $bool; | 204 | } else { |
205 | } | 205 | $query = array(); |
206 | 206 | } | |
207 | public function setMaxParallelRequests($max) { | 207 | $query['_escaped_fragment_'] = ''; |
208 | $this->maxParallelRequests = $max; | 208 | $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites |
209 | } | 209 | $ugly_url = $iri->get_iri(); |
210 | 210 | $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url); | |
211 | public function validateUrl($url) { | 211 | return $ugly_url; |
212 | $url = filter_var($url, FILTER_SANITIZE_URL); | 212 | } |
213 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 213 | |
214 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) | 214 | public function removeFragment($url) { |
215 | if ($test === false) { | 215 | $pos = strpos($url, '#'); |
216 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); | 216 | if ($pos === false) { |
217 | } | 217 | return $url; |
218 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { | 218 | } else { |
219 | return $url; | 219 | return substr($url, 0, $pos); |
220 | } else { | 220 | } |
221 | return false; | 221 | } |
222 | } | 222 | |
223 | } | 223 | public function rewriteUrls($url) { |
224 | 224 | foreach ($this->rewriteUrls as $find => $action) { | |
225 | public function fetchAll(array $urls) { | 225 | if (strpos($url, $find) !== false) { |
226 | $this->fetchAllOnce($urls, $isRedirect=false); | 226 | if (is_array($action)) { |
227 | $redirects = 0; | 227 | return strtr($url, $action); |
228 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { | 228 | } |
229 | $this->debug("Following redirects #$redirects..."); | 229 | } |
230 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); | 230 | } |
231 | } | 231 | return $url; |
232 | } | 232 | } |
233 | 233 | ||
234 | // fetch all URLs without following redirects | 234 | public function enableDebug($bool=true) { |
235 | public function fetchAllOnce(array $urls, $isRedirect=false) { | 235 | $this->debug = (bool)$bool; |
236 | if (!$isRedirect) $urls = array_unique($urls); | 236 | } |
237 | if (empty($urls)) return; | 237 | |
238 | 238 | public function minimiseMemoryUse($bool = true) { | |
239 | ////////////////////////////////////////////////////// | 239 | $this->minimiseMemoryUse = $bool; |
240 | // parallel (HttpRequestPool) | 240 | } |
241 | if ($this->method == self::METHOD_REQUEST_POOL) { | 241 | |
242 | $this->debug('Starting parallel fetch (HttpRequestPool)'); | 242 | public function setMaxParallelRequests($max) { |
243 | try { | 243 | $this->maxParallelRequests = $max; |
244 | while (count($urls) > 0) { | 244 | } |
245 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 245 | |
246 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 246 | public function validateUrl($url) { |
247 | $pool = new HttpRequestPool(); | 247 | $url = filter_var($url, FILTER_SANITIZE_URL); |
248 | foreach ($subset as $orig => $url) { | 248 | $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
249 | if (!$isRedirect) $orig = $url; | 249 | // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) |
250 | unset($this->redirectQueue[$orig]); | 250 | if ($test === false) { |
251 | $this->debug("...$url"); | 251 | $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); |
252 | if (!$isRedirect && isset($this->requests[$url])) { | 252 | } |
253 | $this->debug("......in memory"); | 253 | if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { |
254 | /* | 254 | return $url; |
255 | } elseif ($this->isCached($url)) { | 255 | } else { |
256 | $this->debug("......is cached"); | 256 | return false; |
257 | if (!$this->minimiseMemoryUse) { | 257 | } |
258 | $this->requests[$url] = $this->getCached($url); | 258 | } |
259 | } | 259 | |
260 | */ | 260 | public function fetchAll(array $urls) { |
261 | } else { | 261 | $this->fetchAllOnce($urls, $isRedirect=false); |
262 | $this->debug("......adding to pool"); | 262 | $redirects = 0; |
263 | $req_url = $this->rewriteUrls($url); | 263 | while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { |
264 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 264 | $this->debug("Following redirects #$redirects..."); |
265 | $req_url = $this->removeFragment($req_url); | 265 | $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); |
266 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 266 | } |
267 | $_meth = HttpRequest::METH_HEAD; | 267 | } |
268 | } else { | 268 | |
269 | $_meth = HttpRequest::METH_GET; | 269 | // fetch all URLs without following redirects |
270 | unset($this->requests[$orig]['wrongGuess']); | 270 | public function fetchAllOnce(array $urls, $isRedirect=false) { |
271 | } | 271 | if (!$isRedirect) $urls = array_unique($urls); |
272 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); | 272 | if (empty($urls)) return; |
273 | // send cookies, if we have any | 273 | |
274 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 274 | ////////////////////////////////////////////////////// |
275 | $this->debug("......sending cookies: $cookies"); | 275 | // parallel (HttpRequestPool) |
276 | $httpRequest->addHeaders(array('Cookie' => $cookies)); | 276 | if ($this->method == self::METHOD_REQUEST_POOL) { |
277 | } | 277 | $this->debug('Starting parallel fetch (HttpRequestPool)'); |
278 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); | 278 | try { |
279 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); | 279 | while (count($urls) > 0) { |
280 | // add referer for picky sites | 280 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
281 | $httpRequest->addheaders(array('Referer' => $this->referer)); | 281 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
282 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 282 | $pool = new HttpRequestPool(); |
283 | $this->requests[$orig]['original_url'] = $orig; | 283 | foreach ($subset as $orig => $url) { |
284 | $pool->attach($httpRequest); | 284 | if (!$isRedirect) $orig = $url; |
285 | } | 285 | unset($this->redirectQueue[$orig]); |
286 | } | 286 | $this->debug("...$url"); |
287 | // did we get anything into the pool? | 287 | if (!$isRedirect && isset($this->requests[$url])) { |
288 | if (count($pool) > 0) { | 288 | $this->debug("......in memory"); |
289 | $this->debug('Sending request...'); | 289 | /* |
290 | try { | 290 | } elseif ($this->isCached($url)) { |
291 | $pool->send(); | 291 | $this->debug("......is cached"); |
292 | } catch (HttpRequestPoolException $e) { | 292 | if (!$this->minimiseMemoryUse) { |
293 | // do nothing | 293 | $this->requests[$url] = $this->getCached($url); |
294 | } | 294 | } |
295 | $this->debug('Received responses'); | 295 | */ |
296 | foreach($subset as $orig => $url) { | 296 | } else { |
297 | if (!$isRedirect) $orig = $url; | 297 | $this->debug("......adding to pool"); |
298 | $request = $this->requests[$orig]['httpRequest']; | 298 | $req_url = $this->rewriteUrls($url); |
299 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); | 299 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
300 | // getResponseHeader() doesn't return status line, so, for consistency... | 300 | $req_url = $this->removeFragment($req_url); |
301 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); | 301 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
302 | // check content type | 302 | $_meth = HttpRequest::METH_HEAD; |
303 | // TODO: use getResponseHeader('content-type') or getResponseInfo() | 303 | } else { |
304 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 304 | $_meth = HttpRequest::METH_GET; |
305 | $this->requests[$orig]['body'] = ''; | 305 | unset($this->requests[$orig]['wrongGuess']); |
306 | $_header_only_type = true; | 306 | } |
307 | $this->debug('Header only type returned'); | 307 | $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); |
308 | } else { | 308 | // send cookies, if we have any |
309 | $this->requests[$orig]['body'] = $request->getResponseBody(); | 309 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
310 | $_header_only_type = false; | 310 | $this->debug("......sending cookies: $cookies"); |
311 | } | 311 | $httpRequest->addHeaders(array('Cookie' => $cookies)); |
312 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); | 312 | } |
313 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); | 313 | //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); |
314 | // is redirect? | 314 | $httpRequest->addHeaders($this->getUserAgent($req_url, true)); |
315 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { | 315 | // add referer for picky sites |
316 | $redirectURL = $request->getResponseHeader('location'); | 316 | $httpRequest->addheaders(array('Referer' => $this->referer)); |
317 | if (!preg_match('!^https?://!i', $redirectURL)) { | 317 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
318 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 318 | $this->requests[$orig]['original_url'] = $orig; |
319 | } | 319 | $pool->attach($httpRequest); |
320 | if ($this->validateURL($redirectURL)) { | 320 | } |
321 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 321 | } |
322 | // store any cookies | 322 | // did we get anything into the pool? |
323 | $cookies = $request->getResponseHeader('set-cookie'); | 323 | if (count($pool) > 0) { |
324 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); | 324 | $this->debug('Sending request...'); |
325 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); | 325 | try { |
326 | $this->redirectQueue[$orig] = $redirectURL; | 326 | $pool->send(); |
327 | } else { | 327 | } catch (HttpRequestPoolException $e) { |
328 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 328 | // do nothing |
329 | } | 329 | } |
330 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { | 330 | $this->debug('Received responses'); |
331 | // the response content-type did not match our 'header only' types, | 331 | foreach($subset as $orig => $url) { |
332 | // but we'd issues a HEAD request because we assumed it would. So | 332 | if (!$isRedirect) $orig = $url; |
333 | // let's queue a proper GET request for this item... | 333 | $request = $this->requests[$orig]['httpRequest']; |
334 | $this->debug('Wrong guess at content-type, queing GET request'); | 334 | //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); |
335 | $this->requests[$orig]['wrongGuess'] = true; | 335 | // getResponseHeader() doesn't return status line, so, for consistency... |
336 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 336 | $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); |
337 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 337 | // check content type |
338 | // check for <meta name='fragment' content='!'/> | 338 | // TODO: use getResponseHeader('content-type') or getResponseInfo() |
339 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 339 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
340 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 340 | $this->requests[$orig]['body'] = ''; |
341 | if (isset($this->requests[$orig]['body'])) { | 341 | $_header_only_type = true; |
342 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 342 | $this->debug('Header only type returned'); |
343 | if ($redirectURL) { | 343 | } else { |
344 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 344 | $this->requests[$orig]['body'] = $request->getResponseBody(); |
345 | $this->redirectQueue[$orig] = $redirectURL; | 345 | $_header_only_type = false; |
346 | } | 346 | } |
347 | } | 347 | $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); |
348 | } | 348 | $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); |
349 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 349 | // is redirect? |
350 | $pool->detach($request); | 350 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { |
351 | unset($this->requests[$orig]['httpRequest'], $request); | 351 | $redirectURL = $request->getResponseHeader('location'); |
352 | /* | 352 | if (!preg_match('!^https?://!i', $redirectURL)) { |
353 | if ($this->minimiseMemoryUse) { | 353 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
354 | if ($this->cache($url)) { | 354 | } |
355 | unset($this->requests[$url]); | 355 | if ($this->validateURL($redirectURL)) { |
356 | } | 356 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
357 | } | 357 | // store any cookies |
358 | */ | 358 | $cookies = $request->getResponseHeader('set-cookie'); |
359 | } | 359 | if ($cookies && !is_array($cookies)) $cookies = array($cookies); |
360 | } | 360 | if ($cookies) $this->cookieJar->storeCookies($url, $cookies); |
361 | } | 361 | $this->redirectQueue[$orig] = $redirectURL; |
362 | } catch (HttpException $e) { | 362 | } else { |
363 | $this->debug($e); | 363 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
364 | return false; | 364 | } |
365 | } | 365 | } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { |
366 | } | 366 | // the response content-type did not match our 'header only' types, |
367 | 367 | // but we'd issues a HEAD request because we assumed it would. So | |
368 | ////////////////////////////////////////////////////////// | 368 | // let's queue a proper GET request for this item... |
369 | // parallel (curl_multi_*) | 369 | $this->debug('Wrong guess at content-type, queing GET request'); |
370 | elseif ($this->method == self::METHOD_CURL_MULTI) { | 370 | $this->requests[$orig]['wrongGuess'] = true; |
371 | $this->debug('Starting parallel fetch (curl_multi_*)'); | 371 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
372 | while (count($urls) > 0) { | 372 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
373 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); | 373 | // check for <meta name='fragment' content='!'/> |
374 | $subset = array_splice($urls, 0, $this->maxParallelRequests); | 374 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
375 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); | 375 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
376 | $pool->window_size = count($subset); | 376 | if (isset($this->requests[$orig]['body'])) { |
377 | 377 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | |
378 | foreach ($subset as $orig => $url) { | 378 | if ($redirectURL) { |
379 | if (!$isRedirect) $orig = $url; | 379 | $this->redirectQueue[$orig] = $redirectURL; |
380 | unset($this->redirectQueue[$orig]); | 380 | } |
381 | $this->debug("...$url"); | 381 | } |
382 | if (!$isRedirect && isset($this->requests[$url])) { | 382 | } |
383 | $this->debug("......in memory"); | 383 | //die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
384 | /* | 384 | $pool->detach($request); |
385 | } elseif ($this->isCached($url)) { | 385 | unset($this->requests[$orig]['httpRequest'], $request); |
386 | $this->debug("......is cached"); | 386 | /* |
387 | if (!$this->minimiseMemoryUse) { | 387 | if ($this->minimiseMemoryUse) { |
388 | $this->requests[$url] = $this->getCached($url); | 388 | if ($this->cache($url)) { |
389 | } | 389 | unset($this->requests[$url]); |
390 | */ | 390 | } |
391 | } else { | 391 | } |
392 | $this->debug("......adding to pool"); | 392 | */ |
393 | $req_url = $this->rewriteUrls($url); | 393 | } |
394 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 394 | } |
395 | $req_url = $this->removeFragment($req_url); | 395 | } |
396 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { | 396 | } catch (HttpException $e) { |
397 | $_meth = 'HEAD'; | 397 | $this->debug($e); |
398 | } else { | 398 | return false; |
399 | $_meth = 'GET'; | 399 | } |
400 | unset($this->requests[$orig]['wrongGuess']); | 400 | } |
401 | } | 401 | |
402 | $headers = array(); | 402 | ////////////////////////////////////////////////////////// |
403 | //$headers[] = 'User-Agent: '.$this->userAgent; | 403 | // parallel (curl_multi_*) |
404 | $headers[] = $this->getUserAgent($req_url); | 404 | elseif ($this->method == self::METHOD_CURL_MULTI) { |
405 | // add referer for picky sites | 405 | $this->debug('Starting parallel fetch (curl_multi_*)'); |
406 | $headers[] = 'Referer: '.$this->referer; | 406 | while (count($urls) > 0) { |
407 | // send cookies, if we have any | 407 | $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); |
408 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 408 | $subset = array_splice($urls, 0, $this->maxParallelRequests); |
409 | $this->debug("......sending cookies: $cookies"); | 409 | $pool = new RollingCurl(array($this, 'handleCurlResponse')); |
410 | $headers[] = 'Cookie: '.$cookies; | 410 | $pool->window_size = count($subset); |
411 | } | 411 | |
412 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( | 412 | foreach ($subset as $orig => $url) { |
413 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], | 413 | if (!$isRedirect) $orig = $url; |
414 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] | 414 | unset($this->redirectQueue[$orig]); |
415 | )); | 415 | $this->debug("...$url"); |
416 | $httpRequest->set_original_url($orig); | 416 | if (!$isRedirect && isset($this->requests[$url])) { |
417 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); | 417 | $this->debug("......in memory"); |
418 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? | 418 | /* |
419 | $pool->add($httpRequest); | 419 | } elseif ($this->isCached($url)) { |
420 | } | 420 | $this->debug("......is cached"); |
421 | } | 421 | if (!$this->minimiseMemoryUse) { |
422 | // did we get anything into the pool? | 422 | $this->requests[$url] = $this->getCached($url); |
423 | if (count($pool) > 0) { | 423 | } |
424 | $this->debug('Sending request...'); | 424 | */ |
425 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] | 425 | } else { |
426 | $this->debug('Received responses'); | 426 | $this->debug("......adding to pool"); |
427 | foreach($subset as $orig => $url) { | 427 | $req_url = $this->rewriteUrls($url); |
428 | if (!$isRedirect) $orig = $url; | 428 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
429 | // $this->requests[$orig]['headers'] | 429 | $req_url = $this->removeFragment($req_url); |
430 | // $this->requests[$orig]['body'] | 430 | if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { |
431 | // $this->requests[$orig]['effective_url'] | 431 | $_meth = 'HEAD'; |
432 | // check content type | 432 | } else { |
433 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 433 | $_meth = 'GET'; |
434 | $this->requests[$orig]['body'] = ''; | 434 | unset($this->requests[$orig]['wrongGuess']); |
435 | $_header_only_type = true; | 435 | } |
436 | $this->debug('Header only type returned'); | 436 | $headers = array(); |
437 | } else { | 437 | //$headers[] = 'User-Agent: '.$this->userAgent; |
438 | $_header_only_type = false; | 438 | $headers[] = $this->getUserAgent($req_url); |
439 | } | 439 | // add referer for picky sites |
440 | $status_code = $this->requests[$orig]['status_code']; | 440 | $headers[] = 'Referer: '.$this->referer; |
441 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 441 | // send cookies, if we have any |
442 | $redirectURL = $this->requests[$orig]['location']; | 442 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
443 | if (!preg_match('!^https?://!i', $redirectURL)) { | 443 | $this->debug("......sending cookies: $cookies"); |
444 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 444 | $headers[] = 'Cookie: '.$cookies; |
445 | } | 445 | } |
446 | if ($this->validateURL($redirectURL)) { | 446 | $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array( |
447 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 447 | CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], |
448 | // store any cookies | 448 | CURLOPT_TIMEOUT => $this->requestOptions['timeout'] |
449 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 449 | )); |
450 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 450 | $httpRequest->set_original_url($orig); |
451 | $this->redirectQueue[$orig] = $redirectURL; | 451 | $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); |
452 | } else { | 452 | $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? |
453 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 453 | $pool->add($httpRequest); |
454 | } | 454 | } |
455 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { | 455 | } |
456 | // the response content-type did not match our 'header only' types, | 456 | // did we get anything into the pool? |
457 | // but we'd issues a HEAD request because we assumed it would. So | 457 | if (count($pool) > 0) { |
458 | // let's queue a proper GET request for this item... | 458 | $this->debug('Sending request...'); |
459 | $this->debug('Wrong guess at content-type, queing GET request'); | 459 | $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] |
460 | $this->requests[$orig]['wrongGuess'] = true; | 460 | $this->debug('Received responses'); |
461 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; | 461 | foreach($subset as $orig => $url) { |
462 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 462 | if (!$isRedirect) $orig = $url; |
463 | // check for <meta name='fragment' content='!'/> | 463 | // $this->requests[$orig]['headers'] |
464 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 464 | // $this->requests[$orig]['body'] |
465 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 465 | // $this->requests[$orig]['effective_url'] |
466 | if (isset($this->requests[$orig]['body'])) { | 466 | // check content type |
467 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 467 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
468 | if ($redirectURL) { | 468 | $this->requests[$orig]['body'] = ''; |
469 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 469 | $_header_only_type = true; |
470 | $this->redirectQueue[$orig] = $redirectURL; | 470 | $this->debug('Header only type returned'); |
471 | } | 471 | } else { |
472 | } | 472 | $_header_only_type = false; |
473 | } | 473 | } |
474 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); | 474 | $status_code = $this->requests[$orig]['status_code']; |
475 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); | 475 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
476 | } | 476 | $redirectURL = $this->requests[$orig]['location']; |
477 | } | 477 | if (!preg_match('!^https?://!i', $redirectURL)) { |
478 | } | 478 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
479 | } | 479 | } |
480 | 480 | if ($this->validateURL($redirectURL)) { | |
481 | ////////////////////////////////////////////////////// | 481 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); |
482 | // sequential (file_get_contents) | 482 | // store any cookies |
483 | else { | 483 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
484 | $this->debug('Starting sequential fetch (file_get_contents)'); | 484 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
485 | $this->debug('Processing set of '.count($urls)); | 485 | $this->redirectQueue[$orig] = $redirectURL; |
486 | foreach ($urls as $orig => $url) { | 486 | } else { |
487 | if (!$isRedirect) $orig = $url; | 487 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
488 | unset($this->redirectQueue[$orig]); | 488 | } |
489 | $this->debug("...$url"); | 489 | } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') { |
490 | if (!$isRedirect && isset($this->requests[$url])) { | 490 | // the response content-type did not match our 'header only' types, |
491 | $this->debug("......in memory"); | 491 | // but we'd issues a HEAD request because we assumed it would. So |
492 | /* | 492 | // let's queue a proper GET request for this item... |
493 | } elseif ($this->isCached($url)) { | 493 | $this->debug('Wrong guess at content-type, queing GET request'); |
494 | $this->debug("......is cached"); | 494 | $this->requests[$orig]['wrongGuess'] = true; |
495 | if (!$this->minimiseMemoryUse) { | 495 | $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url']; |
496 | $this->requests[$url] = $this->getCached($url); | 496 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
497 | } | 497 | // check for <meta name='fragment' content='!'/> |
498 | */ | 498 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
499 | } else { | 499 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
500 | $this->debug("Sending request for $url"); | 500 | if (isset($this->requests[$orig]['body'])) { |
501 | $this->requests[$orig]['original_url'] = $orig; | 501 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
502 | $req_url = $this->rewriteUrls($url); | 502 | if ($redirectURL) { |
503 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; | 503 | $this->redirectQueue[$orig] = $redirectURL; |
504 | $req_url = $this->removeFragment($req_url); | 504 | } |
505 | // send cookies, if we have any | 505 | } |
506 | $httpContext = $this->httpContext; | 506 | } |
507 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; | 507 | // die($url.' -multi- '.$request->getResponseInfo('effective_url')); |
508 | // add referer for picky sites | 508 | unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']); |
509 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; | 509 | } |
510 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { | 510 | } |
511 | $this->debug("......sending cookies: $cookies"); | 511 | } |
512 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; | 512 | } |
513 | } | 513 | |
514 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { | 514 | ////////////////////////////////////////////////////// |
515 | $this->debug('Received response'); | 515 | // sequential (file_get_contents) |
516 | // get status code | 516 | else { |
517 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { | 517 | $this->debug('Starting sequential fetch (file_get_contents)'); |
518 | $this->debug('Error: no status code found'); | 518 | $this->debug('Processing set of '.count($urls)); |
519 | // TODO: handle error - no status code | 519 | foreach ($urls as $orig => $url) { |
520 | } else { | 520 | if (!$isRedirect) $orig = $url; |
521 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); | 521 | unset($this->redirectQueue[$orig]); |
522 | // check content type | 522 | $this->debug("...$url"); |
523 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { | 523 | if (!$isRedirect && isset($this->requests[$url])) { |
524 | $this->requests[$orig]['body'] = ''; | 524 | $this->debug("......in memory"); |
525 | } else { | 525 | /* |
526 | $this->requests[$orig]['body'] = $html; | 526 | } elseif ($this->isCached($url)) { |
527 | } | 527 | $this->debug("......is cached"); |
528 | $this->requests[$orig]['effective_url'] = $req_url; | 528 | if (!$this->minimiseMemoryUse) { |
529 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; | 529 | $this->requests[$url] = $this->getCached($url); |
530 | unset($match); | 530 | } |
531 | // handle redirect | 531 | */ |
532 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 532 | } else { |
533 | $this->requests[$orig]['location'] = trim($match[1]); | 533 | $this->debug("Sending request for $url"); |
534 | } | 534 | $this->requests[$orig]['original_url'] = $orig; |
535 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { | 535 | $req_url = $this->rewriteUrls($url); |
536 | $redirectURL = $this->requests[$orig]['location']; | 536 | $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; |
537 | if (!preg_match('!^https?://!i', $redirectURL)) { | 537 | $req_url = $this->removeFragment($req_url); |
538 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); | 538 | // send cookies, if we have any |
539 | } | 539 | $httpContext = $this->httpContext; |
540 | if ($this->validateURL($redirectURL)) { | 540 | $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; |
541 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | 541 | // add referer for picky sites |
542 | // store any cookies | 542 | $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; |
543 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); | 543 | if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { |
544 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); | 544 | $this->debug("......sending cookies: $cookies"); |
545 | $this->redirectQueue[$orig] = $redirectURL; | 545 | $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; |
546 | } else { | 546 | } |
547 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); | 547 | if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { |
548 | } | 548 | $this->debug('Received response'); |
549 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { | 549 | // get status code |
550 | // check for <meta name='fragment' content='!'/> | 550 | if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { |
551 | // for AJAX sites, e.g. Blogger with its dynamic views templates. | 551 | $this->debug('Error: no status code found'); |
552 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification | 552 | // TODO: handle error - no status code |
553 | if (isset($this->requests[$orig]['body'])) { | 553 | } else { |
554 | $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); | 554 | $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); |
555 | if ($redirectURL) { | 555 | // check content type |
556 | $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); | 556 | if ($this->headerOnlyType($this->requests[$orig]['headers'])) { |
557 | $this->redirectQueue[$orig] = $redirectURL; | 557 | $this->requests[$orig]['body'] = ''; |
558 | } | 558 | } else { |
559 | } | 559 | $this->requests[$orig]['body'] = $html; |
560 | } | 560 | } |
561 | } | 561 | $this->requests[$orig]['effective_url'] = $req_url; |
562 | } else { | 562 | $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; |
563 | $this->debug('Error retrieving URL'); | 563 | unset($match); |
564 | //print_r($req_url); | 564 | // handle redirect |
565 | //print_r($http_response_header); | 565 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
566 | //print_r($html); | 566 | $this->requests[$orig]['location'] = trim($match[1]); |
567 | 567 | } | |
568 | // TODO: handle error - failed to retrieve URL | 568 | if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { |
569 | } | 569 | $redirectURL = $this->requests[$orig]['location']; |
570 | } | 570 | if (!preg_match('!^https?://!i', $redirectURL)) { |
571 | } | 571 | $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); |
572 | } | 572 | } |
573 | } | 573 | if ($this->validateURL($redirectURL)) { |
574 | 574 | $this->debug('Redirect detected. Valid URL: '.$redirectURL); | |
575 | public function handleCurlResponse($response, $info, $request) { | 575 | // store any cookies |
576 | $orig = $request->url_original; | 576 | $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); |
577 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); | 577 | if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); |
578 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); | 578 | $this->redirectQueue[$orig] = $redirectURL; |
579 | $this->requests[$orig]['method'] = $request->method; | 579 | } else { |
580 | $this->requests[$orig]['effective_url'] = $info['url']; | 580 | $this->debug('Redirect detected. Invalid URL: '.$redirectURL); |
581 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; | 581 | } |
582 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { | 582 | } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) { |
583 | $this->requests[$orig]['location'] = trim($match[1]); | 583 | // check for <meta name='fragment' content='!'/> |
584 | } | 584 | // for AJAX sites, e.g. Blogger with its dynamic views templates. |
585 | } | 585 | // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification |
586 | 586 | if (isset($this->requests[$orig]['body'])) { | |
587 | protected function headersToString(array $headers, $associative=true) { | 587 | $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); |
588 | if (!$associative) { | 588 | if ($redirectURL) { |
589 | return implode("\n", $headers); | 589 | $this->redirectQueue[$orig] = $redirectURL; |
590 | } else { | 590 | } |
591 | $str = ''; | 591 | } |
592 | foreach ($headers as $key => $val) { | 592 | } |
593 | if (is_array($val)) { | 593 | } |
594 | foreach ($val as $v) $str .= "$key: $v\n"; | 594 | } else { |
595 | } else { | 595 | $this->debug('Error retrieving URL'); |
596 | $str .= "$key: $val\n"; | 596 | //print_r($req_url); |
597 | } | 597 | //print_r($http_response_header); |
598 | } | 598 | //print_r($html); |
599 | return rtrim($str); | 599 | |
600 | } | 600 | // TODO: handle error - failed to retrieve URL |
601 | } | 601 | } |
602 | 602 | } | |
603 | public function get($url, $remove=false, $gzdecode=true) { | 603 | } |
604 | $url = "$url"; | 604 | } |
605 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 605 | } |
606 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | 606 | |
607 | $response = $this->requests[$url]; | 607 | public function handleCurlResponse($response, $info, $request) { |
608 | /* | 608 | $orig = $request->url_original; |
609 | } elseif ($this->isCached($url)) { | 609 | $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); |
610 | $this->debug("URL already fetched - in disk cache ($url)"); | 610 | $this->requests[$orig]['body'] = substr($response, $info['header_size']); |
611 | $response = $this->getCached($url); | 611 | $this->requests[$orig]['method'] = $request->method; |
612 | $this->requests[$url] = $response; | 612 | $this->requests[$orig]['effective_url'] = $info['url']; |
613 | */ | 613 | $this->requests[$orig]['status_code'] = (int)$info['http_code']; |
614 | } else { | 614 | if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) { |
615 | $this->debug("Fetching URL ($url)"); | 615 | $this->requests[$orig]['location'] = trim($match[1]); |
616 | $this->fetchAll(array($url)); | 616 | } |
617 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { | 617 | } |
618 | $response = $this->requests[$url]; | 618 | |
619 | } else { | 619 | protected function headersToString(array $headers, $associative=true) { |
620 | $this->debug("Request failed"); | 620 | if (!$associative) { |
621 | $response = false; | 621 | return implode("\n", $headers); |
622 | } | 622 | } else { |
623 | } | 623 | $str = ''; |
624 | /* | 624 | foreach ($headers as $key => $val) { |
625 | if ($this->minimiseMemoryUse && $response) { | 625 | if (is_array($val)) { |
626 | $this->cache($url); | 626 | foreach ($val as $v) $str .= "$key: $v\n"; |
627 | unset($this->requests[$url]); | 627 | } else { |
628 | } | 628 | $str .= "$key: $val\n"; |
629 | */ | 629 | } |
630 | if ($remove && $response) unset($this->requests[$url]); | 630 | } |
631 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { | 631 | return rtrim($str); |
632 | if ($html = gzdecode($response['body'])) { | 632 | } |
633 | $response['body'] = $html; | 633 | } |
634 | } | 634 | |
635 | } | 635 | public function get($url, $remove=false, $gzdecode=true) { |
636 | return $response; | 636 | $url = "$url"; |
637 | } | 637 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
638 | 638 | $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); | |
639 | public function parallelSupport() { | 639 | $response = $this->requests[$url]; |
640 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); | 640 | /* |
641 | } | 641 | } elseif ($this->isCached($url)) { |
642 | 642 | $this->debug("URL already fetched - in disk cache ($url)"); | |
643 | private function headerOnlyType($headers) { | 643 | $response = $this->getCached($url); |
644 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { | 644 | $this->requests[$url] = $response; |
645 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) | 645 | */ |
646 | $match[1] = strtolower(trim($match[1])); | 646 | } else { |
647 | $match[2] = strtolower(trim($match[2])); | 647 | $this->debug("Fetching URL ($url)"); |
648 | foreach (array($match[1], $match[2]) as $mime) { | 648 | $this->fetchAll(array($url)); |
649 | if (in_array($mime, $this->headerOnlyTypes)) return true; | 649 | if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { |
650 | } | 650 | $response = $this->requests[$url]; |
651 | } | 651 | } else { |
652 | return false; | 652 | $this->debug("Request failed"); |
653 | } | 653 | $response = false; |
654 | 654 | } | |
655 | private function possibleUnsupportedType($url) { | 655 | } |
656 | $path = @parse_url($url, PHP_URL_PATH); | 656 | /* |
657 | if ($path && strpos($path, '.') !== false) { | 657 | if ($this->minimiseMemoryUse && $response) { |
658 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); | 658 | $this->cache($url); |
659 | return in_array($ext, $this->headerOnlyClues); | 659 | unset($this->requests[$url]); |
660 | } | 660 | } |
661 | return false; | 661 | */ |
662 | } | 662 | if ($remove && $response) unset($this->requests[$url]); |
663 | } | 663 | if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) { |
664 | 664 | if ($html = gzdecode($response['body'])) { | |
665 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 | 665 | $response['body'] = $html; |
666 | if (!function_exists('gzdecode')) { | 666 | } |
667 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) | 667 | } |
668 | { | 668 | return $response; |
669 | $len = strlen($data); | 669 | } |
670 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { | 670 | |
671 | $error = "Not in GZIP format."; | 671 | public function parallelSupport() { |
672 | return null; // Not GZIP format (See RFC 1952) | 672 | return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); |
673 | } | 673 | } |
674 | $method = ord(substr($data,2,1)); // Compression method | 674 | |
675 | $flags = ord(substr($data,3,1)); // Flags | 675 | private function headerOnlyType($headers) { |
676 | if ($flags & 31 != $flags) { | 676 | if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) { |
677 | $error = "Reserved bits not allowed."; | 677 | // look for full mime type (e.g. image/jpeg) or just type (e.g. image) |
678 | return null; | 678 | $match[1] = strtolower(trim($match[1])); |
679 | } | 679 | $match[2] = strtolower(trim($match[2])); |
680 | // NOTE: $mtime may be negative (PHP integer limitations) | 680 | foreach (array($match[1], $match[2]) as $mime) { |
681 | $mtime = unpack("V", substr($data,4,4)); | 681 | if (in_array($mime, $this->headerOnlyTypes)) return true; |
682 | $mtime = $mtime[1]; | 682 | } |
683 | $xfl = substr($data,8,1); | 683 | } |
684 | $os = substr($data,8,1); | 684 | return false; |
685 | $headerlen = 10; | 685 | } |
686 | $extralen = 0; | 686 | |
687 | $extra = ""; | 687 | private function possibleUnsupportedType($url) { |
688 | if ($flags & 4) { | 688 | $path = @parse_url($url, PHP_URL_PATH); |
689 | // 2-byte length prefixed EXTRA data in header | 689 | if ($path && strpos($path, '.') !== false) { |
690 | if ($len - $headerlen - 2 < 8) { | 690 | $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION))); |
691 | return false; // invalid | 691 | return in_array($ext, $this->headerOnlyClues); |
692 | } | 692 | } |
693 | $extralen = unpack("v",substr($data,8,2)); | 693 | return false; |
694 | $extralen = $extralen[1]; | 694 | } |
695 | if ($len - $headerlen - 2 - $extralen < 8) { | 695 | } |
696 | return false; // invalid | 696 | |
697 | } | 697 | // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 |
698 | $extra = substr($data,10,$extralen); | 698 | if (!function_exists('gzdecode')) { |
699 | $headerlen += 2 + $extralen; | 699 | function gzdecode($data,&$filename='',&$error='',$maxlength=null) |
700 | } | 700 | { |
701 | $filenamelen = 0; | 701 | $len = strlen($data); |
702 | $filename = ""; | 702 | if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) { |
703 | if ($flags & 8) { | 703 | $error = "Not in GZIP format."; |
704 | // C-style string | 704 | return null; // Not GZIP format (See RFC 1952) |
705 | if ($len - $headerlen - 1 < 8) { | 705 | } |
706 | return false; // invalid | 706 | $method = ord(substr($data,2,1)); // Compression method |
707 | } | 707 | $flags = ord(substr($data,3,1)); // Flags |
708 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); | 708 | if ($flags & 31 != $flags) { |
709 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { | 709 | $error = "Reserved bits not allowed."; |
710 | return false; // invalid | 710 | return null; |
711 | } | 711 | } |
712 | $filename = substr($data,$headerlen,$filenamelen); | 712 | // NOTE: $mtime may be negative (PHP integer limitations) |
713 | $headerlen += $filenamelen + 1; | 713 | $mtime = unpack("V", substr($data,4,4)); |
714 | } | 714 | $mtime = $mtime[1]; |
715 | $commentlen = 0; | 715 | $xfl = substr($data,8,1); |
716 | $comment = ""; | 716 | $os = substr($data,8,1); |
717 | if ($flags & 16) { | 717 | $headerlen = 10; |
718 | // C-style string COMMENT data in header | 718 | $extralen = 0; |
719 | if ($len - $headerlen - 1 < 8) { | 719 | $extra = ""; |
720 | return false; // invalid | 720 | if ($flags & 4) { |
721 | } | 721 | // 2-byte length prefixed EXTRA data in header |
722 | $commentlen = strpos(substr($data,$headerlen),chr(0)); | 722 | if ($len - $headerlen - 2 < 8) { |
723 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { | 723 | return false; // invalid |
724 | return false; // Invalid header format | 724 | } |
725 | } | 725 | $extralen = unpack("v",substr($data,8,2)); |
726 | $comment = substr($data,$headerlen,$commentlen); | 726 | $extralen = $extralen[1]; |
727 | $headerlen += $commentlen + 1; | 727 | if ($len - $headerlen - 2 - $extralen < 8) { |
728 | } | 728 | return false; // invalid |
729 | $headercrc = ""; | 729 | } |
730 | if ($flags & 2) { | 730 | $extra = substr($data,10,$extralen); |
731 | // 2-bytes (lowest order) of CRC32 on header present | 731 | $headerlen += 2 + $extralen; |
732 | if ($len - $headerlen - 2 < 8) { | 732 | } |
733 | return false; // invalid | 733 | $filenamelen = 0; |
734 | } | 734 | $filename = ""; |
735 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; | 735 | if ($flags & 8) { |
736 | $headercrc = unpack("v", substr($data,$headerlen,2)); | 736 | // C-style string |
737 | $headercrc = $headercrc[1]; | 737 | if ($len - $headerlen - 1 < 8) { |
738 | if ($headercrc != $calccrc) { | 738 | return false; // invalid |
739 | $error = "Header checksum failed."; | 739 | } |
740 | return false; // Bad header CRC | 740 | $filenamelen = strpos(substr($data,$headerlen),chr(0)); |
741 | } | 741 | if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) { |
742 | $headerlen += 2; | 742 | return false; // invalid |
743 | } | 743 | } |
744 | // GZIP FOOTER | 744 | $filename = substr($data,$headerlen,$filenamelen); |
745 | $datacrc = unpack("V",substr($data,-8,4)); | 745 | $headerlen += $filenamelen + 1; |
746 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); | 746 | } |
747 | $isize = unpack("V",substr($data,-4)); | 747 | $commentlen = 0; |
748 | $isize = $isize[1]; | 748 | $comment = ""; |
749 | // decompression: | 749 | if ($flags & 16) { |
750 | $bodylen = $len-$headerlen-8; | 750 | // C-style string COMMENT data in header |
751 | if ($bodylen < 1) { | 751 | if ($len - $headerlen - 1 < 8) { |
752 | // IMPLEMENTATION BUG! | 752 | return false; // invalid |
753 | return null; | 753 | } |
754 | } | 754 | $commentlen = strpos(substr($data,$headerlen),chr(0)); |
755 | $body = substr($data,$headerlen,$bodylen); | 755 | if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) { |
756 | $data = ""; | 756 | return false; // Invalid header format |
757 | if ($bodylen > 0) { | 757 | } |
758 | switch ($method) { | 758 | $comment = substr($data,$headerlen,$commentlen); |
759 | case 8: | 759 | $headerlen += $commentlen + 1; |
760 | // Currently the only supported compression method: | 760 | } |
761 | $data = gzinflate($body,$maxlength); | 761 | $headercrc = ""; |
762 | break; | 762 | if ($flags & 2) { |
763 | default: | 763 | // 2-bytes (lowest order) of CRC32 on header present |
764 | $error = "Unknown compression method."; | 764 | if ($len - $headerlen - 2 < 8) { |
765 | return false; | 765 | return false; // invalid |
766 | } | 766 | } |
767 | } // zero-byte body content is allowed | 767 | $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff; |
768 | // Verifiy CRC32 | 768 | $headercrc = unpack("v", substr($data,$headerlen,2)); |
769 | $crc = sprintf("%u",crc32($data)); | 769 | $headercrc = $headercrc[1]; |
770 | $crcOK = $crc == $datacrc; | 770 | if ($headercrc != $calccrc) { |
771 | $lenOK = $isize == strlen($data); | 771 | $error = "Header checksum failed."; |
772 | if (!$lenOK || !$crcOK) { | 772 | return false; // Bad header CRC |
773 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | 773 | } |
774 | return false; | 774 | $headerlen += 2; |
775 | } | 775 | } |
776 | return $data; | 776 | // GZIP FOOTER |
777 | } | 777 | $datacrc = unpack("V",substr($data,-8,4)); |
778 | } | 778 | $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF); |
779 | ?> \ No newline at end of file | 779 | $isize = unpack("V",substr($data,-4)); |
780 | $isize = $isize[1]; | ||
781 | // decompression: | ||
782 | $bodylen = $len-$headerlen-8; | ||
783 | if ($bodylen < 1) { | ||
784 | // IMPLEMENTATION BUG! | ||
785 | return null; | ||
786 | } | ||
787 | $body = substr($data,$headerlen,$bodylen); | ||
788 | $data = ""; | ||
789 | if ($bodylen > 0) { | ||
790 | switch ($method) { | ||
791 | case 8: | ||
792 | // Currently the only supported compression method: | ||
793 | $data = gzinflate($body,$maxlength); | ||
794 | break; | ||
795 | default: | ||
796 | $error = "Unknown compression method."; | ||
797 | return false; | ||
798 | } | ||
799 | } // zero-byte body content is allowed | ||
800 | // Verifiy CRC32 | ||
801 | $crc = sprintf("%u",crc32($data)); | ||
802 | $crcOK = $crc == $datacrc; | ||
803 | $lenOK = $isize == strlen($data); | ||
804 | if (!$lenOK || !$crcOK) { | ||
805 | $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.'); | ||
806 | return false; | ||
807 | } | ||
808 | return $data; | ||
809 | } | ||
810 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php index ecd46d5f..c524a1ee 100644 --- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php +++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php | |||
@@ -1,79 +1,78 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Humble HTTP Agent extension for SimplePie_File | 3 | * Humble HTTP Agent extension for SimplePie_File |
4 | * | 4 | * |
5 | * This class is designed to extend and override SimplePie_File | 5 | * This class is designed to extend and override SimplePie_File |
6 | * in order to prevent duplicate HTTP requests being sent out. | 6 | * in order to prevent duplicate HTTP requests being sent out. |
7 | * The idea is to initialise an instance of Humble HTTP Agent | 7 | * The idea is to initialise an instance of Humble HTTP Agent |
8 | * and attach it, to a static class variable, of this class. | 8 | * and attach it, to a static class variable, of this class. |
9 | * SimplePie will then automatically initialise this class | 9 | * SimplePie will then automatically initialise this class |
10 | * | 10 | * |
11 | * @date 2011-02-28 | 11 | * @date 2011-02-28 |
12 | */ | 12 | */ |
13 | 13 | ||
14 | class SimplePie_HumbleHttpAgent extends SimplePie_File | 14 | class SimplePie_HumbleHttpAgent extends SimplePie_File |
15 | { | 15 | { |
16 | protected static $agent; | 16 | protected static $agent; |
17 | var $url; | 17 | var $url; |
18 | var $useragent; | 18 | var $useragent; |
19 | var $success = true; | 19 | var $success = true; |
20 | var $headers = array(); | 20 | var $headers = array(); |
21 | var $body; | 21 | var $body; |
22 | var $status_code; | 22 | var $status_code; |
23 | var $redirects = 0; | 23 | var $redirects = 0; |
24 | var $error; | 24 | var $error; |
25 | var $method = SIMPLEPIE_FILE_SOURCE_NONE; | 25 | var $method = SIMPLEPIE_FILE_SOURCE_NONE; |
26 | 26 | ||
27 | public static function set_agent(HumbleHttpAgent $agent) { | 27 | public static function set_agent(HumbleHttpAgent $agent) { |
28 | self::$agent = $agent; | 28 | self::$agent = $agent; |
29 | } | 29 | } |
30 | 30 | ||
31 | public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { | 31 | public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) { |
32 | if (class_exists('idna_convert')) | 32 | if (class_exists('idna_convert')) |
33 | { | 33 | { |
34 | $idn = new idna_convert(); | 34 | $idn = new idna_convert(); |
35 | $parsed = SimplePie_Misc::parse_url($url); | 35 | $parsed = SimplePie_Misc::parse_url($url); |
36 | $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); | 36 | $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); |
37 | } | 37 | } |
38 | $this->url = $url; | 38 | $this->url = $url; |
39 | $this->useragent = $useragent; | 39 | $this->useragent = $useragent; |
40 | if (preg_match('/^http(s)?:\/\//i', $url)) | 40 | if (preg_match('/^http(s)?:\/\//i', $url)) |
41 | { | 41 | { |
42 | if (!is_array($headers)) | 42 | if (!is_array($headers)) |
43 | { | 43 | { |
44 | $headers = array(); | 44 | $headers = array(); |
45 | } | 45 | } |
46 | $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; | 46 | $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; |
47 | $headers2 = array(); | 47 | $headers2 = array(); |
48 | foreach ($headers as $key => $value) { | 48 | foreach ($headers as $key => $value) { |
49 | $headers2[] = "$key: $value"; | 49 | $headers2[] = "$key: $value"; |
50 | } | 50 | } |
51 | //TODO: allow for HTTP headers | 51 | //TODO: allow for HTTP headers |
52 | // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); | 52 | // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); |
53 | 53 | ||
54 | $response = self::$agent->get($url); | 54 | $response = self::$agent->get($url); |
55 | 55 | ||
56 | if ($response === false || !isset($response['status_code'])) { | 56 | if ($response === false || !isset($response['status_code'])) { |
57 | $this->error = 'failed to fetch URL'; | 57 | $this->error = 'failed to fetch URL'; |
58 | $this->success = false; | 58 | $this->success = false; |
59 | } else { | 59 | } else { |
60 | // The extra lines at the end are there to satisfy SimplePie's HTTP parser. | 60 | // The extra lines at the end are there to satisfy SimplePie's HTTP parser. |
61 | // The class expects a full HTTP message, whereas we're giving it only | 61 | // The class expects a full HTTP message, whereas we're giving it only |
62 | // headers - the new lines indicate the start of the body. | 62 | // headers - the new lines indicate the start of the body. |
63 | $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); | 63 | $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n"); |
64 | if ($parser->parse()) { | 64 | if ($parser->parse()) { |
65 | $this->headers = $parser->headers; | 65 | $this->headers = $parser->headers; |
66 | //$this->body = $parser->body; | 66 | //$this->body = $parser->body; |
67 | $this->body = $response['body']; | 67 | $this->body = $response['body']; |
68 | $this->status_code = $parser->status_code; | 68 | $this->status_code = $parser->status_code; |
69 | } | 69 | } |
70 | } | 70 | } |
71 | } | 71 | } |
72 | else | 72 | else |
73 | { | 73 | { |
74 | $this->error = 'invalid URL'; | 74 | $this->error = 'invalid URL'; |
75 | $this->success = false; | 75 | $this->success = false; |
76 | } | 76 | } |
77 | } | 77 | } |
78 | } | 78 | } \ No newline at end of file |
79 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php index 09b11546..382d869c 100644 --- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php | |||
@@ -6,23 +6,24 @@ | |||
6 | * Attempts to detect the language of a sample of text by correlating ranked | 6 | * Attempts to detect the language of a sample of text by correlating ranked |
7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. | 7 | * 3-gram frequencies to a table of 3-gram frequencies of known languages. |
8 | * | 8 | * |
9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle | 9 | * Implements a version of a technique originally proposed by Cavnar & Trenkle |
10 | * (1994): "N-Gram-Based Text Categorization" | 10 | * (1994): "N-Gram-Based Text Categorization" |
11 | * | 11 | * |
12 | * PHP versions 4 and 5 | 12 | * PHP version 5 |
13 | * | 13 | * |
14 | * @category Text | 14 | * @category Text |
15 | * @package Text_LanguageDetect | 15 | * @package Text_LanguageDetect |
16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 16 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
17 | * @copyright 2005-2006 Nicholas Pisarro | 17 | * @copyright 2005-2006 Nicholas Pisarro |
18 | * @license http://www.debian.org/misc/bsd.license BSD | 18 | * @license http://www.debian.org/misc/bsd.license BSD |
19 | * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ | 19 | * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ |
20 | * @link http://pear.php.net/package/Text_LanguageDetect/ | 20 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
21 | * @link http://langdetect.blogspot.com/ | 21 | * @link http://langdetect.blogspot.com/ |
22 | */ | 22 | */ |
23 | 23 | ||
24 | //require_once 'PEAR.php'; | 24 | require_once 'LanguageDetect/Exception.php'; |
25 | require_once 'Parser.php'; | 25 | require_once 'LanguageDetect/Parser.php'; |
26 | require_once 'LanguageDetect/ISO639.php'; | ||
26 | 27 | ||
27 | /** | 28 | /** |
28 | * Language detection class | 29 | * Language detection class |
@@ -41,9 +42,10 @@ require_once 'Parser.php'; | |||
41 | * | 42 | * |
42 | * echo "Supported languages:\n"; | 43 | * echo "Supported languages:\n"; |
43 | * | 44 | * |
44 | * $langs = $l->getLanguages(); | 45 | * try { |
45 | * if (PEAR::isError($langs)) { | 46 | * $langs = $l->getLanguages(); |
46 | * die($langs->getMessage()); | 47 | * } catch (Text_LanguageDetect_Exception $e) { |
48 | * die($e->getMessage()); | ||
47 | * } | 49 | * } |
48 | * | 50 | * |
49 | * sort($langs); | 51 | * sort($langs); |
@@ -54,38 +56,38 @@ require_once 'Parser.php'; | |||
54 | * } | 56 | * } |
55 | * </code> | 57 | * </code> |
56 | * | 58 | * |
57 | * @category Text | 59 | * @category Text |
58 | * @package Text_LanguageDetect | 60 | * @package Text_LanguageDetect |
59 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> | 61 | * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com> |
60 | * @copyright 2005 Nicholas Pisarro | 62 | * @copyright 2005 Nicholas Pisarro |
61 | * @license http://www.debian.org/misc/bsd.license BSD | 63 | * @license http://www.debian.org/misc/bsd.license BSD |
62 | * @version Release: @package_version@ | 64 | * @version Release: @package_version@ |
63 | * @todo allow users to generate their own language models | 65 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
66 | * @todo allow users to generate their own language models | ||
64 | */ | 67 | */ |
65 | |||
66 | class Text_LanguageDetect | 68 | class Text_LanguageDetect |
67 | { | 69 | { |
68 | /** | 70 | /** |
69 | * The filename that stores the trigram data for the detector | 71 | * The filename that stores the trigram data for the detector |
70 | * | 72 | * |
71 | * If this value starts with a slash (/) or a dot (.) the value of | 73 | * If this value starts with a slash (/) or a dot (.) the value of |
72 | * $this->_data_dir will be ignored | 74 | * $this->_data_dir will be ignored |
73 | * | 75 | * |
74 | * @var string | 76 | * @var string |
75 | * @access private | 77 | * @access private |
76 | */ | 78 | */ |
77 | var $_db_filename = './lang.dat'; | 79 | var $_db_filename = 'lang.dat'; |
78 | 80 | ||
79 | /** | 81 | /** |
80 | * The filename that stores the unicode block definitions | 82 | * The filename that stores the unicode block definitions |
81 | * | 83 | * |
82 | * If this value starts with a slash (/) or a dot (.) the value of | 84 | * If this value starts with a slash (/) or a dot (.) the value of |
83 | * $this->_data_dir will be ignored | 85 | * $this->_data_dir will be ignored |
84 | * | 86 | * |
85 | * @var string | 87 | * @var string |
86 | * @access private | 88 | * @access private |
87 | */ | 89 | */ |
88 | var $_unicode_db_filename = './unicode_blocks.dat'; | 90 | var $_unicode_db_filename = 'unicode_blocks.dat'; |
89 | 91 | ||
90 | /** | 92 | /** |
91 | * The data directory | 93 | * The data directory |
@@ -99,11 +101,8 @@ class Text_LanguageDetect | |||
99 | 101 | ||
100 | /** | 102 | /** |
101 | * The trigram data for comparison | 103 | * The trigram data for comparison |
102 | * | ||
103 | * Will be loaded on start from $this->_db_filename | ||
104 | * | 104 | * |
105 | * May be set to a PEAR_Error object if there is an error during its | 105 | * Will be loaded on start from $this->_db_filename |
106 | * initialization | ||
107 | * | 106 | * |
108 | * @var array | 107 | * @var array |
109 | * @access private | 108 | * @access private |
@@ -120,7 +119,7 @@ class Text_LanguageDetect | |||
120 | 119 | ||
121 | /** | 120 | /** |
122 | * The size of the trigram data arrays | 121 | * The size of the trigram data arrays |
123 | * | 122 | * |
124 | * @var int | 123 | * @var int |
125 | * @access private | 124 | * @access private |
126 | */ | 125 | */ |
@@ -140,7 +139,7 @@ class Text_LanguageDetect | |||
140 | 139 | ||
141 | /** | 140 | /** |
142 | * Whether or not to simulate perl's Language::Guess exactly | 141 | * Whether or not to simulate perl's Language::Guess exactly |
143 | * | 142 | * |
144 | * @access private | 143 | * @access private |
145 | * @var bool | 144 | * @var bool |
146 | * @see setPerlCompatible() | 145 | * @see setPerlCompatible() |
@@ -165,18 +164,24 @@ class Text_LanguageDetect | |||
165 | var $_clusters; | 164 | var $_clusters; |
166 | 165 | ||
167 | /** | 166 | /** |
167 | * Which type of "language names" are accepted and returned: | ||
168 | * | ||
169 | * 0 - language name ("english") | ||
170 | * 2 - 2-letter ISO 639-1 code ("en") | ||
171 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
172 | */ | ||
173 | var $_name_mode = 0; | ||
174 | |||
175 | /** | ||
168 | * Constructor | 176 | * Constructor |
169 | * | 177 | * |
170 | * Will attempt to load the language database. If it fails, you will get | 178 | * Will attempt to load the language database. If it fails, you will get |
171 | * a PEAR_Error object returned when you try to use detect() | 179 | * an exception. |
172 | * | ||
173 | */ | 180 | */ |
174 | function Text_LanguageDetect($db=null, $unicode_db=null) | 181 | function __construct() |
175 | { | 182 | { |
176 | if (isset($db)) $this->_db_filename = $db; | ||
177 | if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; | ||
178 | |||
179 | $data = $this->_readdb($this->_db_filename); | 183 | $data = $this->_readdb($this->_db_filename); |
184 | $this->_checkTrigram($data['trigram']); | ||
180 | $this->_lang_db = $data['trigram']; | 185 | $this->_lang_db = $data['trigram']; |
181 | 186 | ||
182 | if (isset($data['trigram-unicodemap'])) { | 187 | if (isset($data['trigram-unicodemap'])) { |
@@ -186,29 +191,32 @@ class Text_LanguageDetect | |||
186 | // Not yet implemented: | 191 | // Not yet implemented: |
187 | if (isset($data['trigram-clusters'])) { | 192 | if (isset($data['trigram-clusters'])) { |
188 | $this->_clusters = $data['trigram-clusters']; | 193 | $this->_clusters = $data['trigram-clusters']; |
189 | } | 194 | } |
190 | } | 195 | } |
191 | 196 | ||
192 | /** | 197 | /** |
193 | * Returns the path to the location of the database | 198 | * Returns the path to the location of the database |
194 | * | 199 | * |
195 | * @access private | 200 | * @param string $fname File name to load |
196 | * @return string expected path to the language model database | 201 | * |
202 | * @return string expected path to the language model database | ||
203 | * @access private | ||
197 | */ | 204 | */ |
198 | function _get_data_loc($fname) | 205 | function _get_data_loc($fname) |
199 | { | 206 | { |
200 | return $fname; | 207 | return dirname(__FILE__).'/'.$fname; |
201 | } | 208 | } |
202 | 209 | ||
203 | /** | 210 | /** |
204 | * Loads the language trigram database from filename | 211 | * Loads the language trigram database from filename |
205 | * | 212 | * |
206 | * Trigram database should be a serialize()'d array | 213 | * Trigram database should be a serialize()'d array |
207 | * | 214 | * |
208 | * @access private | 215 | * @param string $fname the filename where the data is stored |
209 | * @param string $fname the filename where the data is stored | 216 | * |
210 | * @return array the language model data | 217 | * @return array the language model data |
211 | * @throws PEAR_Error | 218 | * @throws Text_LanguageDetect_Exception |
219 | * @access private | ||
212 | */ | 220 | */ |
213 | function _readdb($fname) | 221 | function _readdb($fname) |
214 | { | 222 | { |
@@ -217,79 +225,74 @@ class Text_LanguageDetect | |||
217 | 225 | ||
218 | // input check | 226 | // input check |
219 | if (!file_exists($fname)) { | 227 | if (!file_exists($fname)) { |
220 | throw new Exception('Language database does not exist.'); | 228 | throw new Text_LanguageDetect_Exception( |
229 | 'Language database does not exist: ' . $fname, | ||
230 | Text_LanguageDetect_Exception::DB_NOT_FOUND | ||
231 | ); | ||
221 | } elseif (!is_readable($fname)) { | 232 | } elseif (!is_readable($fname)) { |
222 | throw new Exception('Language database is not readable.'); | 233 | throw new Text_LanguageDetect_Exception( |
234 | 'Language database is not readable: ' . $fname, | ||
235 | Text_LanguageDetect_Exception::DB_NOT_READABLE | ||
236 | ); | ||
223 | } | 237 | } |
224 | 238 | ||
225 | if (function_exists('file_get_contents')) { | 239 | return unserialize(file_get_contents($fname)); |
226 | return unserialize(file_get_contents($fname)); | ||
227 | } else { | ||
228 | // if you don't have file_get_contents(), | ||
229 | // then this is the next fastest way | ||
230 | ob_start(); | ||
231 | readfile($fname); | ||
232 | $contents = ob_get_contents(); | ||
233 | ob_end_clean(); | ||
234 | return unserialize($contents); | ||
235 | } | ||
236 | } | 240 | } |
237 | 241 | ||
238 | 242 | ||
239 | /** | 243 | /** |
240 | * Checks if this object is ready to detect languages | 244 | * Checks if this object is ready to detect languages |
241 | * | 245 | * |
242 | * @access private | 246 | * @param array $trigram Trigram data from database |
243 | * @param mixed &$err error object to be returned by reference, if any | 247 | * |
244 | * @return bool true if no errors | 248 | * @return void |
249 | * @access private | ||
245 | */ | 250 | */ |
246 | function _setup_ok(&$err) | 251 | function _checkTrigram($trigram) |
247 | { | 252 | { |
248 | if (!is_array($this->_lang_db)) { | 253 | if (!is_array($trigram)) { |
249 | if (ini_get('magic_quotes_runtime')) { | 254 | if (ini_get('magic_quotes_runtime')) { |
250 | throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); | 255 | throw new Text_LanguageDetect_Exception( |
251 | } else { | 256 | 'Error loading database. Try turning magic_quotes_runtime off.', |
252 | throw new Exception('Language database is not an array.'); | 257 | Text_LanguageDetect_Exception::MAGIC_QUOTES |
258 | ); | ||
253 | } | 259 | } |
254 | return false; | 260 | throw new Text_LanguageDetect_Exception( |
255 | 261 | 'Language database is not an array.', | |
256 | } elseif (empty($this->_lang_db)) { | 262 | Text_LanguageDetect_Exception::DB_NOT_ARRAY |
257 | throw new Exception('Language database has no elements.'); | 263 | ); |
258 | return false; | 264 | } elseif (empty($trigram)) { |
259 | 265 | throw new Text_LanguageDetect_Exception( | |
260 | } else { | 266 | 'Language database has no elements.', |
261 | return true; | 267 | Text_LanguageDetect_Exception::DB_EMPTY |
268 | ); | ||
262 | } | 269 | } |
263 | } | 270 | } |
264 | 271 | ||
265 | /** | 272 | /** |
266 | * Omits languages | 273 | * Omits languages |
267 | * | 274 | * |
268 | * Pass this function the name of or an array of names of | 275 | * Pass this function the name of or an array of names of |
269 | * languages that you don't want considered | 276 | * languages that you don't want considered |
270 | * | 277 | * |
271 | * If you're only expecting a limited set of languages, this can greatly | 278 | * If you're only expecting a limited set of languages, this can greatly |
272 | * speed up processing | 279 | * speed up processing |
273 | * | 280 | * |
274 | * @access public | 281 | * @param mixed $omit_list language name or array of names to omit |
275 | * @param mixed $omit_list language name or array of names to omit | 282 | * @param bool $include_only if true will include (rather than |
276 | * @param bool $include_only if true will include (rather than | 283 | * exclude) only those in the list |
277 | * exclude) only those in the list | 284 | * |
278 | * @return int number of languages successfully deleted | 285 | * @return int number of languages successfully deleted |
279 | * @throws PEAR_Error | 286 | * @throws Text_LanguageDetect_Exception |
280 | */ | 287 | */ |
281 | function omitLanguages($omit_list, $include_only = false) | 288 | public function omitLanguages($omit_list, $include_only = false) |
282 | { | 289 | { |
283 | |||
284 | // setup check | ||
285 | if (!$this->_setup_ok($err)) { | ||
286 | return $err; | ||
287 | } | ||
288 | |||
289 | $deleted = 0; | 290 | $deleted = 0; |
290 | 291 | ||
291 | // deleting the given languages | 292 | $omit_list = $this->_convertFromNameMode($omit_list); |
293 | |||
292 | if (!$include_only) { | 294 | if (!$include_only) { |
295 | // deleting the given languages | ||
293 | if (!is_array($omit_list)) { | 296 | if (!is_array($omit_list)) { |
294 | $omit_list = strtolower($omit_list); // case desensitize | 297 | $omit_list = strtolower($omit_list); // case desensitize |
295 | if (isset($this->_lang_db[$omit_list])) { | 298 | if (isset($this->_lang_db[$omit_list])) { |
@@ -301,12 +304,12 @@ class Text_LanguageDetect | |||
301 | if (isset($this->_lang_db[$omit_lang])) { | 304 | if (isset($this->_lang_db[$omit_lang])) { |
302 | unset($this->_lang_db[$omit_lang]); | 305 | unset($this->_lang_db[$omit_lang]); |
303 | $deleted++; | 306 | $deleted++; |
304 | } | 307 | } |
305 | } | 308 | } |
306 | } | 309 | } |
307 | 310 | ||
308 | // deleting all except the given languages | ||
309 | } else { | 311 | } else { |
312 | // deleting all except the given languages | ||
310 | if (!is_array($omit_list)) { | 313 | if (!is_array($omit_list)) { |
311 | $omit_list = array($omit_list); | 314 | $omit_list = array($omit_list); |
312 | } | 315 | } |
@@ -327,7 +330,7 @@ class Text_LanguageDetect | |||
327 | // reset the cluster cache if the number of languages changes | 330 | // reset the cluster cache if the number of languages changes |
328 | // this will then have to be recalculated | 331 | // this will then have to be recalculated |
329 | if (isset($this->_clusters) && $deleted > 0) { | 332 | if (isset($this->_clusters) && $deleted > 0) { |
330 | unset($this->_clusters); | 333 | $this->_clusters = null; |
331 | } | 334 | } |
332 | 335 | ||
333 | return $deleted; | 336 | return $deleted; |
@@ -339,49 +342,40 @@ class Text_LanguageDetect | |||
339 | * | 342 | * |
340 | * @access public | 343 | * @access public |
341 | * @return int the number of languages | 344 | * @return int the number of languages |
342 | * @throws PEAR_Error | 345 | * @throws Text_LanguageDetect_Exception |
343 | */ | 346 | */ |
344 | function getLanguageCount() | 347 | function getLanguageCount() |
345 | { | 348 | { |
346 | if (!$this->_setup_ok($err)) { | 349 | return count($this->_lang_db); |
347 | return $err; | ||
348 | } else { | ||
349 | return count($this->_lang_db); | ||
350 | } | ||
351 | } | 350 | } |
352 | 351 | ||
353 | /** | 352 | /** |
354 | * Returns true if a given language exists | 353 | * Checks if the language with the given name exists in the database |
355 | * | 354 | * |
356 | * If passed an array of names, will return true only if all exist | 355 | * @param mixed $lang Language name or array of language names |
357 | * | 356 | * |
358 | * @access public | 357 | * @return bool true if language model exists |
359 | * @param mixed $lang language name or array of language names | ||
360 | * @return bool true if language model exists | ||
361 | * @throws PEAR_Error | ||
362 | */ | 358 | */ |
363 | function languageExists($lang) | 359 | public function languageExists($lang) |
364 | { | 360 | { |
365 | if (!$this->_setup_ok($err)) { | 361 | $lang = $this->_convertFromNameMode($lang); |
366 | return $err; | ||
367 | } else { | ||
368 | // string | ||
369 | if (is_string($lang)) { | ||
370 | return isset($this->_lang_db[strtolower($lang)]); | ||
371 | |||
372 | // array | ||
373 | } elseif (is_array($lang)) { | ||
374 | foreach ($lang as $test_lang) { | ||
375 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
376 | return false; | ||
377 | } | ||
378 | } | ||
379 | return true; | ||
380 | 362 | ||
381 | // other (error) | 363 | if (is_string($lang)) { |
382 | } else { | 364 | return isset($this->_lang_db[strtolower($lang)]); |
383 | throw new Exception('Unknown type passed to languageExists()'); | 365 | |
366 | } elseif (is_array($lang)) { | ||
367 | foreach ($lang as $test_lang) { | ||
368 | if (!isset($this->_lang_db[strtolower($test_lang)])) { | ||
369 | return false; | ||
370 | } | ||
384 | } | 371 | } |
372 | return true; | ||
373 | |||
374 | } else { | ||
375 | throw new Text_LanguageDetect_Exception( | ||
376 | 'Unsupported parameter type passed to languageExists()', | ||
377 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
378 | ); | ||
385 | } | 379 | } |
386 | } | 380 | } |
387 | 381 | ||
@@ -389,25 +383,24 @@ class Text_LanguageDetect | |||
389 | * Returns the list of detectable languages | 383 | * Returns the list of detectable languages |
390 | * | 384 | * |
391 | * @access public | 385 | * @access public |
392 | * @return array the names of the languages known to this object | 386 | * @return array the names of the languages known to this object |
393 | * @throws PEAR_Error | 387 | * @throws Text_LanguageDetect_Exception |
394 | */ | 388 | */ |
395 | function getLanguages() | 389 | function getLanguages() |
396 | { | 390 | { |
397 | if (!$this->_setup_ok($err)) { | 391 | return $this->_convertToNameMode( |
398 | return $err; | 392 | array_keys($this->_lang_db) |
399 | } else { | 393 | ); |
400 | return array_keys($this->_lang_db); | ||
401 | } | ||
402 | } | 394 | } |
403 | 395 | ||
404 | /** | 396 | /** |
405 | * Make this object behave like Language::Guess | 397 | * Make this object behave like Language::Guess |
406 | * | 398 | * |
407 | * @access public | 399 | * @param bool $setting false to turn off perl compatibility |
408 | * @param bool $setting false to turn off perl compatibility | 400 | * |
401 | * @return void | ||
409 | */ | 402 | */ |
410 | function setPerlCompatible($setting = true) | 403 | public function setPerlCompatible($setting = true) |
411 | { | 404 | { |
412 | if (is_bool($setting)) { // input check | 405 | if (is_bool($setting)) { // input check |
413 | $this->_perl_compatible = $setting; | 406 | $this->_perl_compatible = $setting; |
@@ -422,6 +415,21 @@ class Text_LanguageDetect | |||
422 | } | 415 | } |
423 | 416 | ||
424 | /** | 417 | /** |
418 | * Sets the way how language names are accepted and returned. | ||
419 | * | ||
420 | * @param integer $name_mode One of the following modes: | ||
421 | * 0 - language name ("english") | ||
422 | * 2 - 2-letter ISO 639-1 code ("en") | ||
423 | * 3 - 3-letter ISO 639-2 code ("eng") | ||
424 | * | ||
425 | * @return void | ||
426 | */ | ||
427 | function setNameMode($name_mode) | ||
428 | { | ||
429 | $this->_name_mode = $name_mode; | ||
430 | } | ||
431 | |||
432 | /** | ||
425 | * Whether to use unicode block ranges in detection | 433 | * Whether to use unicode block ranges in detection |
426 | * | 434 | * |
427 | * Should speed up most detections if turned on (default is on). In some | 435 | * Should speed up most detections if turned on (default is on). In some |
@@ -429,10 +437,11 @@ class Text_LanguageDetect | |||
429 | * in languages that use latin scripts. In other cases it should speed up | 437 | * in languages that use latin scripts. In other cases it should speed up |
430 | * detection noticeably. | 438 | * detection noticeably. |
431 | * | 439 | * |
432 | * @access public | 440 | * @param bool $setting false to turn off |
433 | * @param bool $setting false to turn off | 441 | * |
442 | * @return void | ||
434 | */ | 443 | */ |
435 | function useUnicodeBlocks($setting = true) | 444 | public function useUnicodeBlocks($setting = true) |
436 | { | 445 | { |
437 | if (is_bool($setting)) { | 446 | if (is_bool($setting)) { |
438 | $this->_use_unicode_narrowing = $setting; | 447 | $this->_use_unicode_narrowing = $setting; |
@@ -442,15 +451,15 @@ class Text_LanguageDetect | |||
442 | /** | 451 | /** |
443 | * Converts a piece of text into trigrams | 452 | * Converts a piece of text into trigrams |
444 | * | 453 | * |
445 | * Superceded by the Text_LanguageDetect_Parser class | 454 | * @param string $text text to convert |
446 | * | 455 | * |
447 | * @access private | 456 | * @return array array of trigram frequencies |
448 | * @param string $text text to convert | 457 | * @access private |
449 | * @return array array of trigram frequencies | 458 | * @deprecated Superseded by the Text_LanguageDetect_Parser class |
450 | */ | 459 | */ |
451 | function _trigram($text) | 460 | function _trigram($text) |
452 | { | 461 | { |
453 | $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); | 462 | $s = new Text_LanguageDetect_Parser($text); |
454 | $s->prepareTrigram(); | 463 | $s->prepareTrigram(); |
455 | $s->prepareUnicode(false); | 464 | $s->prepareUnicode(false); |
456 | $s->setPadStart(!$this->_perl_compatible); | 465 | $s->setPadStart(!$this->_perl_compatible); |
@@ -463,11 +472,12 @@ class Text_LanguageDetect | |||
463 | * | 472 | * |
464 | * Thresholds (cuts off) the list at $this->_threshold | 473 | * Thresholds (cuts off) the list at $this->_threshold |
465 | * | 474 | * |
466 | * @access protected | 475 | * @param array $arr array of trigram |
467 | * @param array $arr array of trgram | 476 | * |
468 | * @return array ranks of trigrams | 477 | * @return array ranks of trigrams |
478 | * @access protected | ||
469 | */ | 479 | */ |
470 | function _arr_rank(&$arr) | 480 | function _arr_rank($arr) |
471 | { | 481 | { |
472 | 482 | ||
473 | // sorts alphabetically first as a standard way of breaking rank ties | 483 | // sorts alphabetically first as a standard way of breaking rank ties |
@@ -494,14 +504,17 @@ class Text_LanguageDetect | |||
494 | 504 | ||
495 | /** | 505 | /** |
496 | * Sorts an array by value breaking ties alphabetically | 506 | * Sorts an array by value breaking ties alphabetically |
497 | * | 507 | * |
498 | * @access private | 508 | * @param array &$arr the array to sort |
499 | * @param array &$arr the array to sort | 509 | * |
510 | * @return void | ||
511 | * @access private | ||
500 | */ | 512 | */ |
501 | function _bub_sort(&$arr) | 513 | function _bub_sort(&$arr) |
502 | { | 514 | { |
503 | // should do the same as this perl statement: | 515 | // should do the same as this perl statement: |
504 | // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | 516 | // sort { $trigrams{$b} == $trigrams{$a} |
517 | // ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } | ||
505 | 518 | ||
506 | // needs to sort by both key and value at once | 519 | // needs to sort by both key and value at once |
507 | // using the key to break ties for the value | 520 | // using the key to break ties for the value |
@@ -528,13 +541,14 @@ class Text_LanguageDetect | |||
528 | /** | 541 | /** |
529 | * Sort function used by bubble sort | 542 | * Sort function used by bubble sort |
530 | * | 543 | * |
531 | * Callback function for usort(). | 544 | * Callback function for usort(). |
532 | * | 545 | * |
533 | * @access private | 546 | * @param array $a first param passed by usort() |
534 | * @param array first param passed by usort() | 547 | * @param array $b second param passed by usort() |
535 | * @param array second param passed by usort() | 548 | * |
536 | * @return int 1 if $a is greater, -1 if not | 549 | * @return int 1 if $a is greater, -1 if not |
537 | * @see _bub_sort() | 550 | * @see _bub_sort() |
551 | * @access private | ||
538 | */ | 552 | */ |
539 | function _sort_func($a, $b) | 553 | function _sort_func($a, $b) |
540 | { | 554 | { |
@@ -542,12 +556,12 @@ class Text_LanguageDetect | |||
542 | list($a_key, $a_value) = $a; | 556 | list($a_key, $a_value) = $a; |
543 | list($b_key, $b_value) = $b; | 557 | list($b_key, $b_value) = $b; |
544 | 558 | ||
545 | // if the values are the same, break ties using the key | ||
546 | if ($a_value == $b_value) { | 559 | if ($a_value == $b_value) { |
560 | // if the values are the same, break ties using the key | ||
547 | return strcmp($a_key, $b_key); | 561 | return strcmp($a_key, $b_key); |
548 | 562 | ||
549 | // if not, just sort normally | ||
550 | } else { | 563 | } else { |
564 | // if not, just sort normally | ||
551 | if ($a_value > $b_value) { | 565 | if ($a_value > $b_value) { |
552 | return -1; | 566 | return -1; |
553 | } else { | 567 | } else { |
@@ -559,23 +573,24 @@ class Text_LanguageDetect | |||
559 | } | 573 | } |
560 | 574 | ||
561 | /** | 575 | /** |
562 | * Calculates a linear rank-order distance statistic between two sets of | 576 | * Calculates a linear rank-order distance statistic between two sets of |
563 | * ranked trigrams | 577 | * ranked trigrams |
564 | * | 578 | * |
565 | * Sums the differences in rank for each trigram. If the trigram does not | 579 | * Sums the differences in rank for each trigram. If the trigram does not |
566 | * appear in both, consider it a difference of $this->_threshold. | 580 | * appear in both, consider it a difference of $this->_threshold. |
567 | * | 581 | * |
568 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite | 582 | * This distance measure was proposed by Cavnar & Trenkle (1994). Despite |
569 | * its simplicity it has been shown to be highly accurate for language | 583 | * its simplicity it has been shown to be highly accurate for language |
570 | * identification tasks. | 584 | * identification tasks. |
571 | * | 585 | * |
572 | * @access private | 586 | * @param array $arr1 the reference set of trigram ranks |
573 | * @param array $arr1 the reference set of trigram ranks | 587 | * @param array $arr2 the target set of trigram ranks |
574 | * @param array $arr2 the target set of trigram ranks | 588 | * |
575 | * @return int the sum of the differences between the ranks of | 589 | * @return int the sum of the differences between the ranks of |
576 | * the two trigram sets | 590 | * the two trigram sets |
591 | * @access private | ||
577 | */ | 592 | */ |
578 | function _distance(&$arr1, &$arr2) | 593 | function _distance($arr1, $arr2) |
579 | { | 594 | { |
580 | $sumdist = 0; | 595 | $sumdist = 0; |
581 | 596 | ||
@@ -598,14 +613,15 @@ class Text_LanguageDetect | |||
598 | 613 | ||
599 | /** | 614 | /** |
600 | * Normalizes the score returned by _distance() | 615 | * Normalizes the score returned by _distance() |
601 | * | 616 | * |
602 | * Different if perl compatible or not | 617 | * Different if perl compatible or not |
603 | * | 618 | * |
604 | * @access private | 619 | * @param int $score the score from _distance() |
605 | * @param int $score the score from _distance() | 620 | * @param int $base_count the number of trigrams being considered |
606 | * @param int $base_count the number of trigrams being considered | 621 | * |
607 | * @return float the normalized score | 622 | * @return float the normalized score |
608 | * @see _distance() | 623 | * @see _distance() |
624 | * @access private | ||
609 | */ | 625 | */ |
610 | function _normalize_score($score, $base_count = null) | 626 | function _normalize_score($score, $base_count = null) |
611 | { | 627 | { |
@@ -630,29 +646,24 @@ class Text_LanguageDetect | |||
630 | * | 646 | * |
631 | * If perl compatible, the score is 300-0, 0 being most similar. | 647 | * If perl compatible, the score is 300-0, 0 being most similar. |
632 | * Otherwise, it's 0-1 with 1 being most similar. | 648 | * Otherwise, it's 0-1 with 1 being most similar. |
633 | * | 649 | * |
634 | * The $sample text should be at least a few sentences in length; | 650 | * The $sample text should be at least a few sentences in length; |
635 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension | 651 | * should be ascii-7 or utf8 encoded, if another and the mbstring extension |
636 | * is present it will try to detect and convert. However, experience has | 652 | * is present it will try to detect and convert. However, experience has |
637 | * shown that mb_detect_encoding() *does not work very well* with at least | 653 | * shown that mb_detect_encoding() *does not work very well* with at least |
638 | * some types of encoding. | 654 | * some types of encoding. |
639 | * | 655 | * |
640 | * @access public | 656 | * @param string $sample a sample of text to compare. |
641 | * @param string $sample a sample of text to compare. | 657 | * @param int $limit if specified, return an array of the most likely |
642 | * @param int $limit if specified, return an array of the most likely | 658 | * $limit languages and their scores. |
643 | * $limit languages and their scores. | 659 | * |
644 | * @return mixed sorted array of language scores, blank array if no | 660 | * @return mixed sorted array of language scores, blank array if no |
645 | * useable text was found, or PEAR_Error if error | 661 | * useable text was found |
646 | * with the object setup | 662 | * @see _distance() |
647 | * @see _distance() | 663 | * @throws Text_LanguageDetect_Exception |
648 | * @throws PEAR_Error | ||
649 | */ | 664 | */ |
650 | function detect($sample, $limit = 0) | 665 | public function detect($sample, $limit = 0) |
651 | { | 666 | { |
652 | if (!$this->_setup_ok($err)) { | ||
653 | return $err; | ||
654 | } | ||
655 | |||
656 | // input check | 667 | // input check |
657 | if (!Text_LanguageDetect_Parser::validateString($sample)) { | 668 | if (!Text_LanguageDetect_Parser::validateString($sample)) { |
658 | return array(); | 669 | return array(); |
@@ -660,36 +671,27 @@ class Text_LanguageDetect | |||
660 | 671 | ||
661 | // check char encoding | 672 | // check char encoding |
662 | // (only if mbstring extension is compiled and PHP > 4.0.6) | 673 | // (only if mbstring extension is compiled and PHP > 4.0.6) |
663 | if (function_exists('mb_detect_encoding') | 674 | if (function_exists('mb_detect_encoding') |
664 | && function_exists('mb_convert_encoding')) { | 675 | && function_exists('mb_convert_encoding') |
665 | 676 | ) { | |
666 | // mb_detect_encoding isn't very reliable, to say the least | 677 | // mb_detect_encoding isn't very reliable, to say the least |
667 | // detection should still work with a sufficient sample of ascii characters | 678 | // detection should still work with a sufficient sample |
679 | // of ascii characters | ||
668 | $encoding = mb_detect_encoding($sample); | 680 | $encoding = mb_detect_encoding($sample); |
669 | 681 | ||
670 | // mb_detect_encoding() will return FALSE if detection fails | 682 | // mb_detect_encoding() will return FALSE if detection fails |
671 | // don't attempt conversion if that's the case | 683 | // don't attempt conversion if that's the case |
672 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { | 684 | if ($encoding != 'ASCII' && $encoding != 'UTF-8' |
673 | 685 | && $encoding !== false | |
674 | if (function_exists('mb_list_encodings')) { | 686 | ) { |
675 | 687 | // verify the encoding exists in mb_list_encodings | |
676 | // verify the encoding exists in mb_list_encodings | 688 | if (in_array($encoding, mb_list_encodings())) { |
677 | if (in_array($encoding, mb_list_encodings())) { | 689 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); |
678 | $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
679 | } | ||
680 | |||
681 | // if the previous condition failed: | ||
682 | // somehow we detected an encoding that also we don't support | ||
683 | |||
684 | } else { | ||
685 | // php 4 doesnt have mb_list_encodings() | ||
686 | // so attempt with error suppression | ||
687 | $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding); | ||
688 | } | 690 | } |
689 | } | 691 | } |
690 | } | 692 | } |
691 | 693 | ||
692 | $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); | 694 | $sample_obj = new Text_LanguageDetect_Parser($sample); |
693 | $sample_obj->prepareTrigram(); | 695 | $sample_obj->prepareTrigram(); |
694 | if ($this->_use_unicode_narrowing) { | 696 | if ($this->_use_unicode_narrowing) { |
695 | $sample_obj->prepareUnicode(); | 697 | $sample_obj->prepareUnicode(); |
@@ -713,7 +715,10 @@ class Text_LanguageDetect | |||
713 | if (is_array($blocks)) { | 715 | if (is_array($blocks)) { |
714 | $present_blocks = array_keys($blocks); | 716 | $present_blocks = array_keys($blocks); |
715 | } else { | 717 | } else { |
716 | throw new Exception('Error during block detection'); | 718 | throw new Text_LanguageDetect_Exception( |
719 | 'Error during block detection', | ||
720 | Text_LanguageDetect_Exception::BLOCK_DETECTION | ||
721 | ); | ||
717 | } | 722 | } |
718 | 723 | ||
719 | $possible_langs = array(); | 724 | $possible_langs = array(); |
@@ -731,30 +736,30 @@ class Text_LanguageDetect | |||
731 | } | 736 | } |
732 | 737 | ||
733 | // could also try an intersect operation rather than a union | 738 | // could also try an intersect operation rather than a union |
734 | // in other words, choose languages whose trigrams contain | 739 | // in other words, choose languages whose trigrams contain |
735 | // ALL of the unicode blocks found in this sample | 740 | // ALL of the unicode blocks found in this sample |
736 | // would improve speed but would be completely thrown off by an | 741 | // would improve speed but would be completely thrown off by an |
737 | // unexpected character, like an umlaut appearing in english text | 742 | // unexpected character, like an umlaut appearing in english text |
738 | 743 | ||
739 | $possible_langs = array_intersect( | 744 | $possible_langs = array_intersect( |
740 | array_keys($this->_lang_db), | 745 | array_keys($this->_lang_db), |
741 | array_unique($possible_langs) | 746 | array_unique($possible_langs) |
742 | ); | 747 | ); |
743 | 748 | ||
744 | // needs to intersect it with the keys of _lang_db in case | 749 | // needs to intersect it with the keys of _lang_db in case |
745 | // languages have been omitted | 750 | // languages have been omitted |
746 | 751 | ||
747 | // or just try 'em all | ||
748 | } else { | 752 | } else { |
753 | // or just try 'em all | ||
749 | $possible_langs = array_keys($this->_lang_db); | 754 | $possible_langs = array_keys($this->_lang_db); |
750 | } | 755 | } |
751 | 756 | ||
752 | 757 | ||
753 | foreach ($possible_langs as $lang) { | 758 | foreach ($possible_langs as $lang) { |
754 | $scores[$lang] = | 759 | $scores[$lang] = $this->_normalize_score( |
755 | $this->_normalize_score( | 760 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), |
756 | $this->_distance($this->_lang_db[$lang], $trigram_freqs), | 761 | $trigram_count |
757 | $trigram_count); | 762 | ); |
758 | } | 763 | } |
759 | 764 | ||
760 | unset($sample_obj); | 765 | unset($sample_obj); |
@@ -772,7 +777,6 @@ class Text_LanguageDetect | |||
772 | $limited_scores = array(); | 777 | $limited_scores = array(); |
773 | 778 | ||
774 | $i = 0; | 779 | $i = 0; |
775 | |||
776 | foreach ($scores as $key => $value) { | 780 | foreach ($scores as $key => $value) { |
777 | if ($i++ >= $limit) { | 781 | if ($i++ >= $limit) { |
778 | break; | 782 | break; |
@@ -781,9 +785,9 @@ class Text_LanguageDetect | |||
781 | $limited_scores[$key] = $value; | 785 | $limited_scores[$key] = $value; |
782 | } | 786 | } |
783 | 787 | ||
784 | return $limited_scores; | 788 | return $this->_convertToNameMode($limited_scores, true); |
785 | } else { | 789 | } else { |
786 | return $scores; | 790 | return $this->_convertToNameMode($scores, true); |
787 | } | 791 | } |
788 | } | 792 | } |
789 | 793 | ||
@@ -791,35 +795,33 @@ class Text_LanguageDetect | |||
791 | * Returns only the most similar language to the text sample | 795 | * Returns only the most similar language to the text sample |
792 | * | 796 | * |
793 | * Calls $this->detect() and returns only the top result | 797 | * Calls $this->detect() and returns only the top result |
794 | * | 798 | * |
795 | * @access public | 799 | * @param string $sample text to detect the language of |
796 | * @param string $sample text to detect the language of | 800 | * |
797 | * @return string the name of the most likely language | 801 | * @return string the name of the most likely language |
798 | * or null if no language is similar | 802 | * or null if no language is similar |
799 | * @see detect() | 803 | * @see detect() |
800 | * @throws PEAR_Error | 804 | * @throws Text_LanguageDetect_Exception |
801 | */ | 805 | */ |
802 | function detectSimple($sample) | 806 | public function detectSimple($sample) |
803 | { | 807 | { |
804 | $scores = $this->detect($sample, 1); | 808 | $scores = $this->detect($sample, 1); |
805 | 809 | ||
806 | // if top language has the maximum possible score, | 810 | // if top language has the maximum possible score, |
807 | // then the top score will have been picked at random | 811 | // then the top score will have been picked at random |
808 | if ( !is_array($scores) | 812 | if (!is_array($scores) || empty($scores) |
809 | || empty($scores) | 813 | || current($scores) == $this->_max_score |
810 | || current($scores) == $this->_max_score) { | 814 | ) { |
811 | |||
812 | return null; | 815 | return null; |
813 | |||
814 | } else { | 816 | } else { |
815 | return ucfirst(key($scores)); | 817 | return key($scores); |
816 | } | 818 | } |
817 | } | 819 | } |
818 | 820 | ||
819 | /** | 821 | /** |
820 | * Returns an array containing the most similar language and a confidence | 822 | * Returns an array containing the most similar language and a confidence |
821 | * rating | 823 | * rating |
822 | * | 824 | * |
823 | * Confidence is a simple measure calculated from the similarity score | 825 | * Confidence is a simple measure calculated from the similarity score |
824 | * minus the similarity score from the next most similar language | 826 | * minus the similarity score from the next most similar language |
825 | * divided by the highest possible score. Languages that have closely | 827 | * divided by the highest possible score. Languages that have closely |
@@ -827,46 +829,43 @@ class Text_LanguageDetect | |||
827 | * confidence scores. | 829 | * confidence scores. |
828 | * | 830 | * |
829 | * The similarity score answers the question "How likely is the text the | 831 | * The similarity score answers the question "How likely is the text the |
830 | * returned language regardless of the other languages considered?" The | 832 | * returned language regardless of the other languages considered?" The |
831 | * confidence score is one way of answering the question "how likely is the | 833 | * confidence score is one way of answering the question "how likely is the |
832 | * text the detected language relative to the rest of the language model | 834 | * text the detected language relative to the rest of the language model |
833 | * set?" | 835 | * set?" |
834 | * | 836 | * |
835 | * To see how similar languages are a priori, see languageSimilarity() | 837 | * To see how similar languages are a priori, see languageSimilarity() |
836 | * | 838 | * |
837 | * @access public | 839 | * @param string $sample text for which language will be detected |
838 | * @param string $sample text for which language will be detected | 840 | * |
839 | * @return array most similar language, score and confidence rating | 841 | * @return array most similar language, score and confidence rating |
840 | * or null if no language is similar | 842 | * or null if no language is similar |
841 | * @see detect() | 843 | * @see detect() |
842 | * @throws PEAR_Error | 844 | * @throws Text_LanguageDetect_Exception |
843 | */ | 845 | */ |
844 | function detectConfidence($sample) | 846 | public function detectConfidence($sample) |
845 | { | 847 | { |
846 | $scores = $this->detect($sample, 2); | 848 | $scores = $this->detect($sample, 2); |
847 | 849 | ||
848 | // if most similar language has the max score, it | 850 | // if most similar language has the max score, it |
849 | // will have been picked at random | 851 | // will have been picked at random |
850 | if ( !is_array($scores) | 852 | if (!is_array($scores) || empty($scores) |
851 | || empty($scores) | 853 | || current($scores) == $this->_max_score |
852 | || current($scores) == $this->_max_score) { | 854 | ) { |
853 | |||
854 | return null; | 855 | return null; |
855 | } | 856 | } |
856 | 857 | ||
857 | $arr['language'] = ucfirst(key($scores)); | 858 | $arr['language'] = key($scores); |
858 | $arr['similarity'] = current($scores); | 859 | $arr['similarity'] = current($scores); |
859 | if (next($scores) !== false) { // if false then no next element | 860 | if (next($scores) !== false) { // if false then no next element |
860 | // the goal is to return a higher value if the distance between | 861 | // the goal is to return a higher value if the distance between |
861 | // the similarity of the first score and the second score is high | 862 | // the similarity of the first score and the second score is high |
862 | 863 | ||
863 | if ($this->_perl_compatible) { | 864 | if ($this->_perl_compatible) { |
864 | 865 | $arr['confidence'] = (current($scores) - $arr['similarity']) | |
865 | $arr['confidence'] = | 866 | / $this->_max_score; |
866 | (current($scores) - $arr['similarity']) / $this->_max_score; | ||
867 | 867 | ||
868 | } else { | 868 | } else { |
869 | |||
870 | $arr['confidence'] = $arr['similarity'] - current($scores); | 869 | $arr['confidence'] = $arr['similarity'] - current($scores); |
871 | 870 | ||
872 | } | 871 | } |
@@ -882,32 +881,26 @@ class Text_LanguageDetect | |||
882 | * Returns the distribution of unicode blocks in a given utf8 string | 881 | * Returns the distribution of unicode blocks in a given utf8 string |
883 | * | 882 | * |
884 | * For the block name of a single char, use unicodeBlockName() | 883 | * For the block name of a single char, use unicodeBlockName() |
885 | * | 884 | * |
886 | * @access public | 885 | * @param string $str input string. Must be ascii or utf8 |
887 | * @param string $str input string. Must be ascii or utf8 | 886 | * @param bool $skip_symbols if true, skip ascii digits, symbols and |
888 | * @param bool $skip_symbols if true, skip ascii digits, symbols and | 887 | * non-printing characters. Includes spaces, |
889 | * non-printing characters. Includes spaces, | 888 | * newlines and common punctutation characters. |
890 | * newlines and common punctutation characters. | 889 | * |
891 | * @return array | 890 | * @return array |
892 | * @throws PEAR_Error | 891 | * @throws Text_LanguageDetect_Exception |
893 | */ | 892 | */ |
894 | function detectUnicodeBlocks($str, $skip_symbols) | 893 | public function detectUnicodeBlocks($str, $skip_symbols) |
895 | { | 894 | { |
896 | // input check | 895 | $skip_symbols = (bool)$skip_symbols; |
897 | if (!is_bool($skip_symbols)) { | 896 | $str = (string)$str; |
898 | throw new Exception('Second parameter must be boolean'); | ||
899 | } | ||
900 | |||
901 | if (!is_string($str)) { | ||
902 | throw new Exception('First parameter was not a string'); | ||
903 | } | ||
904 | 897 | ||
905 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 898 | $sample_obj = new Text_LanguageDetect_Parser($str); |
906 | $sample_obj->prepareUnicode(); | 899 | $sample_obj->prepareUnicode(); |
907 | $sample_obj->prepareTrigram(false); | 900 | $sample_obj->prepareTrigram(false); |
908 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); | 901 | $sample_obj->setUnicodeSkipSymbols($skip_symbols); |
909 | $sample_obj->analyze(); | 902 | $sample_obj->analyze(); |
910 | $blocks =& $sample_obj->getUnicodeBlocks(); | 903 | $blocks = $sample_obj->getUnicodeBlocks(); |
911 | unset($sample_obj); | 904 | unset($sample_obj); |
912 | return $blocks; | 905 | return $blocks; |
913 | } | 906 | } |
@@ -915,38 +908,37 @@ class Text_LanguageDetect | |||
915 | /** | 908 | /** |
916 | * Returns the block name for a given unicode value | 909 | * Returns the block name for a given unicode value |
917 | * | 910 | * |
918 | * If passed a string, will assume it is being passed a UTF8-formatted | 911 | * If passed a string, will assume it is being passed a UTF8-formatted |
919 | * character and will automatically convert. Otherwise it will assume it | 912 | * character and will automatically convert. Otherwise it will assume it |
920 | * is being passed a numeric unicode value. | 913 | * is being passed a numeric unicode value. |
921 | * | 914 | * |
922 | * Make sure input is of the correct type! | 915 | * Make sure input is of the correct type! |
923 | * | 916 | * |
924 | * @access public | ||
925 | * @param mixed $unicode unicode value or utf8 char | 917 | * @param mixed $unicode unicode value or utf8 char |
918 | * | ||
926 | * @return mixed the block name string or false if not found | 919 | * @return mixed the block name string or false if not found |
927 | * @throws PEAR_Error | 920 | * @throws Text_LanguageDetect_Exception |
928 | */ | 921 | */ |
929 | function unicodeBlockName($unicode) { | 922 | public function unicodeBlockName($unicode) |
923 | { | ||
930 | if (is_string($unicode)) { | 924 | if (is_string($unicode)) { |
931 | // assume it is being passed a utf8 char, so convert it | 925 | // assume it is being passed a utf8 char, so convert it |
932 | 926 | if (self::utf8strlen($unicode) > 1) { | |
933 | // input check | 927 | throw new Text_LanguageDetect_Exception( |
934 | if ($this->utf8strlen($unicode) > 1) { | 928 | 'Pass a single char only to this method', |
935 | throw new Exception('Pass this function only a single char'); | 929 | Text_LanguageDetect_Exception::PARAM_TYPE |
930 | ); | ||
936 | } | 931 | } |
937 | |||
938 | $unicode = $this->_utf8char2unicode($unicode); | 932 | $unicode = $this->_utf8char2unicode($unicode); |
939 | 933 | ||
940 | if ($unicode == -1) { | ||
941 | throw new Exception('Malformatted char'); | ||
942 | } | ||
943 | |||
944 | // input check | ||
945 | } elseif (!is_int($unicode)) { | 934 | } elseif (!is_int($unicode)) { |
946 | throw new Exception('Input must be of type string or int.'); | 935 | throw new Text_LanguageDetect_Exception( |
936 | 'Input must be of type string or int.', | ||
937 | Text_LanguageDetect_Exception::PARAM_TYPE | ||
938 | ); | ||
947 | } | 939 | } |
948 | 940 | ||
949 | $blocks =& $this->_read_unicode_block_db(); | 941 | $blocks = $this->_read_unicode_block_db(); |
950 | 942 | ||
951 | $result = $this->_unicode_block_name($unicode, $blocks); | 943 | $result = $this->_unicode_block_name($unicode, $blocks); |
952 | 944 | ||
@@ -964,14 +956,17 @@ class Text_LanguageDetect | |||
964 | * the public interface for this function, which does input checks which | 956 | * the public interface for this function, which does input checks which |
965 | * this function omits for speed. | 957 | * this function omits for speed. |
966 | * | 958 | * |
967 | * @access protected | 959 | * @param int $unicode the unicode value |
968 | * @param int $unicode the unicode value | 960 | * @param array $blocks the block database |
969 | * @param array &$blocks the block database | 961 | * @param int $block_count the number of defined blocks in the database |
970 | * @param int $block_count the number of defined blocks in the database | 962 | * |
971 | * @see unicodeBlockName() | 963 | * @return mixed Block name, -1 if it failed |
964 | * @see unicodeBlockName() | ||
965 | * @access protected | ||
972 | */ | 966 | */ |
973 | function _unicode_block_name($unicode, &$blocks, $block_count = -1) { | 967 | function _unicode_block_name($unicode, $blocks, $block_count = -1) |
974 | // for a reference, see | 968 | { |
969 | // for a reference, see | ||
975 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt | 970 | // http://www.unicode.org/Public/UNIDATA/Blocks.txt |
976 | 971 | ||
977 | // assume that ascii characters are the most common | 972 | // assume that ascii characters are the most common |
@@ -994,35 +989,36 @@ class Text_LanguageDetect | |||
994 | while ($low <= $high) { | 989 | while ($low <= $high) { |
995 | $mid = floor(($low + $high) / 2); | 990 | $mid = floor(($low + $high) / 2); |
996 | 991 | ||
997 | // if it's lower than the lower bound | ||
998 | if ($unicode < $blocks[$mid][0]) { | 992 | if ($unicode < $blocks[$mid][0]) { |
993 | // if it's lower than the lower bound | ||
999 | $high = $mid - 1; | 994 | $high = $mid - 1; |
1000 | 995 | ||
1001 | // if it's higher than the upper bound | ||
1002 | } elseif ($unicode > $blocks[$mid][1]) { | 996 | } elseif ($unicode > $blocks[$mid][1]) { |
997 | // if it's higher than the upper bound | ||
1003 | $low = $mid + 1; | 998 | $low = $mid + 1; |
1004 | 999 | ||
1005 | // found it | ||
1006 | } else { | 1000 | } else { |
1001 | // found it | ||
1007 | return $blocks[$mid]; | 1002 | return $blocks[$mid]; |
1008 | } | 1003 | } |
1009 | } | 1004 | } |
1010 | 1005 | ||
1011 | // failed to find the block | 1006 | // failed to find the block |
1012 | return -1; | 1007 | return -1; |
1013 | 1008 | ||
1014 | // todo: differentiate when it's out of range or when it falls | 1009 | // todo: differentiate when it's out of range or when it falls |
1015 | // into an unassigned range? | 1010 | // into an unassigned range? |
1016 | } | 1011 | } |
1017 | 1012 | ||
1018 | /** | 1013 | /** |
1019 | * Brings up the unicode block database | 1014 | * Brings up the unicode block database |
1020 | * | 1015 | * |
1021 | * @access protected | ||
1022 | * @return array the database of unicode block definitions | 1016 | * @return array the database of unicode block definitions |
1023 | * @throws PEAR_Error | 1017 | * @throws Text_LanguageDetect_Exception |
1018 | * @access protected | ||
1024 | */ | 1019 | */ |
1025 | function &_read_unicode_block_db() { | 1020 | function _read_unicode_block_db() |
1021 | { | ||
1026 | // since the unicode definitions are always going to be the same, | 1022 | // since the unicode definitions are always going to be the same, |
1027 | // might as well share the memory for the db with all other instances | 1023 | // might as well share the memory for the db with all other instances |
1028 | // of this class | 1024 | // of this class |
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect | |||
1037 | 1033 | ||
1038 | /** | 1034 | /** |
1039 | * Calculate the similarities between the language models | 1035 | * Calculate the similarities between the language models |
1040 | * | 1036 | * |
1041 | * Use this function to see how similar languages are to each other. | 1037 | * Use this function to see how similar languages are to each other. |
1042 | * | 1038 | * |
1043 | * If passed 2 language names, will return just those languages compared. | 1039 | * If passed 2 language names, will return just those languages compared. |
1044 | * If passed 1 language name, will return that language compared to | 1040 | * If passed 1 language name, will return that language compared to |
1045 | * all others. | 1041 | * all others. |
1046 | * If passed none, will return an array of every language model compared | 1042 | * If passed none, will return an array of every language model compared |
1047 | * to every other one. | 1043 | * to every other one. |
1048 | * | 1044 | * |
1049 | * @access public | 1045 | * @param string $lang1 the name of the first language to be compared |
1050 | * @param string $lang1 the name of the first language to be compared | 1046 | * @param string $lang2 the name of the second language to be compared |
1051 | * @param string $lang2 the name of the second language to be compared | 1047 | * |
1052 | * @return array scores of every language compared | 1048 | * @return array scores of every language compared |
1053 | * or the score of just the provided languages | 1049 | * or the score of just the provided languages |
1054 | * or null if one of the supplied languages does not exist | 1050 | * or null if one of the supplied languages does not exist |
1055 | * @throws PEAR_Error | 1051 | * @throws Text_LanguageDetect_Exception |
1056 | */ | 1052 | */ |
1057 | function languageSimilarity($lang1 = null, $lang2 = null) | 1053 | public function languageSimilarity($lang1 = null, $lang2 = null) |
1058 | { | 1054 | { |
1059 | if (!$this->_setup_ok($err)) { | 1055 | $lang1 = $this->_convertFromNameMode($lang1); |
1060 | return $err; | 1056 | $lang2 = $this->_convertFromNameMode($lang2); |
1061 | } | ||
1062 | |||
1063 | if ($lang1 != null) { | 1057 | if ($lang1 != null) { |
1064 | $lang1 = strtolower($lang1); | 1058 | $lang1 = strtolower($lang1); |
1065 | 1059 | ||
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect | |||
1069 | } | 1063 | } |
1070 | 1064 | ||
1071 | if ($lang2 != null) { | 1065 | if ($lang2 != null) { |
1072 | 1066 | if (!isset($this->_lang_db[$lang2])) { | |
1073 | // can't only set the second param | 1067 | // check if language model exists |
1074 | if ($lang1 == null) { | ||
1075 | return null; | ||
1076 | // check if language model exists | ||
1077 | } elseif (!isset($this->_lang_db[$lang2])) { | ||
1078 | return null; | 1068 | return null; |
1079 | } | 1069 | } |
1080 | 1070 | ||
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect | |||
1088 | ) | 1078 | ) |
1089 | ); | 1079 | ); |
1090 | 1080 | ||
1091 | |||
1092 | // compare just $lang1 to all languages | ||
1093 | } else { | 1081 | } else { |
1082 | // compare just $lang1 to all languages | ||
1094 | $return_arr = array(); | 1083 | $return_arr = array(); |
1095 | foreach ($this->_lang_db as $key => $value) { | 1084 | foreach ($this->_lang_db as $key => $value) { |
1096 | if ($key != $lang1) { // don't compare a language to itself | 1085 | if ($key != $lang1) { |
1086 | // don't compare a language to itself | ||
1097 | $return_arr[$key] = $this->_normalize_score( | 1087 | $return_arr[$key] = $this->_normalize_score( |
1098 | $this->_distance($this->_lang_db[$lang1], $value)); | 1088 | $this->_distance($this->_lang_db[$lang1], $value) |
1089 | ); | ||
1099 | } | 1090 | } |
1100 | } | 1091 | } |
1101 | asort($return_arr); | 1092 | asort($return_arr); |
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect | |||
1104 | } | 1095 | } |
1105 | 1096 | ||
1106 | 1097 | ||
1107 | // compare all languages to each other | ||
1108 | } else { | 1098 | } else { |
1099 | // compare all languages to each other | ||
1109 | $return_arr = array(); | 1100 | $return_arr = array(); |
1110 | foreach (array_keys($this->_lang_db) as $lang1) { | 1101 | foreach (array_keys($this->_lang_db) as $lang1) { |
1111 | foreach (array_keys($this->_lang_db) as $lang2) { | 1102 | foreach (array_keys($this->_lang_db) as $lang2) { |
1112 | |||
1113 | // skip comparing languages to themselves | 1103 | // skip comparing languages to themselves |
1114 | if ($lang1 != $lang2) { | 1104 | if ($lang1 != $lang2) { |
1115 | |||
1116 | // don't re-calculate what's already been done | ||
1117 | if (isset($return_arr[$lang2][$lang1])) { | ||
1118 | 1105 | ||
1119 | $return_arr[$lang1][$lang2] = | 1106 | if (isset($return_arr[$lang2][$lang1])) { |
1120 | $return_arr[$lang2][$lang1]; | 1107 | // don't re-calculate what's already been done |
1108 | $return_arr[$lang1][$lang2] | ||
1109 | = $return_arr[$lang2][$lang1]; | ||
1121 | 1110 | ||
1122 | // calculate | ||
1123 | } else { | 1111 | } else { |
1124 | 1112 | // calculate | |
1125 | $return_arr[$lang1][$lang2] = | 1113 | $return_arr[$lang1][$lang2] |
1126 | $this->_normalize_score( | 1114 | = $this->_normalize_score( |
1127 | $this->_distance( | 1115 | $this->_distance( |
1128 | $this->_lang_db[$lang1], | 1116 | $this->_lang_db[$lang1], |
1129 | $this->_lang_db[$lang2] | 1117 | $this->_lang_db[$lang2] |
1130 | ) | 1118 | ) |
1131 | ); | 1119 | ); |
1132 | 1120 | ||
1133 | } | 1121 | } |
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect | |||
1150 | * | 1138 | * |
1151 | * @access public | 1139 | * @access public |
1152 | * @return array language cluster data | 1140 | * @return array language cluster data |
1153 | * @throws PEAR_Error | 1141 | * @throws Text_LanguageDetect_Exception |
1154 | * @see languageSimilarity() | 1142 | * @see languageSimilarity() |
1155 | * @deprecated this function will eventually be removed and placed into | 1143 | * @deprecated this function will eventually be removed and placed into |
1156 | * the model generation class | 1144 | * the model generation class |
1157 | */ | 1145 | */ |
1158 | function clusterLanguages() | 1146 | function clusterLanguages() |
1159 | { | 1147 | { |
1160 | // todo: set the maximum number of clusters | 1148 | // todo: set the maximum number of clusters |
1161 | |||
1162 | // setup check | ||
1163 | if (!$this->_setup_ok($err)) { | ||
1164 | return $err; | ||
1165 | } | ||
1166 | |||
1167 | // return cached result, if any | 1149 | // return cached result, if any |
1168 | if (isset($this->_clusters)) { | 1150 | if (isset($this->_clusters)) { |
1169 | return $this->_clusters; | 1151 | return $this->_clusters; |
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect | |||
1177 | 1159 | ||
1178 | foreach ($langs as $lang) { | 1160 | foreach ($langs as $lang) { |
1179 | if (!isset($this->_lang_db[$lang])) { | 1161 | if (!isset($this->_lang_db[$lang])) { |
1180 | throw new Exception("missing $lang!\n"); | 1162 | throw new Text_LanguageDetect_Exception( |
1163 | "missing $lang!", | ||
1164 | Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE | ||
1165 | ); | ||
1181 | } | 1166 | } |
1182 | } | 1167 | } |
1183 | 1168 | ||
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect | |||
1186 | $langs[$lang1] = $lang1; | 1171 | $langs[$lang1] = $lang1; |
1187 | unset($langs[$old_key]); | 1172 | unset($langs[$old_key]); |
1188 | } | 1173 | } |
1189 | 1174 | ||
1175 | $result_data = $really_map = array(); | ||
1176 | |||
1190 | $i = 0; | 1177 | $i = 0; |
1191 | while (count($langs) > 2 && $i++ < 200) { | 1178 | while (count($langs) > 2 && $i++ < 200) { |
1192 | $highest_score = -1; | 1179 | $highest_score = -1; |
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect | |||
1194 | $highest_key2 = ''; | 1181 | $highest_key2 = ''; |
1195 | foreach ($langs as $lang1) { | 1182 | foreach ($langs as $lang1) { |
1196 | foreach ($langs as $lang2) { | 1183 | foreach ($langs as $lang2) { |
1197 | if ( $lang1 != $lang2 | 1184 | if ($lang1 != $lang2 |
1198 | && $arr[$lang1][$lang2] > $highest_score) { | 1185 | && $arr[$lang1][$lang2] > $highest_score |
1186 | ) { | ||
1199 | $highest_score = $arr[$lang1][$lang2]; | 1187 | $highest_score = $arr[$lang1][$lang2]; |
1200 | $highest_key1 = $lang1; | 1188 | $highest_key1 = $lang1; |
1201 | $highest_key2 = $lang2; | 1189 | $highest_key2 = $lang2; |
1202 | } | 1190 | } |
1203 | } | 1191 | } |
1204 | } | 1192 | } |
1205 | 1193 | ||
1206 | if (!$highest_key1) { | 1194 | if (!$highest_key1) { |
1207 | // should not ever happen | 1195 | // should not ever happen |
1208 | throw new Exception("no highest key? (step: $i)"); | 1196 | throw new Text_LanguageDetect_Exception( |
1197 | "no highest key? (step: $i)", | ||
1198 | Text_LanguageDetect_Exception::NO_HIGHEST_KEY | ||
1199 | ); | ||
1209 | } | 1200 | } |
1210 | 1201 | ||
1211 | if ($highest_score == 0) { | 1202 | if ($highest_score == 0) { |
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect | |||
1217 | $sum1 = array_sum($arr[$highest_key1]); | 1208 | $sum1 = array_sum($arr[$highest_key1]); |
1218 | $sum2 = array_sum($arr[$highest_key2]); | 1209 | $sum2 = array_sum($arr[$highest_key2]); |
1219 | 1210 | ||
1220 | // use the score for the one that is most similar to the rest of | 1211 | // use the score for the one that is most similar to the rest of |
1221 | // the field as the score for the group | 1212 | // the field as the score for the group |
1222 | // todo: could try averaging or "centroid" method instead | 1213 | // todo: could try averaging or "centroid" method instead |
1223 | // seems like that might make more sense | 1214 | // seems like that might make more sense |
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect | |||
1248 | $really_lang = $replaceme; | 1239 | $really_lang = $replaceme; |
1249 | while (isset($really_map[$really_lang])) { | 1240 | while (isset($really_map[$really_lang])) { |
1250 | $really_lang = $really_map[$really_lang]; | 1241 | $really_lang = $really_map[$really_lang]; |
1251 | } | 1242 | } |
1252 | $really_map[$newkey] = $really_lang; | 1243 | $really_map[$newkey] = $really_lang; |
1253 | 1244 | ||
1254 | 1245 | ||
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect | |||
1259 | $arr[$key1][$newkey] = $arr[$key1][$key2]; | 1250 | $arr[$key1][$newkey] = $arr[$key1][$key2]; |
1260 | unset($arr[$key1][$key2]); | 1251 | unset($arr[$key1][$key2]); |
1261 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] | 1252 | // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] |
1262 | } | 1253 | } |
1263 | 1254 | ||
1264 | if ($key1 == $replaceme) { | 1255 | if ($key1 == $replaceme) { |
1265 | $arr[$newkey][$key2] = $arr[$key1][$key2]; | 1256 | $arr[$newkey][$key2] = $arr[$key1][$key2]; |
1266 | unset($arr[$key1][$key2]); | 1257 | unset($arr[$key1][$key2]); |
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect | |||
1273 | } | 1264 | } |
1274 | } | 1265 | } |
1275 | } | 1266 | } |
1276 | 1267 | ||
1277 | 1268 | ||
1278 | unset($langs[$highest_key1]); | 1269 | unset($langs[$highest_key1]); |
1279 | unset($langs[$highest_key2]); | 1270 | unset($langs[$highest_key2]); |
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect | |||
1293 | } | 1284 | } |
1294 | 1285 | ||
1295 | $return_val = array( | 1286 | $return_val = array( |
1296 | 'open_forks' => $langs, | 1287 | 'open_forks' => $langs, |
1297 | // the top level of clusters | 1288 | // the top level of clusters |
1298 | // clusters that are mutually exclusive | 1289 | // clusters that are mutually exclusive |
1299 | // or specified by a specific maximum | 1290 | // or specified by a specific maximum |
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect | |||
1323 | * use, and it may disappear or its functionality may change in future | 1314 | * use, and it may disappear or its functionality may change in future |
1324 | * releases without notice. | 1315 | * releases without notice. |
1325 | * | 1316 | * |
1326 | * This compares the sample text to the top level of clusters. If the | 1317 | * This compares the sample text to the top level of clusters. If the |
1327 | * sample is similar to the cluster it will drop down and compare it to the | 1318 | * sample is similar to the cluster it will drop down and compare it to the |
1328 | * languages in the cluster, and so on until it hits a leaf node. | 1319 | * languages in the cluster, and so on until it hits a leaf node. |
1329 | * | 1320 | * |
1330 | * this should find the language in considerably fewer compares | 1321 | * this should find the language in considerably fewer compares |
1331 | * (the equivalent of a binary search), however clusterLanguages() is costly | 1322 | * (the equivalent of a binary search), however clusterLanguages() is costly |
1332 | * and the loss of accuracy from this technique is significant. | 1323 | * and the loss of accuracy from this technique is significant. |
1333 | * | 1324 | * |
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect | |||
1337 | * was very large, however in such cases some method of Bayesian inference | 1328 | * was very large, however in such cases some method of Bayesian inference |
1338 | * might be more helpful. | 1329 | * might be more helpful. |
1339 | * | 1330 | * |
1340 | * @see clusterLanguages() | 1331 | * @param string $str input string |
1341 | * @access public | 1332 | * |
1342 | * @param string $str input string | 1333 | * @return array language scores (only those compared) |
1343 | * @return array language scores (only those compared) | 1334 | * @throws Text_LanguageDetect_Exception |
1344 | * @throws PEAR_Error | 1335 | * @see clusterLanguages() |
1345 | */ | 1336 | */ |
1346 | function clusteredSearch($str) | 1337 | public function clusteredSearch($str) |
1347 | { | 1338 | { |
1348 | |||
1349 | // input check | 1339 | // input check |
1350 | if (!Text_LanguageDetect_Parser::validateString($str)) { | 1340 | if (!Text_LanguageDetect_Parser::validateString($str)) { |
1351 | return array(); | 1341 | return array(); |
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect | |||
1359 | $dendogram_data = $result['fork_data']; | 1349 | $dendogram_data = $result['fork_data']; |
1360 | $dendogram_alias = $result['name_map']; | 1350 | $dendogram_alias = $result['name_map']; |
1361 | 1351 | ||
1362 | $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); | 1352 | $sample_obj = new Text_LanguageDetect_Parser($str); |
1363 | $sample_obj->prepareTrigram(); | 1353 | $sample_obj->prepareTrigram(); |
1364 | $sample_obj->setPadStart(!$this->_perl_compatible); | 1354 | $sample_obj->setPadStart(!$this->_perl_compatible); |
1365 | $sample_obj->analyze(); | 1355 | $sample_obj->analyze(); |
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect | |||
1372 | } | 1362 | } |
1373 | 1363 | ||
1374 | $i = 0; // counts the number of steps | 1364 | $i = 0; // counts the number of steps |
1375 | 1365 | ||
1376 | foreach ($dendogram_start as $lang) { | 1366 | foreach ($dendogram_start as $lang) { |
1377 | if (isset($dendogram_alias[$lang])) { | 1367 | if (isset($dendogram_alias[$lang])) { |
1378 | $lang_key = $dendogram_alias[$lang]; | 1368 | $lang_key = $dendogram_alias[$lang]; |
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect | |||
1382 | 1372 | ||
1383 | $scores[$lang] = $this->_normalize_score( | 1373 | $scores[$lang] = $this->_normalize_score( |
1384 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1374 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1385 | $sample_count); | 1375 | $sample_count |
1376 | ); | ||
1386 | 1377 | ||
1387 | $i++; | 1378 | $i++; |
1388 | } | 1379 | } |
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect | |||
1411 | 1402 | ||
1412 | $scores[$lang] = $this->_normalize_score( | 1403 | $scores[$lang] = $this->_normalize_score( |
1413 | $this->_distance($this->_lang_db[$lang_key], $sample_result), | 1404 | $this->_distance($this->_lang_db[$lang_key], $sample_result), |
1414 | $sample_count); | 1405 | $sample_count |
1406 | ); | ||
1415 | 1407 | ||
1416 | //todo: does not need to do same comparison again | 1408 | //todo: does not need to do same comparison again |
1417 | } | 1409 | } |
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect | |||
1428 | 1420 | ||
1429 | $diff = $scores[$cur_key] - $scores[$loser_key]; | 1421 | $diff = $scores[$cur_key] - $scores[$loser_key]; |
1430 | 1422 | ||
1431 | // $cur_key ({$dendogram_alias[$cur_key]}) wins | 1423 | // $cur_key ({$dendogram_alias[$cur_key]}) wins |
1432 | // over $loser_key ({$dendogram_alias[$loser_key]}) | 1424 | // over $loser_key ({$dendogram_alias[$loser_key]}) |
1433 | // with a difference of $diff | 1425 | // with a difference of $diff |
1434 | } | 1426 | } |
1435 | 1427 | ||
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect | |||
1439 | // which paths the algorithm decided to take along the tree | 1431 | // which paths the algorithm decided to take along the tree |
1440 | 1432 | ||
1441 | // but sometimes the last item is only the second highest | 1433 | // but sometimes the last item is only the second highest |
1442 | if ( ($this->_perl_compatible && (end($scores) > prev($scores))) | 1434 | if (($this->_perl_compatible && (end($scores) > prev($scores))) |
1443 | || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { | 1435 | || (!$this->_perl_compatible && (end($scores) < prev($scores))) |
1444 | 1436 | ) { | |
1445 | $real_last_score = current($scores); | 1437 | $real_last_score = current($scores); |
1446 | $real_last_key = key($scores); | 1438 | $real_last_key = key($scores); |
1447 | 1439 | ||
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect | |||
1449 | unset($scores[$real_last_key]); | 1441 | unset($scores[$real_last_key]); |
1450 | $scores[$real_last_key] = $real_last_score; | 1442 | $scores[$real_last_key] = $real_last_score; |
1451 | } | 1443 | } |
1452 | 1444 | ||
1453 | 1445 | ||
1454 | if (!$this->_perl_compatible) { | 1446 | if (!$this->_perl_compatible) { |
1455 | $scores = array_reverse($scores, true); | 1447 | $scores = array_reverse($scores, true); |
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect | |||
1464 | * | 1456 | * |
1465 | * Returns the number of characters (not bytes) in a utf8 string | 1457 | * Returns the number of characters (not bytes) in a utf8 string |
1466 | * | 1458 | * |
1467 | * @static | 1459 | * @param string $str string to get the length of |
1468 | * @access public | 1460 | * |
1469 | * @param string $str string to get the length of | 1461 | * @return int number of chars |
1470 | * @return int number of chars | ||
1471 | */ | 1462 | */ |
1472 | function utf8strlen($str) | 1463 | public static function utf8strlen($str) |
1473 | { | 1464 | { |
1474 | // utf8_decode() will convert unknown chars to '?', which is actually | 1465 | // utf8_decode() will convert unknown chars to '?', which is actually |
1475 | // ideal for counting. | 1466 | // ideal for counting. |
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect | |||
1482 | /** | 1473 | /** |
1483 | * Returns the unicode value of a utf8 char | 1474 | * Returns the unicode value of a utf8 char |
1484 | * | 1475 | * |
1485 | * @access protected | 1476 | * @param string $char a utf8 (possibly multi-byte) char |
1486 | * @param string $char a utf8 (possibly multi-byte) char | 1477 | * |
1487 | * @return int unicode value or -1 if malformatted | 1478 | * @return int unicode value |
1479 | * @access protected | ||
1480 | * @link http://en.wikipedia.org/wiki/UTF-8 | ||
1488 | */ | 1481 | */ |
1489 | function _utf8char2unicode($char) { | 1482 | function _utf8char2unicode($char) |
1490 | 1483 | { | |
1491 | // strlen() here will actually get the binary length of a single char | 1484 | // strlen() here will actually get the binary length of a single char |
1492 | switch (strlen($char)) { | 1485 | switch (strlen($char)) { |
1493 | 1486 | case 1: | |
1494 | // for a reference, see http://en.wikipedia.org/wiki/UTF-8 | 1487 | // normal ASCII-7 byte |
1495 | 1488 | // 0xxxxxxx --> 0xxxxxxx | |
1496 | case 1: | 1489 | return ord($char{0}); |
1497 | // normal ASCII-7 byte | 1490 | |
1498 | // 0xxxxxxx --> 0xxxxxxx | 1491 | case 2: |
1499 | return ord($char{0}); | 1492 | // 2 byte unicode |
1500 | 1493 | // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx | |
1501 | case 2: | 1494 | $z = (ord($char{0}) & 0x000001F) << 6; |
1502 | // 2 byte unicode | 1495 | $x = (ord($char{1}) & 0x0000003F); |
1503 | // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx | 1496 | return ($z | $x); |
1504 | $z = (ord($char{0}) & 0x000001F) << 6; | 1497 | |
1505 | $x = (ord($char{1}) & 0x0000003F); | 1498 | case 3: |
1506 | 1499 | // 3 byte unicode | |
1507 | return ($z | $x); | 1500 | // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx |
1508 | 1501 | $z = (ord($char{0}) & 0x0000000F) << 12; | |
1509 | case 3: | 1502 | $x1 = (ord($char{1}) & 0x0000003F) << 6; |
1510 | // 3 byte unicode | 1503 | $x2 = (ord($char{2}) & 0x0000003F); |
1511 | // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx | 1504 | return ($z | $x1 | $x2); |
1512 | $z = (ord($char{0}) & 0x0000000F) << 12; | 1505 | |
1513 | $x1 = (ord($char{1}) & 0x0000003F) << 6; | 1506 | case 4: |
1514 | $x2 = (ord($char{2}) & 0x0000003F); | 1507 | // 4 byte unicode |
1515 | 1508 | // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> | |
1516 | return ($z | $x1 | $x2); | 1509 | // 000zzzzz xxxxxxxx xxxxxxxx |
1517 | 1510 | $z1 = (ord($char{0}) & 0x00000007) << 18; | |
1518 | case 4: | 1511 | $z2 = (ord($char{1}) & 0x0000003F) << 12; |
1519 | // 4 byte unicode | 1512 | $x1 = (ord($char{2}) & 0x0000003F) << 6; |
1520 | // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> | 1513 | $x2 = (ord($char{3}) & 0x0000003F); |
1521 | // 000zzzzz xxxxxxxx xxxxxxxx | 1514 | return ($z1 | $z2 | $x1 | $x2); |
1522 | $z1 = (ord($char{0}) & 0x00000007) << 18; | ||
1523 | $z2 = (ord($char{1}) & 0x0000003F) << 12; | ||
1524 | $x1 = (ord($char{2}) & 0x0000003F) << 6; | ||
1525 | $x2 = (ord($char{3}) & 0x0000003F); | ||
1526 | |||
1527 | return ($z1 | $z2 | $x1 | $x2); | ||
1528 | |||
1529 | default: | ||
1530 | // error: malformatted char? | ||
1531 | return -1; | ||
1532 | } | 1515 | } |
1533 | } | 1516 | } |
1534 | 1517 | ||
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect | |||
1536 | * utf8-safe fast character iterator | 1519 | * utf8-safe fast character iterator |
1537 | * | 1520 | * |
1538 | * Will get the next character starting from $counter, which will then be | 1521 | * Will get the next character starting from $counter, which will then be |
1539 | * incremented. If a multi-byte char the bytes will be concatenated and | 1522 | * incremented. If a multi-byte char the bytes will be concatenated and |
1540 | * $counter will be incremented by the number of bytes in the char. | 1523 | * $counter will be incremented by the number of bytes in the char. |
1541 | * | 1524 | * |
1542 | * @access private | 1525 | * @param string $str the string being iterated over |
1543 | * @param string &$str the string being iterated over | 1526 | * @param int &$counter the iterator, will increment by reference |
1544 | * @param int &$counter the iterator, will increment by reference | 1527 | * @param bool $special_convert whether to do special conversions |
1545 | * @param bool $special_convert whether to do special conversions | 1528 | * |
1546 | * @return char the next (possibly multi-byte) char from $counter | 1529 | * @return char the next (possibly multi-byte) char from $counter |
1530 | * @access private | ||
1547 | */ | 1531 | */ |
1548 | function _next_char(&$str, &$counter, $special_convert = false) | 1532 | static function _next_char($str, &$counter, $special_convert = false) |
1549 | { | 1533 | { |
1550 | |||
1551 | $char = $str{$counter++}; | 1534 | $char = $str{$counter++}; |
1552 | $ord = ord($char); | 1535 | $ord = ord($char); |
1553 | 1536 | ||
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect | |||
1556 | 1539 | ||
1557 | // normal ascii one byte char | 1540 | // normal ascii one byte char |
1558 | if ($ord <= 127) { | 1541 | if ($ord <= 127) { |
1559 | |||
1560 | // special conversions needed for this package | 1542 | // special conversions needed for this package |
1561 | // (that only apply to regular ascii characters) | 1543 | // (that only apply to regular ascii characters) |
1562 | // lower case, and convert all non-alphanumeric characters | 1544 | // lower case, and convert all non-alphanumeric characters |
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect | |||
1571 | 1553 | ||
1572 | return $char; | 1554 | return $char; |
1573 | 1555 | ||
1574 | // multi-byte chars | ||
1575 | } elseif ($ord >> 5 == 6) { // two-byte char | 1556 | } elseif ($ord >> 5 == 6) { // two-byte char |
1557 | // multi-byte chars | ||
1576 | $nextchar = $str{$counter++}; // get next byte | 1558 | $nextchar = $str{$counter++}; // get next byte |
1577 | 1559 | ||
1578 | // lower-casing of non-ascii characters is still incomplete | 1560 | // lower-casing of non-ascii characters is still incomplete |
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect | |||
1582 | if ($ord == 195) { | 1564 | if ($ord == 195) { |
1583 | $nextord = ord($nextchar); | 1565 | $nextord = ord($nextchar); |
1584 | $nextord_adj = $nextord + 64; | 1566 | $nextord_adj = $nextord + 64; |
1585 | // for a reference, see | 1567 | // for a reference, see |
1586 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html | 1568 | // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html |
1587 | 1569 | ||
1588 | // À - Þ but not × | 1570 | // À - Þ but not × |
1589 | if ( $nextord_adj >= 192 | 1571 | if ($nextord_adj >= 192 |
1590 | && $nextord_adj <= 222 | 1572 | && $nextord_adj <= 222 |
1591 | && $nextord_adj != 215) { | 1573 | && $nextord_adj != 215 |
1592 | 1574 | ) { | |
1593 | $nextchar = chr($nextord + 32); | 1575 | $nextchar = chr($nextord + 32); |
1594 | } | 1576 | } |
1595 | 1577 | ||
1596 | // lower case cyrillic alphabet | ||
1597 | } elseif ($ord == 208) { | 1578 | } elseif ($ord == 208) { |
1579 | // lower case cyrillic alphabet | ||
1598 | $nextord = ord($nextchar); | 1580 | $nextord = ord($nextchar); |
1599 | // if A - Pe | 1581 | // if A - Pe |
1600 | if ($nextord >= 144 && $nextord <= 159) { | 1582 | if ($nextord >= 144 && $nextord <= 159) { |
1601 | // lower case | 1583 | // lower case |
1602 | $nextchar = chr($nextord + 32); | 1584 | $nextchar = chr($nextord + 32); |
1603 | 1585 | ||
1604 | // if Er - Ya | ||
1605 | } elseif ($nextord >= 160 && $nextord <= 175) { | 1586 | } elseif ($nextord >= 160 && $nextord <= 175) { |
1587 | // if Er - Ya | ||
1606 | // lower case | 1588 | // lower case |
1607 | $char = chr(209); // == $ord++ | 1589 | $char = chr(209); // == $ord++ |
1608 | $nextchar = chr($nextord - 32); | 1590 | $nextchar = chr($nextord - 32); |
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect | |||
1611 | } | 1593 | } |
1612 | 1594 | ||
1613 | // tag on next byte | 1595 | // tag on next byte |
1614 | return $char . $nextchar; | 1596 | return $char . $nextchar; |
1615 | |||
1616 | } elseif ($ord >> 4 == 14) { // three-byte char | 1597 | } elseif ($ord >> 4 == 14) { // three-byte char |
1617 | 1598 | ||
1618 | // tag on next 2 bytes | 1599 | // tag on next 2 bytes |
1619 | return $char . $str{$counter++} . $str{$counter++}; | 1600 | return $char . $str{$counter++} . $str{$counter++}; |
1620 | 1601 | ||
1621 | } elseif ($ord >> 3 == 30) { // four-byte char | 1602 | } elseif ($ord >> 3 == 30) { // four-byte char |
1622 | 1603 | ||
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect | |||
1628 | } | 1609 | } |
1629 | } | 1610 | } |
1630 | 1611 | ||
1631 | } | 1612 | /** |
1613 | * Converts a $language input parameter from the configured mode | ||
1614 | * to the language name that is used internally. | ||
1615 | * | ||
1616 | * Works for strings and arrays. | ||
1617 | * | ||
1618 | * @param string|array $lang A language description ("english"/"en"/"eng") | ||
1619 | * @param boolean $convertKey If $lang is an array, setting $convertKey | ||
1620 | * converts the keys to the language name. | ||
1621 | * | ||
1622 | * @return string|array Language name | ||
1623 | */ | ||
1624 | function _convertFromNameMode($lang, $convertKey = false) | ||
1625 | { | ||
1626 | if ($this->_name_mode == 0) { | ||
1627 | return $lang; | ||
1628 | } | ||
1629 | |||
1630 | if ($this->_name_mode == 2) { | ||
1631 | $method = 'code2ToName'; | ||
1632 | } else { | ||
1633 | $method = 'code3ToName'; | ||
1634 | } | ||
1635 | |||
1636 | if (is_string($lang)) { | ||
1637 | return (string)Text_LanguageDetect_ISO639::$method($lang); | ||
1638 | } | ||
1639 | |||
1640 | $newlang = array(); | ||
1641 | foreach ($lang as $key => $val) { | ||
1642 | if ($convertKey) { | ||
1643 | $newkey = (string)Text_LanguageDetect_ISO639::$method($key); | ||
1644 | $newlang[$newkey] = $val; | ||
1645 | } else { | ||
1646 | $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val); | ||
1647 | } | ||
1648 | } | ||
1649 | return $newlang; | ||
1650 | } | ||
1632 | 1651 | ||
1633 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | 1652 | /** |
1653 | * Converts a $language output parameter from the language name that is | ||
1654 | * used internally to the configured mode. | ||
1655 | * | ||
1656 | * Works for strings and arrays. | ||
1657 | * | ||
1658 | * @param string|array $lang A language description ("english"/"en"/"eng") | ||
1659 | * @param boolean $convertKey If $lang is an array, setting $convertKey | ||
1660 | * converts the keys to the configured name mode. | ||
1661 | * | ||
1662 | * @return string|array Language name or code in the configured mode | ||
1663 | */ | ||
1664 | function _convertToNameMode($lang, $convertKey = false) | ||
1665 | { | ||
1666 | if ($this->_name_mode == 0) { | ||
1667 | return $lang; | ||
1668 | } | ||
1669 | |||
1670 | if ($this->_name_mode == 2) { | ||
1671 | $method = 'nameToCode2'; | ||
1672 | } else { | ||
1673 | $method = 'nameToCode3'; | ||
1674 | } | ||
1675 | |||
1676 | if (is_string($lang)) { | ||
1677 | return Text_LanguageDetect_ISO639::$method($lang); | ||
1678 | } | ||
1679 | |||
1680 | $newlang = array(); | ||
1681 | foreach ($lang as $key => $val) { | ||
1682 | if ($convertKey) { | ||
1683 | $newkey = Text_LanguageDetect_ISO639::$method($key); | ||
1684 | $newlang[$newkey] = $val; | ||
1685 | } else { | ||
1686 | $newlang[$key] = Text_LanguageDetect_ISO639::$method($val); | ||
1687 | } | ||
1688 | } | ||
1689 | return $newlang; | ||
1690 | } | ||
1691 | } | ||
1634 | 1692 | ||
1635 | ?> | 1693 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file |
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php new file mode 100644 index 00000000..196d994f --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php | |||
@@ -0,0 +1,57 @@ | |||
1 | <?php | ||
2 | class Text_LanguageDetect_Exception extends Exception | ||
3 | { | ||
4 | /** | ||
5 | * Database file could not be found | ||
6 | */ | ||
7 | const DB_NOT_FOUND = 10; | ||
8 | |||
9 | /** | ||
10 | * Database file found, but not readable | ||
11 | */ | ||
12 | const DB_NOT_READABLE = 11; | ||
13 | |||
14 | /** | ||
15 | * Database file is empty | ||
16 | */ | ||
17 | const DB_EMPTY = 12; | ||
18 | |||
19 | /** | ||
20 | * Database contents is not a PHP array | ||
21 | */ | ||
22 | const DB_NOT_ARRAY = 13; | ||
23 | |||
24 | /** | ||
25 | * Magic quotes are activated | ||
26 | */ | ||
27 | const MAGIC_QUOTES = 14; | ||
28 | |||
29 | |||
30 | /** | ||
31 | * Parameter of invalid type passed to method | ||
32 | */ | ||
33 | const PARAM_TYPE = 20; | ||
34 | |||
35 | /** | ||
36 | * Character in parameter is invalid | ||
37 | */ | ||
38 | const INVALID_CHAR = 21; | ||
39 | |||
40 | |||
41 | /** | ||
42 | * Language is not in the database | ||
43 | */ | ||
44 | const UNKNOWN_LANGUAGE = 30; | ||
45 | |||
46 | |||
47 | /** | ||
48 | * Error during block detection | ||
49 | */ | ||
50 | const BLOCK_DETECTION = 40; | ||
51 | |||
52 | |||
53 | /** | ||
54 | * Error while clustering languages | ||
55 | */ | ||
56 | const NO_HIGHEST_KEY = 50; | ||
57 | } | ||
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php new file mode 100644 index 00000000..05b0590d --- /dev/null +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php | |||
@@ -0,0 +1,339 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Part of Text_LanguageDetect | ||
4 | * | ||
5 | * PHP version 5 | ||
6 | * | ||
7 | * @category Text | ||
8 | * @package Text_LanguageDetect | ||
9 | * @author Christian Weiske <cweiske@php.net> | ||
10 | * @copyright 2011 Christian Weiske <cweiske@php.net> | ||
11 | * @license http://www.debian.org/misc/bsd.license BSD | ||
12 | * @version SVN: $Id$ | ||
13 | * @link http://pear.php.net/package/Text_LanguageDetect/ | ||
14 | */ | ||
15 | |||
16 | /** | ||
17 | * Provides a mapping between the languages from lang.dat and the | ||
18 | * ISO 639-1 and ISO 639-2 codes. | ||
19 | * | ||
20 | * Note that this class contains only languages that exist in lang.dat. | ||
21 | * | ||
22 | * @category Text | ||
23 | * @package Text_LanguageDetect | ||
24 | * @author Christian Weiske <cweiske@php.net> | ||
25 | * @copyright 2011 Christian Weiske <cweiske@php.net> | ||
26 | * @license http://www.debian.org/misc/bsd.license BSD | ||
27 | * @link http://www.loc.gov/standards/iso639-2/php/code_list.php | ||
28 | */ | ||
29 | class Text_LanguageDetect_ISO639 | ||
30 | { | ||
31 | /** | ||
32 | * Maps all language names from the language database to the | ||
33 | * ISO 639-1 2-letter language code. | ||
34 | * | ||
35 | * NULL indicates that there is no 2-letter code. | ||
36 | * | ||
37 | * @var array | ||
38 | */ | ||
39 | public static $nameToCode2 = array( | ||
40 | 'albanian' => 'sq', | ||
41 | 'arabic' => 'ar', | ||
42 | 'azeri' => 'az', | ||
43 | 'bengali' => 'bn', | ||
44 | 'bulgarian' => 'bg', | ||
45 | 'cebuano' => null, | ||
46 | 'croatian' => 'hr', | ||
47 | 'czech' => 'cs', | ||
48 | 'danish' => 'da', | ||
49 | 'dutch' => 'nl', | ||
50 | 'english' => 'en', | ||
51 | 'estonian' => 'et', | ||
52 | 'farsi' => 'fa', | ||
53 | 'finnish' => 'fi', | ||
54 | 'french' => 'fr', | ||
55 | 'german' => 'de', | ||
56 | 'hausa' => 'ha', | ||
57 | 'hawaiian' => null, | ||
58 | 'hindi' => 'hi', | ||
59 | 'hungarian' => 'hu', | ||
60 | 'icelandic' => 'is', | ||
61 | 'indonesian' => 'id', | ||
62 | 'italian' => 'it', | ||
63 | 'kazakh' => 'kk', | ||
64 | 'kyrgyz' => 'ky', | ||
65 | 'latin' => 'la', | ||
66 | 'latvian' => 'lv', | ||
67 | 'lithuanian' => 'lt', | ||
68 | 'macedonian' => 'mk', | ||
69 | 'mongolian' => 'mn', | ||
70 | 'nepali' => 'ne', | ||
71 | 'norwegian' => 'no', | ||
72 | 'pashto' => 'ps', | ||
73 | 'pidgin' => null, | ||
74 | 'polish' => 'pl', | ||
75 | 'portuguese' => 'pt', | ||
76 | 'romanian' => 'ro', | ||
77 | 'russian' => 'ru', | ||
78 | 'serbian' => 'sr', | ||
79 | 'slovak' => 'sk', | ||
80 | 'slovene' => 'sl', | ||
81 | 'somali' => 'so', | ||
82 | 'spanish' => 'es', | ||
83 | 'swahili' => 'sw', | ||
84 | 'swedish' => 'sv', | ||
85 | 'tagalog' => 'tl', | ||
86 | 'turkish' => 'tr', | ||
87 | 'ukrainian' => 'uk', | ||
88 | 'urdu' => 'ur', | ||
89 | 'uzbek' => 'uz', | ||
90 | 'vietnamese' => 'vi', | ||
91 | 'welsh' => 'cy', | ||
92 | ); | ||
93 | |||
94 | /** | ||
95 | * Maps all language names from the language database to the | ||
96 | * ISO 639-2 3-letter language code. | ||
97 | * | ||
98 | * @var array | ||
99 | */ | ||
100 | public static $nameToCode3 = array( | ||
101 | 'albanian' => 'sqi', | ||
102 | 'arabic' => 'ara', | ||
103 | 'azeri' => 'aze', | ||
104 | 'bengali' => 'ben', | ||
105 | 'bulgarian' => 'bul', | ||
106 | 'cebuano' => 'ceb', | ||
107 | 'croatian' => 'hrv', | ||
108 | 'czech' => 'ces', | ||
109 | 'danish' => 'dan', | ||
110 | 'dutch' => 'nld', | ||
111 | 'english' => 'eng', | ||
112 | 'estonian' => 'est', | ||
113 | 'farsi' => 'fas', | ||
114 | 'finnish' => 'fin', | ||
115 | 'french' => 'fra', | ||
116 | 'german' => 'deu', | ||
117 | 'hausa' => 'hau', | ||
118 | 'hawaiian' => 'haw', | ||
119 | 'hindi' => 'hin', | ||
120 | 'hungarian' => 'hun', | ||
121 | 'icelandic' => 'isl', | ||
122 | 'indonesian' => 'ind', | ||
123 | 'italian' => 'ita', | ||
124 | 'kazakh' => 'kaz', | ||
125 | 'kyrgyz' => 'kir', | ||
126 | 'latin' => 'lat', | ||
127 | 'latvian' => 'lav', | ||
128 | 'lithuanian' => 'lit', | ||
129 | 'macedonian' => 'mkd', | ||
130 | 'mongolian' => 'mon', | ||
131 | 'nepali' => 'nep', | ||
132 | 'norwegian' => 'nor', | ||
133 | 'pashto' => 'pus', | ||
134 | 'pidgin' => 'crp', | ||
135 | 'polish' => 'pol', | ||
136 | 'portuguese' => 'por', | ||
137 | 'romanian' => 'ron', | ||
138 | 'russian' => 'rus', | ||
139 | 'serbian' => 'srp', | ||
140 | 'slovak' => 'slk', | ||
141 | 'slovene' => 'slv', | ||
142 | 'somali' => 'som', | ||
143 | 'spanish' => 'spa', | ||
144 | 'swahili' => 'swa', | ||
145 | 'swedish' => 'swe', | ||
146 | 'tagalog' => 'tgl', | ||
147 | 'turkish' => 'tur', | ||
148 | 'ukrainian' => 'ukr', | ||
149 | 'urdu' => 'urd', | ||
150 | 'uzbek' => 'uzb', | ||
151 | 'vietnamese' => 'vie', | ||
152 | 'welsh' => 'cym', | ||
153 | ); | ||
154 | |||
155 | /** | ||
156 | * Maps ISO 639-1 2-letter language codes to the language names | ||
157 | * in the language database | ||
158 | * | ||
159 | * Not all languages have a 2 letter code, so some are missing | ||
160 | * | ||
161 | * @var array | ||
162 | */ | ||
163 | public static $code2ToName = array( | ||
164 | 'ar' => 'arabic', | ||
165 | 'az' => 'azeri', | ||
166 | 'bg' => 'bulgarian', | ||
167 | 'bn' => 'bengali', | ||
168 | 'cs' => 'czech', | ||
169 | 'cy' => 'welsh', | ||
170 | 'da' => 'danish', | ||
171 | 'de' => 'german', | ||
172 | 'en' => 'english', | ||
173 | 'es' => 'spanish', | ||
174 | 'et' => 'estonian', | ||
175 | 'fa' => 'farsi', | ||
176 | 'fi' => 'finnish', | ||
177 | 'fr' => 'french', | ||
178 | 'ha' => 'hausa', | ||
179 | 'hi' => 'hindi', | ||
180 | 'hr' => 'croatian', | ||
181 | 'hu' => 'hungarian', | ||
182 | 'id' => 'indonesian', | ||
183 | 'is' => 'icelandic', | ||
184 | 'it' => 'italian', | ||
185 | 'kk' => 'kazakh', | ||
186 | 'ky' => 'kyrgyz', | ||
187 | 'la' => 'latin', | ||
188 | 'lt' => 'lithuanian', | ||
189 | 'lv' => 'latvian', | ||
190 | 'mk' => 'macedonian', | ||
191 | 'mn' => 'mongolian', | ||
192 | 'ne' => 'nepali', | ||
193 | 'nl' => 'dutch', | ||
194 | 'no' => 'norwegian', | ||
195 | 'pl' => 'polish', | ||
196 | 'ps' => 'pashto', | ||
197 | 'pt' => 'portuguese', | ||
198 | 'ro' => 'romanian', | ||
199 | 'ru' => 'russian', | ||
200 | 'sk' => 'slovak', | ||
201 | 'sl' => 'slovene', | ||
202 | 'so' => 'somali', | ||
203 | 'sq' => 'albanian', | ||
204 | 'sr' => 'serbian', | ||
205 | 'sv' => 'swedish', | ||
206 | 'sw' => 'swahili', | ||
207 | 'tl' => 'tagalog', | ||
208 | 'tr' => 'turkish', | ||
209 | 'uk' => 'ukrainian', | ||
210 | 'ur' => 'urdu', | ||
211 | 'uz' => 'uzbek', | ||
212 | 'vi' => 'vietnamese', | ||
213 | ); | ||
214 | |||
/**
 * Maps ISO 639-2 3-letter language codes to the language names
 * in the language database.
 *
 * All keys are lower-case. Romanian is reachable through both "ron"
 * (its ISO 639-2 terminology code) and the historical key "rom".
 *
 * @var array
 */
public static $code3ToName = array(
    'ara' => 'arabic',
    'aze' => 'azeri',
    'ben' => 'bengali',
    'bul' => 'bulgarian',
    'ceb' => 'cebuano',
    'ces' => 'czech',
    'crp' => 'pidgin',
    'cym' => 'welsh',
    'dan' => 'danish',
    'deu' => 'german',
    'eng' => 'english',
    'est' => 'estonian',
    'fas' => 'farsi',
    'fin' => 'finnish',
    'fra' => 'french',
    'hau' => 'hausa',
    'haw' => 'hawaiian',
    'hin' => 'hindi',
    'hrv' => 'croatian',
    'hun' => 'hungarian',
    'ind' => 'indonesian',
    'isl' => 'icelandic',
    'ita' => 'italian',
    'kaz' => 'kazakh',
    'kir' => 'kyrgyz',
    'lat' => 'latin',
    'lav' => 'latvian',
    'lit' => 'lithuanian',
    'mkd' => 'macedonian',
    'mon' => 'mongolian',
    'nep' => 'nepali',
    'nld' => 'dutch',
    'nor' => 'norwegian',
    'pol' => 'polish',
    'por' => 'portuguese',
    'pus' => 'pashto',
    // NOTE: "rom" is the ISO 639-2 code for Romany, not Romanian. The key is
    // kept so existing lookups keep working; "ron" is the correct code.
    'rom' => 'romanian',
    'ron' => 'romanian',
    'rus' => 'russian',
    'slk' => 'slovak',
    'slv' => 'slovene',
    'som' => 'somali',
    'spa' => 'spanish',
    'sqi' => 'albanian',
    'srp' => 'serbian',
    'swa' => 'swahili',
    'swe' => 'swedish',
    'tgl' => 'tagalog',
    'tur' => 'turkish',
    'ukr' => 'ukrainian',
    'urd' => 'urdu',
    'uzb' => 'uzbek',
    'vie' => 'vietnamese',
);
275 | |||
/**
 * Looks up the 2-letter ISO 639-1 code of a language by its English name.
 *
 * The name is lower-cased before it is matched against self::$nameToCode2.
 *
 * @param string $lang English language name like "swedish"
 *
 * @return string Two-letter language code (e.g. "sv") or NULL if not found
 */
public static function nameToCode2($lang)
{
    $name = strtolower($lang);

    return isset(self::$nameToCode2[$name])
        ? self::$nameToCode2[$name]
        : null;
}
291 | |||
/**
 * Looks up the 3-letter ISO 639-2 code of a language by its English name.
 *
 * The name is lower-cased before it is matched against self::$nameToCode3.
 *
 * @param string $lang English language name like "swedish"
 *
 * @return string Three-letter language code (e.g. "swe") or NULL if not found
 */
public static function nameToCode3($lang)
{
    $name = strtolower($lang);

    return isset(self::$nameToCode3[$name])
        ? self::$nameToCode3[$name]
        : null;
}
307 | |||
/**
 * Returns the language name for the given 2-letter ISO 639-1 code.
 *
 * The lookup is case-insensitive: the code is lower-cased before it is
 * matched against the (lower-case) keys of self::$code2ToName.
 *
 * @param string $code Two-letter language code (e.g. "sv")
 *
 * @return string English language name like "swedish" or NULL if not found
 */
public static function code2ToName($code)
{
    // The lower-cased value used to be stored in an unused variable while the
    // lookup ran on the raw input, so codes such as "SV" were never found.
    $code = strtolower($code);
    if (!isset(self::$code2ToName[$code])) {
        return null;
    }
    return self::$code2ToName[$code];
}
323 | |||
/**
 * Returns the language name for the given 3-letter ISO 639-2 code.
 *
 * The lookup is case-insensitive: the code is lower-cased before it is
 * matched against the (lower-case) keys of self::$code3ToName.
 *
 * @param string $code Three-letter language code (e.g. "swe")
 *
 * @return string English language name like "swedish" or NULL if not found
 */
public static function code3ToName($code)
{
    // The lower-cased value used to be stored in an unused variable while the
    // lookup ran on the raw input, so codes such as "SWE" were never found.
    $code = strtolower($code);
    if (!isset(self::$code3ToName[$code])) {
        return null;
    }
    return self::$code3ToName[$code];
}
339 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php index 7f15fa98..fb0e1e20 100644 --- a/inc/3rdparty/libraries/language-detect/Parser.php +++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php | |||
@@ -8,7 +8,7 @@ | |||
8 | * @author Nicholas Pisarro | 8 | * @author Nicholas Pisarro |
9 | * @copyright 2006 | 9 | * @copyright 2006 |
10 | * @license BSD | 10 | * @license BSD |
11 | * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ | 11 | * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $ |
12 | * @link http://pear.php.net/package/Text_LanguageDetect/ | 12 | * @link http://pear.php.net/package/Text_LanguageDetect/ |
13 | * @link http://langdetect.blogspot.com/ | 13 | * @link http://langdetect.blogspot.com/ |
14 | */ | 14 | */ |
@@ -28,7 +28,7 @@ | |||
28 | * @author Nicholas Pisarro | 28 | * @author Nicholas Pisarro |
29 | * @copyright 2006 | 29 | * @copyright 2006 |
30 | * @license BSD | 30 | * @license BSD |
31 | * @version release: 0.2.3 | 31 | * @version release: 0.3.0 |
32 | */ | 32 | */ |
33 | class Text_LanguageDetect_Parser extends Text_LanguageDetect | 33 | class Text_LanguageDetect_Parser extends Text_LanguageDetect |
34 | { | 34 | { |
@@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect | |||
102 | * @access private | 102 | * @access private |
103 | * @param string $string string to be parsed | 103 | * @param string $string string to be parsed |
104 | */ | 104 | */ |
105 | function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) { | 105 | function Text_LanguageDetect_Parser($string) { |
106 | if (isset($db)) $this->_db_filename = $db; | ||
107 | if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; | ||
108 | $this->_string = $string; | 106 | $this->_string = $string; |
109 | } | 107 | } |
110 | 108 | ||
111 | /** | 109 | /** |
112 | * Returns true if a string is suitable for parsing | 110 | * Returns true if a string is suitable for parsing |
113 | * | 111 | * |
114 | * @static | ||
115 | * @access public | ||
116 | * @param string $str input string to test | 112 | * @param string $str input string to test |
117 | * @return bool true if acceptable, false if not | 113 | * @return bool true if acceptable, false if not |
118 | */ | 114 | */ |
119 | function validateString($str) { | 115 | public static function validateString($str) { |
120 | if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { | 116 | if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { |
121 | return true; | 117 | return true; |
122 | } else { | 118 | } else { |
@@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect | |||
222 | 218 | ||
223 | // unicode startup | 219 | // unicode startup |
224 | if ($this->_compile_unicode) { | 220 | if ($this->_compile_unicode) { |
225 | $blocks =& $this->_read_unicode_block_db(); | 221 | $blocks = $this->_read_unicode_block_db(); |
226 | |||
227 | $block_count = count($blocks); | 222 | $block_count = count($blocks); |
228 | 223 | ||
229 | $skipped_count = 0; | 224 | $skipped_count = 0; |
@@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect | |||
349 | } | 344 | } |
350 | } | 345 | } |
351 | 346 | ||
352 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ | 347 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file |
353 | |||
354 | ?> | ||
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php index 2e8991cc..d0f09d74 100644 --- a/inc/3rdparty/libraries/readability/Readability.php +++ b/inc/3rdparty/libraries/readability/Readability.php | |||
@@ -1,1138 +1,1138 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Arc90's Readability ported to PHP for FiveFilters.org | 3 | * Arc90's Readability ported to PHP for FiveFilters.org |
4 | * Based on readability.js version 1.7.1 (without multi-page support) | 4 | * Based on readability.js version 1.7.1 (without multi-page support) |
5 | * Updated to allow HTML5 parsing with html5lib | 5 | * Updated to allow HTML5 parsing with html5lib |
6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds | 6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds |
7 | * ------------------------------------------------------ | 7 | * ------------------------------------------------------ |
8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js | 8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js |
9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ | 9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ |
10 | * JS Source: http://code.google.com/p/arc90labs-readability | 10 | * JS Source: http://code.google.com/p/arc90labs-readability |
11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net | 11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net |
12 | * More information: http://fivefilters.org/content-only/ | 12 | * More information: http://fivefilters.org/content-only/ |
13 | * License: Apache License, Version 2.0 | 13 | * License: Apache License, Version 2.0 |
14 | * Requires: PHP5 | 14 | * Requires: PHP5 |
15 | * Date: 2012-09-19 | 15 | * Date: 2012-09-19 |
16 | * | 16 | * |
17 | * Differences between the PHP port and the original | 17 | * Differences between the PHP port and the original |
18 | * ------------------------------------------------------ | 18 | * ------------------------------------------------------ |
19 | * Arc90's Readability is designed to run in the browser. It works on the DOM | 19 | * Arc90's Readability is designed to run in the browser. It works on the DOM |
20 | * tree (the parsed HTML) after the page's CSS styles have been applied and | 20 | * tree (the parsed HTML) after the page's CSS styles have been applied and |
21 | * Javascript code executed. This PHP port does not run inside a browser. | 21 | * Javascript code executed. This PHP port does not run inside a browser. |
22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot | 22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot |
23 | * rely on CSS or Javascript support. As such, the results will not always | 23 | * rely on CSS or Javascript support. As such, the results will not always |
24 | * match Arc90's Readability. (For example, if a web page contains CSS style | 24 | * match Arc90's Readability. (For example, if a web page contains CSS style |
25 | * rules or Javascript code which hide certain HTML elements from display, | 25 | * rules or Javascript code which hide certain HTML elements from display, |
26 | * Arc90's Readability will dismiss those from consideration but our PHP port, | 26 | * Arc90's Readability will dismiss those from consideration but our PHP port, |
27 | * unable to understand CSS or Javascript, will not know any better.) | 27 | * unable to understand CSS or Javascript, will not know any better.) |
28 | * | 28 | * |
29 | * Another significant difference is that the aim of Arc90's Readability is | 29 | * Another significant difference is that the aim of Arc90's Readability is |
30 | * to re-present the main content block of a given web page so users can | 30 | * to re-present the main content block of a given web page so users can |
31 | * read it more easily in their browsers. Correct identification, clean up, | 31 | * read it more easily in their browsers. Correct identification, clean up, |
32 | * and separation of the content block is only a part of this process. | 32 | * and separation of the content block is only a part of this process. |
33 | * This PHP port is only concerned with this part, it does not include code | 33 | * This PHP port is only concerned with this part, it does not include code |
34 | * that relates to presentation in the browser - Arc90 already do | 34 | * that relates to presentation in the browser - Arc90 already do |
35 | * that extremely well, and for PDF output there's FiveFilters.org's | 35 | * that extremely well, and for PDF output there's FiveFilters.org's |
36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. | 36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. |
37 | * | 37 | * |
38 | * Finally, this class contains methods that might be useful for developers | 38 | * Finally, this class contains methods that might be useful for developers |
39 | * working on HTML document fragments. So without deviating too much from | 39 | * working on HTML document fragments. So without deviating too much from |
40 | * the original code (which I don't want to do because it makes debugging | 40 | * the original code (which I don't want to do because it makes debugging |
41 | * and updating more difficult), I've tried to make it a little more | 41 | * and updating more difficult), I've tried to make it a little more |
42 | * developer friendly. You should be able to use the methods here on | 42 | * developer friendly. You should be able to use the methods here on |
43 | * existing DOMElement objects without passing an entire HTML document to | 43 | * existing DOMElement objects without passing an entire HTML document to |
44 | * be parsed. | 44 | * be parsed. |
45 | */ | 45 | */ |
46 | 46 | ||
47 | // This class allows us to do JavaScript like assignments to innerHTML | 47 | // This class allows us to do JavaScript like assignments to innerHTML |
48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); | 48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); |
49 | 49 | ||
50 | // Alternative usage (for testing only!) | 50 | // Alternative usage (for testing only!) |
51 | // uncomment the lines below and call Readability.php in your browser | 51 | // uncomment the lines below and call Readability.php in your browser |
52 | // passing it the URL of the page you'd like content from, e.g.: | 52 | // passing it the URL of the page you'd like content from, e.g.: |
53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php | 53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php |
54 | 54 | ||
55 | /* | 55 | /* |
56 | if (!isset($_GET['url']) || $_GET['url'] == '') { | 56 | if (!isset($_GET['url']) || $_GET['url'] == '') { |
57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); | 57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); |
58 | } | 58 | } |
59 | $url = $_GET['url']; | 59 | $url = $_GET['url']; |
60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; | 60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; |
61 | $html = file_get_contents($url); | 61 | $html = file_get_contents($url); |
62 | $r = new Readability($html, $url); | 62 | $r = new Readability($html, $url); |
63 | $r->init(); | 63 | $r->init(); |
64 | echo $r->articleContent->innerHTML; | 64 | echo $r->articleContent->innerHTML; |
65 | */ | 65 | */ |
66 | 66 | ||
67 | class Readability | 67 | class Readability |
68 | { | 68 | { |
69 | public $version = '1.7.1-without-multi-page'; | 69 | public $version = '1.7.1-without-multi-page'; |
70 | public $convertLinksToFootnotes = false; | 70 | public $convertLinksToFootnotes = false; |
71 | public $revertForcedParagraphElements = true; | 71 | public $revertForcedParagraphElements = true; |
72 | public $articleTitle; | 72 | public $articleTitle; |
73 | public $articleContent; | 73 | public $articleContent; |
74 | public $dom; | 74 | public $dom; |
75 | public $url = null; // optional - URL where HTML was retrieved | 75 | public $url = null; // optional - URL where HTML was retrieved |
76 | public $debug = false; | 76 | public $debug = false; |
77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 | 77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 |
78 | protected $body = null; // | 78 | protected $body = null; // |
79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later | 79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later |
80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. | 80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. |
81 | protected $success = false; // indicates whether we were able to extract or not | 81 | protected $success = false; // indicates whether we were able to extract or not |
82 | 82 | ||
83 | /** | 83 | /** |
84 | * All of the regular expressions in use within readability. | 84 | * All of the regular expressions in use within readability. |
85 | * Defined up here so we don't instantiate them repeatedly in loops. | 85 | * Defined up here so we don't instantiate them repeatedly in loops. |
86 | **/ | 86 | **/ |
87 | public $regexps = array( | 87 | public $regexps = array( |
88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', | 88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', |
89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', | 89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', | 90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', |
91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', | 91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', |
92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', | 92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', |
93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', | 93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', |
94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', | 94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', |
95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() | 95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() |
96 | 'normalize' => '/\s{2,}/', | 96 | 'normalize' => '/\s{2,}/', |
97 | 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/', | 97 | 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/', |
98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', | 98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', |
99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' | 99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' |
100 | ); | 100 | ); |
101 | 101 | ||
102 | /* constants */ | 102 | /* constants */ |
103 | const FLAG_STRIP_UNLIKELYS = 1; | 103 | const FLAG_STRIP_UNLIKELYS = 1; |
104 | const FLAG_WEIGHT_CLASSES = 2; | 104 | const FLAG_WEIGHT_CLASSES = 2; |
105 | const FLAG_CLEAN_CONDITIONALLY = 4; | 105 | const FLAG_CLEAN_CONDITIONALLY = 4; |
106 | 106 | ||
/**
 * Create instance of Readability.
 *
 * Before parsing, runs of two or more <br> tags become paragraph breaks,
 * <font> tags are rewritten as <span>, and the markup is converted from
 * UTF-8 to HTML entities. Blank input is replaced by an empty <html>
 * document.
 *
 * @param string UTF-8 encoded string
 * @param string (optional) URL associated with HTML (used for footnotes)
 * @param string which parser to use for turning raw HTML into a DOMDocument
 *               (either 'libxml' or 'html5lib'); libxml is also used when
 *               the html5lib parse does not yield a document
 */
function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;

    // Normalise the raw markup before handing it to a parser.
    $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    if (trim($html) == '') {
        $html = '<html></html>';
    }

    // html5lib is only tried when explicitly requested.
    $document = ($parser == 'html5lib') ? HTML5_Parser::parse($html) : null;
    if (!$document) {
        $document = new DOMDocument();
        $document->preserveWhiteSpace = false;
        // Warnings from loadHTML() are suppressed on purpose.
        @$document->loadHTML($html);
    }

    $this->dom = $document;
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130 | 130 | ||
/**
 * Accessor for the article title element.
 *
 * Returns whatever is stored in $this->articleTitle; init() assigns the
 * generated <h1> element to it.
 *
 * @return DOMElement
 */
public function getTitle()
{
    return $this->articleTitle;
}
138 | 138 | ||
/**
 * Accessor for the article content element.
 *
 * Returns whatever is stored in $this->articleContent; init() assigns the
 * extracted (or placeholder) content element to it.
 *
 * @return DOMElement
 */
public function getContent()
{
    return $this->articleContent;
}
146 | 146 | ||
/**
 * Runs readability.
 *
 * Steps:
 *  1. Strip scripts and remember the first <body> element and its HTML.
 *  2. Prepare the document (prepDocument()).
 *  3. Build the title (getArticleTitle()) and grab the article
 *     (grabArticle()); when nothing is found a placeholder
 *     "readability-content" div is used and the run is marked as failed.
 *  4. Replace the body's content with
 *     <div id="readOverlay"><div id="readInner">title + content</div></div>.
 *  5. Post-process the content and expose title/content on the instance.
 *
 * @return boolean true if we found content, false otherwise (also false
 *                 when the DOM has no document element)
 **/
public function init()
{
    if (!isset($this->dom->documentElement)) {
        return false;
    }
    $this->removeScripts($this->dom);

    // Optimistic default; reset below if no article could be grabbed.
    $this->success = true;

    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        $firstBody = $bodies->item(0);
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    /* Wrapper elements of readability's DOM tree */
    $overlay = $this->dom->createElement('div');
    $overlay->setAttribute('id', 'readOverlay');
    $innerDiv = $this->dom->createElement('div');
    $innerDiv->setAttribute('id', 'readInner');

    $articleTitle = $this->getArticleTitle();
    $articleContent = $this->grabArticle();

    if (!$articleContent) {
        // Nothing usable was extracted: fall back to an apology paragraph.
        $this->success = false;
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('id', 'readability-content');
        $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    /* Assemble overlay > inner > (title, content) */
    $innerDiv->appendChild($articleTitle);
    $innerDiv->appendChild($articleContent);
    $overlay->appendChild($innerDiv);

    /* Swap the old body content for the new structure */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($articleContent);

    // Publish the results on the instance.
    $this->articleTitle = $articleTitle;
    $this->articleContent = $articleContent;

    return $this->success;
}
219 | 219 | ||
/**
 * Debug output helper.
 *
 * Echoes the message prefixed with "* " and followed by a newline, but
 * only while $this->debug is enabled.
 *
 * @param string $msg message to print
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }
    echo '* ', $msg, "\n";
}
226 | 226 | ||
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only adds footnotes, and only when
 * $this->convertLinksToFootnotes is enabled and the source URL does not
 * match wikipedia.org.
 *
 * @param DOMElement
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }
    if (preg_match('/wikipedia\.org/', @$this->url)) {
        return;
    }
    $this->addFootnotes($articleContent);
}
238 | 238 | ||
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's <title> element and tries to
 * strip site-name decoration:
 *  - a " | " or " - " separator: keep the part before the last separator,
 *    or the part after the first one if that leaves fewer than 3 words;
 *  - otherwise a ": " separator: keep the part after the last colon, or
 *    the part after the first colon if that leaves fewer than 3 words;
 *  - otherwise, for titles longer than 150 or shorter than 15 bytes, use
 *    the document's <h1> when there is exactly one.
 * If the trimmed result has 4 words or fewer, the original title is used.
 *
 * @return DOMElement
 */
protected function getArticleTitle()
{
    $origTitle = '';
    try {
        $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    } catch (Exception $e) {}
    $curTitle = $origTitle;

    $titleLength = strlen($curTitle);

    if (preg_match('/ [\|\-] /', $curTitle)) {
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    } elseif ($titleLength > 150 || $titleLength < 15) {
        $headings = $this->dom->getElementsByTagName('h1');
        if ($headings->length == 1) {
            $curTitle = $this->getInnerText($headings->item(0));
        }
    }

    $curTitle = trim($curTitle);

    // Very short results are treated as over-trimmed; revert to the original.
    $wordCount = count(explode(' ', $curTitle));
    if ($wordCount <= 4) {
        $curTitle = $origTitle;
    }

    $heading = $this->dom->createElement('h1');
    $heading->innerHTML = $curTitle;

    return $heading;
}
288 | 288 | ||
/**
 * Get the parsed document ready for scoring.
 *
 * Guarantees that $this->body exists and carries the id "readabilityBody",
 * then strips every <style> element found in the document.
 *
 * @return void
 **/
protected function prepDocument() {
    /**
     * Badly broken HTML may leave us without a body element. In that case
     * build an empty one and hang it off the document root.
     */
    if ($this->body == null)
    {
        $this->body = $this->dom->createElement('body');
        $root = $this->dom->documentElement;
        $root->appendChild($this->body);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    /* Drop every style element. The node list is live, so walk it from the end. */
    $styleNodes = $this->dom->getElementsByTagName('style');
    $idx = $styleNodes->length;
    while ($idx-- > 0)
    {
        $styleNode = $styleNodes->item($idx);
        $styleNode->parentNode->removeChild($styleNode);
    }

    /* The JS original also rewrites double <br>s into paragraphs (and font tags into spans) here
       by editing document.body.innerHTML. Per the original author's note, the PHP port does that
       in the constructor, on the raw HTML string, before it is parsed into a DOM tree. */
}
320 | 320 | ||
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * For each eligible <a> inside $articleContent this:
 *  - inserts a superscript "[n]" reference link right after the article link,
 *  - neutralises the article link's styling and names it "readabilityLink-n",
 *  - appends an <li> to a "References" list holding a copy of the link (labelled with
 *    its title attribute, or its text when there is no title) plus the link's host name.
 * The list is appended to $articleContent only when at least one footnote was produced.
 *
 * Links whose class contains "readability-DoNotFootnote", or whose text matches
 * $this->regexps['skipFootnoteLink'], are left untouched.
 *
 * @param DOMElement $articleContent element whose links are converted (modified in place)
 * @return void
 **/
public function addFootnotes($articleContent) {
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live list: the reference links inserted below show up in it too. They are
    // tagged 'readability-DoNotFootnote', so the skip check passes over them.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++)
    {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        // Decide whether to skip before allocating any nodes for this link.
        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $linkCount++;

        // Clone first, so the copy does not inherit the style/name attributes set on the original below.
        $footnoteLink = $articleLink->cloneNode(true);

        // Host of the link itself, falling back to the host of the page being processed.
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        /** Add a superscript reference after the article link */
        $refLink = $this->dom->createElement('a');
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so this also
        // covers the case where the link is its parent's last child.
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote = $this->dom->createElement('li');
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // The label is plain text taken from the page. Insert it as a text node so
        // characters such as '&' or '<' are never re-parsed as markup.
        $label = $footnoteLink->getAttribute('title');
        if ($label === '') {
            $label = (string)$linkText;
        }
        while ($footnoteLink->firstChild) {
            $footnoteLink->removeChild($footnoteLink->firstChild);
        }
        $footnoteLink->appendChild($this->dom->createTextNode($label));
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            // Same reasoning: the host name goes in as text, not as HTML.
            $domainNote = $this->dom->createElement('small');
            $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
            $footnote->appendChild($domainNote);
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387 | 387 | ||
/**
 * Turn every <p class="readability-styled"> below the given element back
 * into a plain text node holding the paragraph's text content.
 *
 * @param DOMElement $articleContent subtree to process (modified in place)
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $doc = $articleContent->ownerDocument;
    $finder = new DOMXPath($doc);
    $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Walk the matches back to front, replacing each wrapper with its text.
    $idx = $styled->length;
    while ($idx-- > 0) {
        $wrapper = $styled->item($idx);
        $plainText = $doc->createTextNode($wrapper->textContent);
        $wrapper->parentNode->replaceChild($plainText, $wrapper);
    }
}
407 | 407 | ||
/**
 * Prepare the article node for display. Clean out any inline styles,
 * iframes, forms, strip extraneous <p> tags, etc.
 *
 * Steps, in order: style and break cleanup; optional reverting of
 * 'readability-styled' paragraphs (when $this->revertForcedParagraphElements is set);
 * removal of form/object/h1/iframe and, unless $this->lightClean is set, of a lone h2;
 * header cleanup; conditional cleanup of table/ul/div; removal of empty paragraphs;
 * and finally removal of <br> tags that directly precede a <p>.
 *
 * @param DOMElement $articleContent article subtree (modified in place)
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
    ***/
    if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: those with no text and no embedded media. */
    $mediaTags = array('img', 'embed', 'object', 'iframe');
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    // The list is live, so iterate from the end while removing.
    for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
    {
        $paragraph = $articleParagraphs->item($i);

        $hasMedia = false;
        foreach ($mediaTags as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                $hasMedia = true;
                break;
            }
        }

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '')
        {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        $withoutBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        // preg_replace() yields null when PCRE fails (e.g. backtrack limit). Writing
        // that back would wipe the whole article, so keep the content as it is instead.
        if ($withoutBreaks !== null) {
            $articleContent->innerHTML = $withoutBreaks;
        } else {
            $this->dbg('Cleaning innerHTML of breaks failed (PCRE error ' . preg_last_error() . '). Leaving content unchanged.');
        }
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466 | 466 | ||
/**
 * Attach a 'readability' attribute (the content score) to a node and seed it.
 *
 * The score starts at 0, is adjusted by a fixed amount depending on the tag
 * name, and then has the result of getClassWeight() for the node added
 * (per the original note, that weight reflects special class/id names).
 *
 * @param DOMElement $node element to initialise (modified in place)
 * @return void
 **/
protected function initializeNode($node) {
    $score = $this->dom->createAttribute('readability');
    $score->value = 0; // this is our contentScore
    $node->setAttributeNode($score);

    // Starting adjustment per tag; tags not listed keep a base of 0.
    $tagAdjustments = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    // Upper-cased defensively, as in the original: tag-name casing is not guaranteed here.
    $tag = strtoupper($node->tagName);
    if (isset($tagAdjustments[$tag])) {
        $score->value += $tagAdjustments[$tag];
    }

    $score->value += $this->getClassWeight($node);
}
513 | 513 | ||
514 | /*** | 514 | /*** |
515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | 515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | 516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
517 | * | 517 | * |
518 | * @return DOMElement | 518 | * @return DOMElement |
519 | **/ | 519 | **/ |
520 | protected function grabArticle($page=null) { | 520 | protected function grabArticle($page=null) { |
521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); | 521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); |
522 | if (!$page) $page = $this->dom; | 522 | if (!$page) $page = $this->dom; |
523 | $allElements = $page->getElementsByTagName('*'); | 523 | $allElements = $page->getElementsByTagName('*'); |
524 | /** | 524 | /** |
525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | 525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs |
526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | 526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) |
527 | * | 527 | * |
528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | 528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
529 | * TODO: Shouldn't this be a reverse traversal? | 529 | * TODO: Shouldn't this be a reverse traversal? |
530 | **/ | 530 | **/ |
531 | $node = null; | 531 | $node = null; |
532 | $nodesToScore = array(); | 532 | $nodesToScore = array(); |
533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { | 533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { |
534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { | 534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { |
535 | //$node = $targetList->item($nodeIndex); | 535 | //$node = $targetList->item($nodeIndex); |
536 | $tagName = strtoupper($node->tagName); | 536 | $tagName = strtoupper($node->tagName); |
537 | /* Remove unlikely candidates */ | 537 | /* Remove unlikely candidates */ |
538 | if ($stripUnlikelyCandidates) { | 538 | if ($stripUnlikelyCandidates) { |
539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); | 539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); |
540 | if ( | 540 | if ( |
541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && | 541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && |
542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && | 542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && |
543 | $tagName != 'BODY' | 543 | $tagName != 'BODY' |
544 | ) | 544 | ) |
545 | { | 545 | { |
546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); | 546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); |
547 | //$nodesToRemove[] = $node; | 547 | //$nodesToRemove[] = $node; |
548 | $node->parentNode->removeChild($node); | 548 | $node->parentNode->removeChild($node); |
549 | $nodeIndex--; | 549 | $nodeIndex--; |
550 | continue; | 550 | continue; |
551 | } | 551 | } |
552 | } | 552 | } |
553 | 553 | ||
554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { | 554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { |
555 | $nodesToScore[] = $node; | 555 | $nodesToScore[] = $node; |
556 | } | 556 | } |
557 | 557 | ||
558 | /* Turn all divs that don't have children block level elements into p's */ | 558 | /* Turn all divs that don't have children block level elements into p's */ |
559 | if ($tagName == 'DIV') { | 559 | if ($tagName == 'DIV') { |
560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { | 560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { |
561 | //$this->dbg('Altering div to p'); | 561 | //$this->dbg('Altering div to p'); |
562 | $newNode = $this->dom->createElement('p'); | 562 | $newNode = $this->dom->createElement('p'); |
563 | try { | 563 | try { |
564 | $newNode->innerHTML = $node->innerHTML; | 564 | $newNode->innerHTML = $node->innerHTML; |
565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); | 565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); |
566 | $node->parentNode->replaceChild($newNode, $node); | 566 | $node->parentNode->replaceChild($newNode, $node); |
567 | $nodeIndex--; | 567 | $nodeIndex--; |
568 | $nodesToScore[] = $node; // or $newNode? | 568 | $nodesToScore[] = $node; // or $newNode? |
569 | } | 569 | } |
570 | catch(Exception $e) { | 570 | catch(Exception $e) { |
571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); | 571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); |
572 | } | 572 | } |
573 | } | 573 | } |
574 | else | 574 | else |
575 | { | 575 | { |
576 | /* EXPERIMENTAL */ | 576 | /* EXPERIMENTAL */ |
577 | // TODO: change these p elements back to text nodes after processing | 577 | // TODO: change these p elements back to text nodes after processing |
578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { | 578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { |
579 | $childNode = $node->childNodes->item($i); | 579 | $childNode = $node->childNodes->item($i); |
580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE | 580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE |
581 | //$this->dbg('replacing text node with a p tag with the same content.'); | 581 | //$this->dbg('replacing text node with a p tag with the same content.'); |
582 | $p = $this->dom->createElement('p'); | 582 | $p = $this->dom->createElement('p'); |
583 | $p->innerHTML = $childNode->nodeValue; | 583 | $p->innerHTML = $childNode->nodeValue; |
584 | $p->setAttribute('style', 'display: inline;'); | 584 | $p->setAttribute('style', 'display: inline;'); |
585 | $p->setAttribute('class', 'readability-styled'); | 585 | $p->setAttribute('class', 'readability-styled'); |
586 | $childNode->parentNode->replaceChild($p, $childNode); | 586 | $childNode->parentNode->replaceChild($p, $childNode); |
587 | } | 587 | } |
588 | } | 588 | } |
589 | } | 589 | } |
590 | } | 590 | } |
591 | } | 591 | } |
592 | 592 | ||
593 | /** | 593 | /** |
594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. | 594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
595 | * Then add their score to their parent node. | 595 | * Then add their score to their parent node. |
596 | * | 596 | * |
597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. | 597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. |
598 | **/ | 598 | **/ |
599 | $candidates = array(); | 599 | $candidates = array(); |
600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { | 600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { |
601 | $parentNode = $nodesToScore[$pt]->parentNode; | 601 | $parentNode = $nodesToScore[$pt]->parentNode; |
602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; | 602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; |
603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); | 603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); |
604 | $innerText = $this->getInnerText($nodesToScore[$pt]); | 604 | $innerText = $this->getInnerText($nodesToScore[$pt]); |
605 | 605 | ||
606 | if (!$parentNode || !isset($parentNode->tagName)) { | 606 | if (!$parentNode || !isset($parentNode->tagName)) { |
607 | continue; | 607 | continue; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* If this paragraph is less than 25 characters, don't even count it. */ | 610 | /* If this paragraph is less than 25 characters, don't even count it. */ |
611 | if(strlen($innerText) < 25) { | 611 | if(strlen($innerText) < 25) { |
612 | continue; | 612 | continue; |
613 | } | 613 | } |
614 | 614 | ||
615 | /* Initialize readability data for the parent. */ | 615 | /* Initialize readability data for the parent. */ |
616 | if (!$parentNode->hasAttribute('readability')) | 616 | if (!$parentNode->hasAttribute('readability')) |
617 | { | 617 | { |
618 | $this->initializeNode($parentNode); | 618 | $this->initializeNode($parentNode); |
619 | $candidates[] = $parentNode; | 619 | $candidates[] = $parentNode; |
620 | } | 620 | } |
621 | 621 | ||
622 | /* Initialize readability data for the grandparent. */ | 622 | /* Initialize readability data for the grandparent. */ |
623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) | 623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) |
624 | { | 624 | { |
625 | $this->initializeNode($grandParentNode); | 625 | $this->initializeNode($grandParentNode); |
626 | $candidates[] = $grandParentNode; | 626 | $candidates[] = $grandParentNode; |
627 | } | 627 | } |
628 | 628 | ||
629 | $contentScore = 0; | 629 | $contentScore = 0; |
630 | 630 | ||
631 | /* Add a point for the paragraph itself as a base. */ | 631 | /* Add a point for the paragraph itself as a base. */ |
632 | $contentScore++; | 632 | $contentScore++; |
633 | 633 | ||
634 | /* Add points for any commas within this paragraph */ | 634 | /* Add points for any commas within this paragraph */ |
635 | $contentScore += count(explode(',', $innerText)); | 635 | $contentScore += count(explode(',', $innerText)); |
636 | 636 | ||
637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | 637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
638 | $contentScore += min(floor(strlen($innerText) / 100), 3); | 638 | $contentScore += min(floor(strlen($innerText) / 100), 3); |
639 | 639 | ||
640 | /* Add the score to the parent. The grandparent gets half. */ | 640 | /* Add the score to the parent. The grandparent gets half. */ |
641 | $parentNode->getAttributeNode('readability')->value += $contentScore; | 641 | $parentNode->getAttributeNode('readability')->value += $contentScore; |
642 | 642 | ||
643 | if ($grandParentNode) { | 643 | if ($grandParentNode) { |
644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; | 644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; |
645 | } | 645 | } |
646 | } | 646 | } |
647 | 647 | ||
648 | /** | 648 | /** |
649 | * After we've calculated scores, loop through all of the possible candidate nodes we found | 649 | * After we've calculated scores, loop through all of the possible candidate nodes we found |
650 | * and find the one with the highest score. | 650 | * and find the one with the highest score. |
651 | **/ | 651 | **/ |
652 | $topCandidate = null; | 652 | $topCandidate = null; |
653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) | 653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) |
654 | { | 654 | { |
655 | /** | 655 | /** |
656 | * Scale the final candidates score based on link density. Good content should have a | 656 | * Scale the final candidates score based on link density. Good content should have a |
657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. | 657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. |
658 | **/ | 658 | **/ |
659 | $readability = $candidates[$c]->getAttributeNode('readability'); | 659 | $readability = $candidates[$c]->getAttributeNode('readability'); |
660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); | 660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); |
661 | 661 | ||
662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); | 662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); |
663 | 663 | ||
664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { | 664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { |
665 | $topCandidate = $candidates[$c]; | 665 | $topCandidate = $candidates[$c]; |
666 | } | 666 | } |
667 | } | 667 | } |
668 | 668 | ||
669 | /** | 669 | /** |
670 | * If we still have no top candidate, just use the body as a last resort. | 670 | * If we still have no top candidate, just use the body as a last resort. |
671 | * We also have to copy the body node so it is something we can modify. | 671 | * We also have to copy the body node so it is something we can modify. |
672 | **/ | 672 | **/ |
673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') | 673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') |
674 | { | 674 | { |
675 | $topCandidate = $this->dom->createElement('div'); | 675 | $topCandidate = $this->dom->createElement('div'); |
676 | if ($page instanceof DOMDocument) { | 676 | if ($page instanceof DOMDocument) { |
677 | if (!isset($page->documentElement)) { | 677 | if (!isset($page->documentElement)) { |
678 | // we don't have a body either? what a mess! :) | 678 | // we don't have a body either? what a mess! :) |
679 | } else { | 679 | } else { |
680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; | 680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; |
681 | $page->documentElement->innerHTML = ''; | 681 | $page->documentElement->innerHTML = ''; |
682 | $page->documentElement->appendChild($topCandidate); | 682 | $page->documentElement->appendChild($topCandidate); |
683 | } | 683 | } |
684 | } else { | 684 | } else { |
685 | $topCandidate->innerHTML = $page->innerHTML; | 685 | $topCandidate->innerHTML = $page->innerHTML; |
686 | $page->innerHTML = ''; | 686 | $page->innerHTML = ''; |
687 | $page->appendChild($topCandidate); | 687 | $page->appendChild($topCandidate); |
688 | } | 688 | } |
689 | $this->initializeNode($topCandidate); | 689 | $this->initializeNode($topCandidate); |
690 | } | 690 | } |
691 | 691 | ||
692 | /** | 692 | /** |
693 | * Now that we have the top candidate, look through its siblings for content that might also be related. | 693 | * Now that we have the top candidate, look through its siblings for content that might also be related. |
694 | * Things like preambles, content split by ads that we removed, etc. | 694 | * Things like preambles, content split by ads that we removed, etc. |
695 | **/ | 695 | **/ |
696 | $articleContent = $this->dom->createElement('div'); | 696 | $articleContent = $this->dom->createElement('div'); |
697 | $articleContent->setAttribute('id', 'readability-content'); | 697 | $articleContent->setAttribute('id', 'readability-content'); |
698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); | 698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); |
699 | $siblingNodes = $topCandidate->parentNode->childNodes; | 699 | $siblingNodes = $topCandidate->parentNode->childNodes; |
700 | if (!isset($siblingNodes)) { | 700 | if (!isset($siblingNodes)) { |
701 | $siblingNodes = new stdClass; | 701 | $siblingNodes = new stdClass; |
702 | $siblingNodes->length = 0; | 702 | $siblingNodes->length = 0; |
703 | } | 703 | } |
704 | 704 | ||
705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) | 705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) |
706 | { | 706 | { |
707 | $siblingNode = $siblingNodes->item($s); | 707 | $siblingNode = $siblingNodes->item($s); |
708 | $append = false; | 708 | $append = false; |
709 | 709 | ||
710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); | 710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); |
711 | 711 | ||
712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); | 712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); |
713 | 713 | ||
714 | if ($siblingNode === $topCandidate) | 714 | if ($siblingNode === $topCandidate) |
715 | // or if ($siblingNode->isSameNode($topCandidate)) | 715 | // or if ($siblingNode->isSameNode($topCandidate)) |
716 | { | 716 | { |
717 | $append = true; | 717 | $append = true; |
718 | } | 718 | } |
719 | 719 | ||
720 | $contentBonus = 0; | 720 | $contentBonus = 0; |
721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ | 721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ |
722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { | 722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { |
723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; | 723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; |
724 | } | 724 | } |
725 | 725 | ||
726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) | 726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) |
727 | { | 727 | { |
728 | $append = true; | 728 | $append = true; |
729 | } | 729 | } |
730 | 730 | ||
731 | if (strtoupper($siblingNode->nodeName) == 'P') { | 731 | if (strtoupper($siblingNode->nodeName) == 'P') { |
732 | $linkDensity = $this->getLinkDensity($siblingNode); | 732 | $linkDensity = $this->getLinkDensity($siblingNode); |
733 | $nodeContent = $this->getInnerText($siblingNode); | 733 | $nodeContent = $this->getInnerText($siblingNode); |
734 | $nodeLength = strlen($nodeContent); | 734 | $nodeLength = strlen($nodeContent); |
735 | 735 | ||
736 | if ($nodeLength > 80 && $linkDensity < 0.25) | 736 | if ($nodeLength > 80 && $linkDensity < 0.25) |
737 | { | 737 | { |
738 | $append = true; | 738 | $append = true; |
739 | } | 739 | } |
740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) | 740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) |
741 | { | 741 | { |
742 | $append = true; | 742 | $append = true; |
743 | } | 743 | } |
744 | } | 744 | } |
745 | 745 | ||
746 | if ($append) | 746 | if ($append) |
747 | { | 747 | { |
748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); | 748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); |
749 | 749 | ||
750 | $nodeToAppend = null; | 750 | $nodeToAppend = null; |
751 | $sibNodeName = strtoupper($siblingNode->nodeName); | 751 | $sibNodeName = strtoupper($siblingNode->nodeName); |
752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { | 752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { |
753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | 753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
754 | 754 | ||
755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); | 755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); |
756 | $nodeToAppend = $this->dom->createElement('div'); | 756 | $nodeToAppend = $this->dom->createElement('div'); |
757 | try { | 757 | try { |
758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); | 758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); |
759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; | 759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; |
760 | } | 760 | } |
761 | catch(Exception $e) | 761 | catch(Exception $e) |
762 | { | 762 | { |
763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); | 763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); |
764 | $nodeToAppend = $siblingNode; | 764 | $nodeToAppend = $siblingNode; |
765 | $s--; | 765 | $s--; |
766 | $sl--; | 766 | $sl--; |
767 | } | 767 | } |
768 | } else { | 768 | } else { |
769 | $nodeToAppend = $siblingNode; | 769 | $nodeToAppend = $siblingNode; |
770 | $s--; | 770 | $s--; |
771 | $sl--; | 771 | $sl--; |
772 | } | 772 | } |
773 | 773 | ||
774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ | 774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ |
775 | $nodeToAppend->removeAttribute('class'); | 775 | $nodeToAppend->removeAttribute('class'); |
776 | 776 | ||
777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ | 777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ |
778 | $articleContent->appendChild($nodeToAppend); | 778 | $articleContent->appendChild($nodeToAppend); |
779 | } | 779 | } |
780 | } | 780 | } |
781 | 781 | ||
782 | /** | 782 | /** |
783 | * So we have all of the content that we need. Now we clean it up for presentation. | 783 | * So we have all of the content that we need. Now we clean it up for presentation. |
784 | **/ | 784 | **/ |
785 | $this->prepArticle($articleContent); | 785 | $this->prepArticle($articleContent); |
786 | 786 | ||
787 | /** | 787 | /** |
788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. | 788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. |
789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | 789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | 790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of |
791 | * finding the -right- content. | 791 | * finding the -right- content. |
792 | **/ | 792 | **/ |
793 | if (strlen($this->getInnerText($articleContent, false)) < 250) | 793 | if (strlen($this->getInnerText($articleContent, false)) < 250) |
794 | { | 794 | { |
795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 | 795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 |
796 | // in the meantime, we check and create an empty element if it's not there. | 796 | // in the meantime, we check and create an empty element if it's not there. |
797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); | 797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); |
798 | $this->body->innerHTML = $this->bodyCache; | 798 | $this->body->innerHTML = $this->bodyCache; |
799 | 799 | ||
800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { | 800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { |
801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); | 801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); |
802 | return $this->grabArticle($this->body); | 802 | return $this->grabArticle($this->body); |
803 | } | 803 | } |
804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | 804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { |
805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); | 805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); |
806 | return $this->grabArticle($this->body); | 806 | return $this->grabArticle($this->body); |
807 | } | 807 | } |
808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | 808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { |
809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); | 809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); |
810 | return $this->grabArticle($this->body); | 810 | return $this->grabArticle($this->body); |
811 | } | 811 | } |
812 | else { | 812 | else { |
813 | return false; | 813 | return false; |
814 | } | 814 | } |
815 | } | 815 | } |
816 | return $articleContent; | 816 | return $articleContent; |
817 | } | 817 | } |
818 | 818 | ||
819 | /** | 819 | /** |
820 | * Remove script tags from document | 820 | * Remove script tags from document |
821 | * | 821 | * |
822 | * @param DOMElement | 822 | * @param DOMElement |
823 | * @return void | 823 | * @return void |
824 | */ | 824 | */ |
825 | public function removeScripts($doc) { | 825 | public function removeScripts($doc) { |
826 | $scripts = $doc->getElementsByTagName('script'); | 826 | $scripts = $doc->getElementsByTagName('script'); |
827 | for($i = $scripts->length-1; $i >= 0; $i--) | 827 | for($i = $scripts->length-1; $i >= 0; $i--) |
828 | { | 828 | { |
829 | $scripts->item($i)->parentNode->removeChild($scripts->item($i)); | 829 | $scripts->item($i)->parentNode->removeChild($scripts->item($i)); |
830 | } | 830 | } |
831 | } | 831 | } |
832 | 832 | ||
833 | /** | 833 | /** |
834 | * Get the inner text of a node. | 834 | * Get the inner text of a node. |
835 | * This also strips out any excess whitespace to be found. | 835 | * This also strips out any excess whitespace to be found. |
836 | * | 836 | * |
837 | * @param DOMElement $ | 837 | * @param DOMElement $ |
838 | * @param boolean $normalizeSpaces (default: true) | 838 | * @param boolean $normalizeSpaces (default: true) |
839 | * @return string | 839 | * @return string |
840 | **/ | 840 | **/ |
841 | public function getInnerText($e, $normalizeSpaces=true) { | 841 | public function getInnerText($e, $normalizeSpaces=true) { |
842 | $textContent = ''; | 842 | $textContent = ''; |
843 | 843 | ||
844 | if (!isset($e->textContent) || $e->textContent == '') { | 844 | if (!isset($e->textContent) || $e->textContent == '') { |
845 | return ''; | 845 | return ''; |
846 | } | 846 | } |
847 | 847 | ||
848 | $textContent = trim($e->textContent); | 848 | $textContent = trim($e->textContent); |
849 | 849 | ||
850 | if ($normalizeSpaces) { | 850 | if ($normalizeSpaces) { |
851 | return preg_replace($this->regexps['normalize'], ' ', $textContent); | 851 | return preg_replace($this->regexps['normalize'], ' ', $textContent); |
852 | } else { | 852 | } else { |
853 | return $textContent; | 853 | return $textContent; |
854 | } | 854 | } |
855 | } | 855 | } |
856 | 856 | ||
857 | /** | 857 | /** |
858 | * Get the number of times a string $s appears in the node $e. | 858 | * Get the number of times a string $s appears in the node $e. |
859 | * | 859 | * |
860 | * @param DOMElement $e | 860 | * @param DOMElement $e |
861 | * @param string - what to count. Default is "," | 861 | * @param string - what to count. Default is "," |
862 | * @return number (integer) | 862 | * @return number (integer) |
863 | **/ | 863 | **/ |
864 | public function getCharCount($e, $s=',') { | 864 | public function getCharCount($e, $s=',') { |
865 | return substr_count($this->getInnerText($e), $s); | 865 | return substr_count($this->getInnerText($e), $s); |
866 | } | 866 | } |
867 | 867 | ||
868 | /** | 868 | /** |
869 | * Remove the style attribute on every $e and under. | 869 | * Remove the style attribute on every $e and under. |
870 | * | 870 | * |
871 | * @param DOMElement $e | 871 | * @param DOMElement $e |
872 | * @return void | 872 | * @return void |
873 | */ | 873 | */ |
874 | public function cleanStyles($e) { | 874 | public function cleanStyles($e) { |
875 | if (!is_object($e)) return; | 875 | if (!is_object($e)) return; |
876 | $elems = $e->getElementsByTagName('*'); | 876 | $elems = $e->getElementsByTagName('*'); |
877 | foreach ($elems as $elem) { | 877 | foreach ($elems as $elem) { |
878 | $elem->removeAttribute('style'); | 878 | $elem->removeAttribute('style'); |
879 | } | 879 | } |
880 | } | 880 | } |
881 | 881 | ||
882 | /** | 882 | /** |
883 | * Get the density of links as a percentage of the content | 883 | * Get the density of links as a percentage of the content |
884 | * This is the amount of text that is inside a link divided by the total text in the node. | 884 | * This is the amount of text that is inside a link divided by the total text in the node. |
885 | * | 885 | * |
886 | * @param DOMElement $e | 886 | * @param DOMElement $e |
887 | * @return number (float) | 887 | * @return number (float) |
888 | */ | 888 | */ |
889 | public function getLinkDensity($e) { | 889 | public function getLinkDensity($e) { |
890 | $links = $e->getElementsByTagName('a'); | 890 | $links = $e->getElementsByTagName('a'); |
891 | $textLength = strlen($this->getInnerText($e)); | 891 | $textLength = strlen($this->getInnerText($e)); |
892 | $linkLength = 0; | 892 | $linkLength = 0; |
893 | for ($i=0, $il=$links->length; $i < $il; $i++) | 893 | for ($i=0, $il=$links->length; $i < $il; $i++) |
894 | { | 894 | { |
895 | $linkLength += strlen($this->getInnerText($links->item($i))); | 895 | $linkLength += strlen($this->getInnerText($links->item($i))); |
896 | } | 896 | } |
897 | if ($textLength > 0) { | 897 | if ($textLength > 0) { |
898 | return $linkLength / $textLength; | 898 | return $linkLength / $textLength; |
899 | } else { | 899 | } else { |
900 | return 0; | 900 | return 0; |
901 | } | 901 | } |
902 | } | 902 | } |
903 | 903 | ||
904 | /** | 904 | /** |
905 | * Get an elements class/id weight. Uses regular expressions to tell if this | 905 | * Get an elements class/id weight. Uses regular expressions to tell if this |
906 | * element looks good or bad. | 906 | * element looks good or bad. |
907 | * | 907 | * |
908 | * @param DOMElement $e | 908 | * @param DOMElement $e |
909 | * @return number (Integer) | 909 | * @return number (Integer) |
910 | */ | 910 | */ |
911 | public function getClassWeight($e) { | 911 | public function getClassWeight($e) { |
912 | if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | 912 | if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { |
913 | return 0; | 913 | return 0; |
914 | } | 914 | } |
915 | 915 | ||
916 | $weight = 0; | 916 | $weight = 0; |
917 | 917 | ||
918 | /* Look for a special classname */ | 918 | /* Look for a special classname */ |
919 | if ($e->hasAttribute('class') && $e->getAttribute('class') != '') | 919 | if ($e->hasAttribute('class') && $e->getAttribute('class') != '') |
920 | { | 920 | { |
921 | if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { | 921 | if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) { |
922 | $weight -= 25; | 922 | $weight -= 25; |
923 | } | 923 | } |
924 | if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { | 924 | if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) { |
925 | $weight += 25; | 925 | $weight += 25; |
926 | } | 926 | } |
927 | } | 927 | } |
928 | 928 | ||
929 | /* Look for a special ID */ | 929 | /* Look for a special ID */ |
930 | if ($e->hasAttribute('id') && $e->getAttribute('id') != '') | 930 | if ($e->hasAttribute('id') && $e->getAttribute('id') != '') |
931 | { | 931 | { |
932 | if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { | 932 | if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) { |
933 | $weight -= 25; | 933 | $weight -= 25; |
934 | } | 934 | } |
935 | if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { | 935 | if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) { |
936 | $weight += 25; | 936 | $weight += 25; |
937 | } | 937 | } |
938 | } | 938 | } |
939 | return $weight; | 939 | return $weight; |
940 | } | 940 | } |
941 | 941 | ||
942 | /** | 942 | /** |
943 | * Remove extraneous break tags from a node. | 943 | * Remove extraneous break tags from a node. |
944 | * | 944 | * |
945 | * @param DOMElement $node | 945 | * @param DOMElement $node |
946 | * @return void | 946 | * @return void |
947 | */ | 947 | */ |
948 | public function killBreaks($node) { | 948 | public function killBreaks($node) { |
949 | $html = $node->innerHTML; | 949 | $html = $node->innerHTML; |
950 | $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); | 950 | $html = preg_replace($this->regexps['killBreaks'], '<br />', $html); |
951 | $node->innerHTML = $html; | 951 | $node->innerHTML = $html; |
952 | } | 952 | } |
953 | 953 | ||
954 | /** | 954 | /** |
955 | * Clean a node of all elements of type "tag". | 955 | * Clean a node of all elements of type "tag". |
956 | * (Unless it's a youtube/vimeo video. People love movies.) | 956 | * (Unless it's a youtube/vimeo video. People love movies.) |
957 | * | 957 | * |
958 | * Updated 2012-09-18 to preserve youtube/vimeo iframes | 958 | * Updated 2012-09-18 to preserve youtube/vimeo iframes |
959 | * | 959 | * |
960 | * @param DOMElement $e | 960 | * @param DOMElement $e |
961 | * @param string $tag | 961 | * @param string $tag |
962 | * @return void | 962 | * @return void |
963 | */ | 963 | */ |
964 | public function clean($e, $tag) { | 964 | public function clean($e, $tag) { |
965 | $targetList = $e->getElementsByTagName($tag); | 965 | $targetList = $e->getElementsByTagName($tag); |
966 | $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); | 966 | $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); |
967 | 967 | ||
968 | for ($y=$targetList->length-1; $y >= 0; $y--) { | 968 | for ($y=$targetList->length-1; $y >= 0; $y--) { |
969 | /* Allow youtube and vimeo videos through as people usually want to see those. */ | 969 | /* Allow youtube and vimeo videos through as people usually want to see those. */ |
970 | if ($isEmbed) { | 970 | if ($isEmbed) { |
971 | $attributeValues = ''; | 971 | $attributeValues = ''; |
972 | for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { | 972 | for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { |
973 | $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) | 973 | $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) |
974 | } | 974 | } |
975 | 975 | ||
976 | /* First, check the elements attributes to see if any of them contain youtube or vimeo */ | 976 | /* First, check the elements attributes to see if any of them contain youtube or vimeo */ |
977 | if (preg_match($this->regexps['video'], $attributeValues)) { | 977 | if (preg_match($this->regexps['video'], $attributeValues)) { |
978 | continue; | 978 | continue; |
979 | } | 979 | } |
980 | 980 | ||
981 | /* Then check the elements inside this element for the same. */ | 981 | /* Then check the elements inside this element for the same. */ |
982 | if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { | 982 | if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) { |
983 | continue; | 983 | continue; |
984 | } | 984 | } |
985 | } | 985 | } |
986 | $targetList->item($y)->parentNode->removeChild($targetList->item($y)); | 986 | $targetList->item($y)->parentNode->removeChild($targetList->item($y)); |
987 | } | 987 | } |
988 | } | 988 | } |
989 | 989 | ||
990 | /** | 990 | /** |
991 | * Clean an element of all tags of type "tag" if they look fishy. | 991 | * Clean an element of all tags of type "tag" if they look fishy. |
992 | * "Fishy" is an algorithm based on content length, classnames, | 992 | * "Fishy" is an algorithm based on content length, classnames, |
993 | * link density, number of images & embeds, etc. | 993 | * link density, number of images & embeds, etc. |
994 | * | 994 | * |
995 | * @param DOMElement $e | 995 | * @param DOMElement $e |
996 | * @param string $tag | 996 | * @param string $tag |
997 | * @return void | 997 | * @return void |
998 | */ | 998 | */ |
999 | public function cleanConditionally($e, $tag) { | 999 | public function cleanConditionally($e, $tag) { |
1000 | if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | 1000 | if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { |
1001 | return; | 1001 | return; |
1002 | } | 1002 | } |
1003 | 1003 | ||
1004 | $tagsList = $e->getElementsByTagName($tag); | 1004 | $tagsList = $e->getElementsByTagName($tag); |
1005 | $curTagsLength = $tagsList->length; | 1005 | $curTagsLength = $tagsList->length; |
1006 | 1006 | ||
1007 | /** | 1007 | /** |
1008 | * Gather counts for other typical elements embedded within. | 1008 | * Gather counts for other typical elements embedded within. |
1009 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal. | 1009 | * Traverse backwards so we can remove nodes at the same time without effecting the traversal. |
1010 | * | 1010 | * |
1011 | * TODO: Consider taking into account original contentScore here. | 1011 | * TODO: Consider taking into account original contentScore here. |
1012 | */ | 1012 | */ |
1013 | for ($i=$curTagsLength-1; $i >= 0; $i--) { | 1013 | for ($i=$curTagsLength-1; $i >= 0; $i--) { |
1014 | $weight = $this->getClassWeight($tagsList->item($i)); | 1014 | $weight = $this->getClassWeight($tagsList->item($i)); |
1015 | $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; | 1015 | $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; |
1016 | 1016 | ||
1017 | $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); | 1017 | $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); |
1018 | 1018 | ||
1019 | if ($weight + $contentScore < 0) { | 1019 | if ($weight + $contentScore < 0) { |
1020 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); | 1020 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); |
1021 | } | 1021 | } |
1022 | else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { | 1022 | else if ( $this->getCharCount($tagsList->item($i), ',') < 10) { |
1023 | /** | 1023 | /** |
1024 | * If there are not very many commas, and the number of | 1024 | * If there are not very many commas, and the number of |
1025 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. | 1025 | * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. |
1026 | **/ | 1026 | **/ |
1027 | $p = $tagsList->item($i)->getElementsByTagName('p')->length; | 1027 | $p = $tagsList->item($i)->getElementsByTagName('p')->length; |
1028 | $img = $tagsList->item($i)->getElementsByTagName('img')->length; | 1028 | $img = $tagsList->item($i)->getElementsByTagName('img')->length; |
1029 | $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; | 1029 | $li = $tagsList->item($i)->getElementsByTagName('li')->length-100; |
1030 | $input = $tagsList->item($i)->getElementsByTagName('input')->length; | 1030 | $input = $tagsList->item($i)->getElementsByTagName('input')->length; |
1031 | $a = $tagsList->item($i)->getElementsByTagName('a')->length; | 1031 | $a = $tagsList->item($i)->getElementsByTagName('a')->length; |
1032 | 1032 | ||
1033 | $embedCount = 0; | 1033 | $embedCount = 0; |
1034 | $embeds = $tagsList->item($i)->getElementsByTagName('embed'); | 1034 | $embeds = $tagsList->item($i)->getElementsByTagName('embed'); |
1035 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { | 1035 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { |
1036 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { | 1036 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { |
1037 | $embedCount++; | 1037 | $embedCount++; |
1038 | } | 1038 | } |
1039 | } | 1039 | } |
1040 | $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); | 1040 | $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); |
1041 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { | 1041 | for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { |
1042 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { | 1042 | if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { |
1043 | $embedCount++; | 1043 | $embedCount++; |
1044 | } | 1044 | } |
1045 | } | 1045 | } |
1046 | 1046 | ||
1047 | $linkDensity = $this->getLinkDensity($tagsList->item($i)); | 1047 | $linkDensity = $this->getLinkDensity($tagsList->item($i)); |
1048 | $contentLength = strlen($this->getInnerText($tagsList->item($i))); | 1048 | $contentLength = strlen($this->getInnerText($tagsList->item($i))); |
1049 | $toRemove = false; | 1049 | $toRemove = false; |
1050 | 1050 | ||
1051 | if ($this->lightClean) { | 1051 | if ($this->lightClean) { |
1052 | $this->dbg('Light clean...'); | 1052 | $this->dbg('Light clean...'); |
1053 | if ( ($img > $p) && ($img > 4) ) { | 1053 | if ( ($img > $p) && ($img > 4) ) { |
1054 | $this->dbg(' more than 4 images and more image elements than paragraph elements'); | 1054 | $this->dbg(' more than 4 images and more image elements than paragraph elements'); |
1055 | $toRemove = true; | 1055 | $toRemove = true; |
1056 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { | 1056 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { |
1057 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); | 1057 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); |
1058 | $toRemove = true; | 1058 | $toRemove = true; |
1059 | } else if ( $input > floor($p/3) ) { | 1059 | } else if ( $input > floor($p/3) ) { |
1060 | $this->dbg(' too many <input> elements'); | 1060 | $this->dbg(' too many <input> elements'); |
1061 | $toRemove = true; | 1061 | $toRemove = true; |
1062 | } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { | 1062 | } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) { |
1063 | $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); | 1063 | $this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images'); |
1064 | $toRemove = true; | 1064 | $toRemove = true; |
1065 | } else if($weight < 25 && $linkDensity > 0.2) { | 1065 | } else if($weight < 25 && $linkDensity > 0.2) { |
1066 | $this->dbg(' weight smaller than 25 and link density above 0.2'); | 1066 | $this->dbg(' weight smaller than 25 and link density above 0.2'); |
1067 | $toRemove = true; | 1067 | $toRemove = true; |
1068 | } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { | 1068 | } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) { |
1069 | $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); | 1069 | $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5'); |
1070 | $toRemove = true; | 1070 | $toRemove = true; |
1071 | } else if($embedCount > 3) { | 1071 | } else if($embedCount > 3) { |
1072 | $this->dbg(' more than 3 embeds'); | 1072 | $this->dbg(' more than 3 embeds'); |
1073 | $toRemove = true; | 1073 | $toRemove = true; |
1074 | } | 1074 | } |
1075 | } else { | 1075 | } else { |
1076 | $this->dbg('Standard clean...'); | 1076 | $this->dbg('Standard clean...'); |
1077 | if ( $img > $p ) { | 1077 | if ( $img > $p ) { |
1078 | $this->dbg(' more image elements than paragraph elements'); | 1078 | $this->dbg(' more image elements than paragraph elements'); |
1079 | $toRemove = true; | 1079 | $toRemove = true; |
1080 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { | 1080 | } else if ($li > $p && $tag != 'ul' && $tag != 'ol') { |
1081 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); | 1081 | $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>'); |
1082 | $toRemove = true; | 1082 | $toRemove = true; |
1083 | } else if ( $input > floor($p/3) ) { | 1083 | } else if ( $input > floor($p/3) ) { |
1084 | $this->dbg(' too many <input> elements'); | 1084 | $this->dbg(' too many <input> elements'); |
1085 | $toRemove = true; | 1085 | $toRemove = true; |
1086 | } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { | 1086 | } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { |
1087 | $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); | 1087 | $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); |
1088 | $toRemove = true; | 1088 | $toRemove = true; |
1089 | } else if($weight < 25 && $linkDensity > 0.2) { | 1089 | } else if($weight < 25 && $linkDensity > 0.2) { |
1090 | $this->dbg(' weight smaller than 25 and link density above 0.2'); | 1090 | $this->dbg(' weight smaller than 25 and link density above 0.2'); |
1091 | $toRemove = true; | 1091 | $toRemove = true; |
1092 | } else if($weight >= 25 && $linkDensity > 0.5) { | 1092 | } else if($weight >= 25 && $linkDensity > 0.5) { |
1093 | $this->dbg(' weight above 25 but link density greater than 0.5'); | 1093 | $this->dbg(' weight above 25 but link density greater than 0.5'); |
1094 | $toRemove = true; | 1094 | $toRemove = true; |
1095 | } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { | 1095 | } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) { |
1096 | $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); | 1096 | $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed'); |
1097 | $toRemove = true; | 1097 | $toRemove = true; |
1098 | } | 1098 | } |
1099 | } | 1099 | } |
1100 | 1100 | ||
1101 | if ($toRemove) { | 1101 | if ($toRemove) { |
1102 | //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); | 1102 | //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML); |
1103 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); | 1103 | $tagsList->item($i)->parentNode->removeChild($tagsList->item($i)); |
1104 | } | 1104 | } |
1105 | } | 1105 | } |
1106 | } | 1106 | } |
1107 | } | 1107 | } |
1108 | 1108 | ||
1109 | /** | 1109 | /** |
1110 | * Clean out spurious headers from an Element. Checks things like classnames and link density. | 1110 | * Clean out spurious headers from an Element. Checks things like classnames and link density. |
1111 | * | 1111 | * |
1112 | * @param DOMElement $e | 1112 | * @param DOMElement $e |
1113 | * @return void | 1113 | * @return void |
1114 | */ | 1114 | */ |
1115 | public function cleanHeaders($e) { | 1115 | public function cleanHeaders($e) { |
1116 | for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { | 1116 | for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { |
1117 | $headers = $e->getElementsByTagName('h' . $headerIndex); | 1117 | $headers = $e->getElementsByTagName('h' . $headerIndex); |
1118 | for ($i=$headers->length-1; $i >=0; $i--) { | 1118 | for ($i=$headers->length-1; $i >=0; $i--) { |
1119 | if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { | 1119 | if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) { |
1120 | $headers->item($i)->parentNode->removeChild($headers->item($i)); | 1120 | $headers->item($i)->parentNode->removeChild($headers->item($i)); |
1121 | } | 1121 | } |
1122 | } | 1122 | } |
1123 | } | 1123 | } |
1124 | } | 1124 | } |
1125 | 1125 | ||
1126 | public function flagIsActive($flag) { | 1126 | public function flagIsActive($flag) { |
1127 | return ($this->flags & $flag) > 0; | 1127 | return ($this->flags & $flag) > 0; |
1128 | } | 1128 | } |
1129 | 1129 | ||
1130 | public function addFlag($flag) { | 1130 | public function addFlag($flag) { |
1131 | $this->flags = $this->flags | $flag; | 1131 | $this->flags = $this->flags | $flag; |
1132 | } | 1132 | } |
1133 | 1133 | ||
1134 | public function removeFlag($flag) { | 1134 | public function removeFlag($flag) { |
1135 | $this->flags = $this->flags & ~$flag; | 1135 | $this->flags = $this->flags & ~$flag; |
1136 | } | 1136 | } |
1137 | } | 1137 | } |
1138 | ?> \ No newline at end of file | 1138 | ?> \ No newline at end of file |