diff options
author | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
---|---|---|
committer | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
commit | 7f667839764621b5aa01c9db8ce5dde2a29ef18f (patch) | |
tree | 93d8241ee81c87e18494325ae02f0589a8e328a2 /inc/3rdparty/libraries/html5/InputStream.php | |
parent | a84f77d6ba15a64ff00453f5d5190c021ce460ed (diff) | |
parent | 2abcccb37180c17318f5226f5d4bc28f30b621ea (diff) | |
download | wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.gz wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.zst wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.zip |
Merge pull request #1 from inthepoche/dev
Dev
Diffstat (limited to 'inc/3rdparty/libraries/html5/InputStream.php')
-rw-r--r-- | inc/3rdparty/libraries/html5/InputStream.php | 284 |
1 files changed, 284 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/html5/InputStream.php b/inc/3rdparty/libraries/html5/InputStream.php new file mode 100644 index 00000000..f98b4272 --- /dev/null +++ b/inc/3rdparty/libraries/html5/InputStream.php | |||
@@ -0,0 +1,284 @@ | |||
1 | <?php | ||
2 | |||
3 | /* | ||
4 | |||
5 | Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> | ||
6 | |||
7 | Permission is hereby granted, free of charge, to any person obtaining a | ||
8 | copy of this software and associated documentation files (the | ||
9 | "Software"), to deal in the Software without restriction, including | ||
10 | without limitation the rights to use, copy, modify, merge, publish, | ||
11 | distribute, sublicense, and/or sell copies of the Software, and to | ||
12 | permit persons to whom the Software is furnished to do so, subject to | ||
13 | the following conditions: | ||
14 | |||
15 | The above copyright notice and this permission notice shall be included | ||
16 | in all copies or substantial portions of the Software. | ||
17 | |||
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | ||
19 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | ||
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||
24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
25 | |||
26 | */ | ||
27 | |||
28 | // Some conventions: | ||
29 | // /* */ indicates verbatim text from the HTML 5 specification | ||
30 | // // indicates regular comments | ||
31 | |||
class HTML5_InputStream {
    /**
     * The string data we're parsing (UTF-8 bytes, already preprocessed by
     * the constructor).
     */
    private $data;

    /**
     * The current integer byte position we are in $data. It may run past
     * $EOF, because char() keeps incrementing it after the end is reached.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char >= $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors collected while preprocessing the input. Each entry is
     * an array with the keys 'type' and 'data'.
     */
    public $errors = array();

    /**
     * Preprocesses the input stream: drops/replaces invalid UTF-8, strips one
     * leading BOM, replaces NUL bytes, normalises newlines to LF and records
     * a parse error for every NUL and every disallowed code point.
     *
     * @param $data Data to parse
     * @throws Exception if neither iconv nor mbstring is available
     */
    public function __construct($data) {

        /* Given an encoding, the bytes in the input stream must be
        converted to Unicode characters for the tokeniser, as
        described by the rules for that encoding, except that the
        leading U+FEFF BYTE ORDER MARK character, if any, must not
        be stripped by the encoding layer (it is stripped by the rule below).

        Bytes or sequences of bytes in the original byte stream that
        could not be converted to Unicode characters must be converted
        to U+FFFD REPLACEMENT CHARACTER code points. */

        // XXX currently assuming input data is UTF-8; once we
        // build encoding detection this will no longer be the case
        $data = $this->scrubUtf8($data);

        /* One leading U+FEFF BYTE ORDER MARK character must be
        ignored if any are present. */
        if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
            // The cast keeps $data a string when the input was only a BOM
            // (substr() returns false in that case before PHP 8).
            $data = (string) substr($data, 3);
        }

        /* All U+0000 NULL characters in the input must be replaced
        by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
        characters is a parse error. */
        $this->recordParseErrors('null-character', substr_count($data, "\0"));

        /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
        (LF) characters are treated specially. Any CR characters
        that are followed by LF characters must be removed, and any
        CR characters not followed by LF characters must be converted
        to LF characters. Thus, newlines in HTML DOMs are represented
        by LF characters, and there are never any CR characters in the
        input to the tokenization stage. */
        $data = str_replace(
            array(
                "\0",
                "\r\n",
                "\r"
            ),
            array(
                "\xEF\xBF\xBD",
                "\n",
                "\n"
            ),
            $data
        );

        /* Any occurrences of any characters in the ranges U+0001 to
        U+0008, U+000B,  U+000E to U+001F,  U+007F  to U+009F,
        U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
        characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
        U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
        U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
        U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
        U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
        U+10FFFF are parse errors. (These are all control characters
        or permanently undefined Unicode characters.) */
        // Check PCRE is loaded.
        if (extension_loaded('pcre')) {
            // The pattern works on raw UTF-8 bytes (no "u" modifier).
            // For U+nFFFE and U+nFFFF the second byte is 10nn1111, i.e. only
            // 8F, 9F, AF or BF; a wider range would flag valid characters
            // such as U+10FFE (F0 90 BF BE).
            $count = preg_match_all(
                '/(?:
                    [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B,  U+000E to U+001F and U+007F
                |
                    \xC2[\x80-\x9F] # U+0080 to U+009F
                |
                    \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFF
                |
                    \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
                |
                    \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
                |
                    [\xF0-\xF4][\x8F\x9F\xAF\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
                )/x',
                $data,
                $matches
            );
            $this->recordParseErrors('invalid-codepoint', $count);
        } else {
            // XXX: Need non-PCRE impl, probably using substr_count
        }

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Returns $data with byte sequences that are not valid UTF-8 removed
     * (iconv) or replaced by U+FFFD (mbstring).
     *
     * iconv is tried first. If it is not loaded, or if it reports failure by
     * returning false, mbstring is used instead. If iconv fails and mbstring
     * is not available, the raw bytes are kept so that the document is not
     * silently replaced by an empty string.
     *
     * @param $data Raw input bytes, assumed to be UTF-8
     * @return string
     * @throws Exception if neither iconv nor mbstring is loaded
     */
    private function scrubUtf8($data) {
        $haveIconv = extension_loaded('iconv');
        $haveMbstring = extension_loaded('mbstring');

        if (!$haveIconv && !$haveMbstring) {
            // we can make a conforming native implementation
            throw new Exception('Not implemented, please install mbstring or iconv');
        }

        if ($haveIconv) {
            // non-conforming: invalid bytes are dropped, not replaced
            $clean = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
            if ($clean !== false) {
                return $clean;
            }
        }

        if ($haveMbstring) {
            // Temporarily substitute U+FFFD for invalid bytes, then restore
            // the previous (process-wide) substitution setting.
            $previous = mb_substitute_character();
            mb_substitute_character(0xFFFD);
            $clean = mb_convert_encoding($data, 'UTF-8', 'UTF-8');
            mb_substitute_character($previous);
            return $clean;
        }

        return $data;
    }

    /**
     * Appends $count parse errors with the given error code to $errors.
     *
     * @param $code  Value stored under the 'data' key of each error
     * @param $count Number of errors to add; false (a failed match) adds none
     */
    private function recordParseErrors($code, $count) {
        for ($i = 0; $i < $count; $i++) {
            $this->errors[] = array(
                'type' => HTML5_Tokenizer::PARSEERROR,
                'data' => $code
            );
        }
    }

    /**
     * Returns the number of bytes consumed so far, clamped to the range
     * 0..$EOF ($char itself may run past $EOF after repeated char() calls).
     */
    private function consumedBytes() {
        return max(0, min($this->char, $this->EOF));
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return int 1-based line number: one plus the number of LF bytes
     *             already consumed
     */
    public function getCurrentLine() {
        $consumed = $this->consumedBytes();
        // Nothing consumed yet (or empty input): we are on the first line.
        // Returning early also avoids calling substr_count() with a zero
        // length, which is rejected before PHP 7.1.
        if ($consumed === 0) {
            return 1;
        }
        return substr_count($this->data, "\n", 0, $consumed) + 1;
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * @return int Number of characters consumed since the last LF (or since
     *             the start of the data if no LF has been consumed)
     */
    public function getColumnOffset() {
        $consumed = $this->consumedBytes();
        // At the very start there is nothing before us; bail out before
        // strrpos() would be handed an offset outside the string.
        if ($consumed === 0) {
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before the current position). A negative
        // offset of -k makes the backwards search start at byte (length - k),
        // inclusive, so this starts at the last consumed byte.
        $lastLine = strrpos($this->data, "\n", $consumed - 1 - $this->EOF);

        // Take the consumed bytes that follow the last newline, or all
        // consumed bytes if there was none.
        if ($lastLine !== false) {
            $findLengthOf = (string) substr($this->data, $lastLine + 1, $consumed - 1 - $lastLine);
        } else {
            $findLengthOf = (string) substr($this->data, 0, $consumed);
        }

        // Get the length (in characters, not bytes) for the string we need.
        if (extension_loaded('iconv')) {
            return iconv_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('mbstring')) {
            return mb_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('xml')) {
            return strlen(utf8_decode($findLengthOf));
        } else {
            // Count ASCII bytes plus UTF-8 lead bytes; continuation bytes
            // are skipped.
            $count = count_chars($findLengthOf);
            // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
            // 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range)
            return array_sum(array_slice($count, 0, 0x80)) +
                   array_sum(array_slice($count, 0xC2, 0x33));
        }
    }

    /**
     * Consumes and returns the next byte.
     * @note This performs bounds checking
     *
     * @return string|false The byte, or false at end-of-file. The position
     *                      is advanced even when false is returned.
     */
    public function char() {
        return ($this->char++ < $this->EOF)
            ? $this->data[$this->char - 1]
            : false;
    }

    /**
     * Get all characters until EOF.
     * @note This performs bounds checking
     *
     * @return string|false The remaining data, or false if already at EOF
     */
    public function remainingChars() {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;
            return $data;
        } else {
            return false;
        }
    }

    /**
     * Consumes $length bytes starting at the current position and returns
     * them as a string.
     */
    private function consume($length) {
        $string = (string) substr($this->data, $this->char, $length);
        $this->char += $length;
        return $string;
    }

    /**
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max   Optional maximum number of bytes to examine
     * @return string|false The matched (possibly empty) string, or false if
     *                      already at EOF
     */
    public function charsUntil($bytes, $max = null) {
        if ($this->char < $this->EOF) {
            // Only pass a length when one was really given; integer 0 is a
            // valid limit.
            if ($max === 0 || $max) {
                $len = strcspn($this->data, $bytes, $this->char, $max);
            } else {
                $len = strcspn($this->data, $bytes, $this->char);
            }
            return $this->consume($len);
        } else {
            return false;
        }
    }

    /**
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max   Optional maximum number of bytes to examine
     * @return string|false The matched (possibly empty) string, or false if
     *                      already at EOF
     */
    public function charsWhile($bytes, $max = null) {
        if ($this->char < $this->EOF) {
            // Only pass a length when one was really given; integer 0 is a
            // valid limit.
            if ($max === 0 || $max) {
                $len = strspn($this->data, $bytes, $this->char, $max);
            } else {
                $len = strspn($this->data, $bytes, $this->char);
            }
            return $this->consume($len);
        } else {
            return false;
        }
    }

    /**
     * Unconsume one character.
     *
     * Does nothing once the position has moved past $EOF (i.e. after char()
     * has returned false), so end-of-file stays end-of-file.
     */
    public function unget() {
        if ($this->char <= $this->EOF) {
            $this->char--;
        }
    }
}