]>
Commit | Line | Data |
---|---|---|
1 | <?php | |
2 | ||
3 | /* | |
4 | ||
5 | Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> | |
6 | ||
7 | Permission is hereby granted, free of charge, to any person obtaining a | |
8 | copy of this software and associated documentation files (the | |
9 | "Software"), to deal in the Software without restriction, including | |
10 | without limitation the rights to use, copy, modify, merge, publish, | |
11 | distribute, sublicense, and/or sell copies of the Software, and to | |
12 | permit persons to whom the Software is furnished to do so, subject to | |
13 | the following conditions: | |
14 | ||
15 | The above copyright notice and this permission notice shall be included | |
16 | in all copies or substantial portions of the Software. | |
17 | ||
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | |
19 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | |
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | |
23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | |
24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
25 | ||
26 | */ | |
27 | ||
28 | // Some conventions: | |
29 | // /* */ indicates verbatim text from the HTML 5 specification | |
30 | // // indicates regular comments | |
31 | ||
/**
 * Preprocessed, byte-addressed input stream feeding the HTML5 tokenizer.
 *
 * The constructor applies the input-stream preprocessing steps quoted from
 * the HTML 5 specification below (UTF-8 clean-up, BOM removal, NUL
 * replacement, newline normalisation, invalid code point detection). The
 * remaining methods move a byte cursor over the resulting string.
 */
class HTML5_InputStream {
    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data
     */
    private $char;

    /**
     * Length of $data; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors. Each entry is an array with the keys 'type'
     * (HTML5_Tokenizer::PARSEERROR) and 'data' (a short error code).
     */
    public $errors = array();

    /**
     * @param $data Data to parse
     * @throws Exception if the iconv extension is not loaded
     */
    public function __construct($data) {

        /* Given an encoding, the bytes in the input stream must be
        converted to Unicode characters for the tokeniser, as
        described by the rules for that encoding, except that the
        leading U+FEFF BYTE ORDER MARK character, if any, must not
        be stripped by the encoding layer (it is stripped by the rule below).

        Bytes or sequences of bytes in the original byte stream that
        could not be converted to Unicode characters must be converted
        to U+FFFD REPLACEMENT CHARACTER code points. */

        // XXX currently assuming input data is UTF-8; once we
        // build encoding detection this will no longer be the case
        //
        // We previously had an mbstring implementation here, but that
        // implementation is heavily non-conforming, so it's been
        // omitted.
        if (extension_loaded('iconv')) {
            // non-conforming: invalid sequences are dropped instead of
            // being replaced by U+FFFD
            $converted = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
            if ($converted === false) {
                // Some iconv builds reject //IGNORE or still fail on
                // malformed input. Previously this silently turned the
                // whole document into false; as a last resort scrub the
                // input with mbstring when it is available, otherwise
                // make the "no usable data" outcome an explicit string.
                $converted = extension_loaded('mbstring')
                    ? $this->scrubWithMbstring($data)
                    : '';
            }
            $data = $converted;
        } else {
            // we can make a conforming native implementation
            // NOTE: only iconv satisfies the check above, whatever the
            // message suggests; the text is kept for compatibility.
            throw new Exception('Not implemented, please install mbstring or iconv');
        }

        /* One leading U+FEFF BYTE ORDER MARK character must be
        ignored if any are present. */
        if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
            $data = substr($data, 3);
        }

        /* All U+0000 NULL characters in the input must be replaced
        by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
        characters is a parse error. */
        $this->recordParseErrors('null-character', substr_count($data, "\0"));

        /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
        (LF) characters are treated specially. Any CR characters
        that are followed by LF characters must be removed, and any
        CR characters not followed by LF characters must be converted
        to LF characters. Thus, newlines in HTML DOMs are represented
        by LF characters, and there are never any CR characters in the
        input to the tokenization stage. */
        // Order matters: "\r\n" has to be replaced before the lone "\r".
        $data = str_replace(
            array(
                "\0",
                "\r\n",
                "\r"
            ),
            array(
                "\xEF\xBF\xBD",
                "\n",
                "\n"
            ),
            $data
        );

        /* Any occurrences of any characters in the ranges U+0001 to
        U+0008, U+000B,  U+000E to U+001F,  U+007F  to U+009F,
        U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
        characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
        U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
        U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
        U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
        U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
        U+10FFFF are parse errors. (These are all control characters
        or permanently undefined Unicode characters.) */
        // Check PCRE is loaded.
        if (extension_loaded('pcre')) {
            // The pattern works on raw bytes (no u modifier). In the last
            // alternative the second byte must be exactly 8F, 9F, AF or BF;
            // a range there would also flag valid code points such as
            // U+10FFE (F0 90 BF BE).
            $count = preg_match_all(
                '/(?:
                    [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
                |
                    \xC2[\x80-\x9F] # U+0080 to U+009F
                |
                    \xED[\xA0-\xBF][\x80-\xBF] # U+D800 to U+DFFF
                |
                    \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
                |
                    \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
                |
                    [\xF0-\xF4][\x8F\x9F\xAF\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
                )/x',
                $data,
                $matches
            );
            // preg_match_all() may return false on failure; the helper's
            // loop then simply records nothing.
            $this->recordParseErrors('invalid-codepoint', $count);
        } else {
            // XXX: Need non-PCRE impl, probably using substr_count
        }

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Removes invalid UTF-8 sequences from $data using mbstring.
     *
     * The global substitute character is switched to 'none' for the
     * conversion and restored afterwards, so the caller's mbstring
     * configuration is left as it was found.
     *
     * @param $data Possibly malformed UTF-8 bytes.
     * @return The converted string as returned by mb_convert_encoding().
     */
    private function scrubWithMbstring($data) {
        $previous = mb_substitute_character();
        mb_substitute_character('none');
        $clean = mb_convert_encoding($data, 'UTF-8', 'UTF-8');
        mb_substitute_character($previous);
        return $clean;
    }

    /**
     * Appends $count parse errors with the given error code to $errors.
     *
     * @param $code  Value stored under the 'data' key of each entry.
     * @param $count Number of entries to add; false or 0 adds nothing.
     */
    private function recordParseErrors($code, $count) {
        for ($i = 0; $i < $count; $i++) {
            $this->errors[] = array(
                'type' => HTML5_Tokenizer::PARSEERROR,
                'data' => $code
            );
        }
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return 1-based line number: one more than the number of LF bytes
     *         consumed so far.
     */
    public function getCurrentLine() {
        // The cursor may sit past EOF after reading the end of input, so
        // clamp it into the string before counting.
        $consumed = max(0, min($this->char, $this->EOF));
        if ($consumed === 0) {
            // Nothing consumed yet (or empty input): we are on the first
            // line. Returning early also avoids a zero-length
            // substr_count(), which older PHP versions warn about.
            return 1;
        }
        return substr_count($this->data, "\n", 0, $consumed) + 1;
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * @return Number of characters (not bytes) consumed since the last LF,
     *         or since the start of the data if no LF has been consumed.
     */
    public function getColumnOffset() {
        // Clamp the cursor: it runs past EOF once the end has been read.
        $pos = min($this->char, $this->EOF);
        if ($pos <= 0) {
            // Nothing consumed yet. Without this guard the strrpos()
            // offset below would fall outside the string, which raises a
            // ValueError on PHP 8 and a warning on earlier versions.
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before $pos). A negative offset makes
        // strrpos search backwards from that many bytes before the end, and
        // its behaviour includes the byte at the offset itself, hence the
        // extra - 1 to start at the last consumed byte.
        $lastLine = strrpos($this->data, "\n", $pos - 1 - $this->EOF);

        // Take the consumed part of the current line.
        if ($lastLine !== false) {
            $findLengthOf = (string) substr($this->data, $lastLine + 1, $pos - 1 - $lastLine);
        } else {
            $findLengthOf = (string) substr($this->data, 0, $pos);
        }

        // Get the length for the string we need.
        if (extension_loaded('iconv')) {
            return iconv_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('mbstring')) {
            return mb_strlen($findLengthOf, 'utf-8');
        } else {
            // Count the bytes that start a character: ASCII bytes and
            // multi-byte lead bytes. Continuation bytes are skipped.
            $count = count_chars($findLengthOf);
            // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
            // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
            return array_sum(array_slice($count, 0, 0x80)) +
                   array_sum(array_slice($count, 0xC2, 0x33));
        }
    }

    /**
     * Consumes and returns the next byte.
     * @note This performs bounds checking
     * @return The byte at the cursor, or false at end-of-file. The cursor
     *         is advanced in both cases, so it can move past $EOF.
     */
    public function char() {
        return ($this->char++ < $this->EOF)
            ? $this->data[$this->char - 1]
            : false;
    }

    /**
     * Get all characters until EOF.
     * @note This performs bounds checking
     * @return The unconsumed rest of the data (the cursor is moved to
     *         EOF), or false if nothing is left.
     */
    public function remainingChars() {
        if ($this->char >= $this->EOF) {
            return false;
        }
        $data = substr($this->data, $this->char);
        $this->char = $this->EOF;
        return $data;
    }

    /**
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring.
     * @param $bytes Bytes to match.
     * @param $max   Optional maximum number of bytes to consume.
     * @return The matched (possibly empty) string, or false at end-of-file.
     */
    public function charsUntil($bytes, $max = null) {
        if ($this->char >= $this->EOF) {
            return false;
        }
        // Only pass a length when one was given (an explicit 0 counts).
        if ($max === 0 || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }
        return $this->consume($len);
    }

    /**
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring.
     * @param $bytes Bytes to match.
     * @param $max   Optional maximum number of bytes to consume.
     * @return The matched (possibly empty) string, or false at end-of-file.
     */
    public function charsWhile($bytes, $max = null) {
        if ($this->char >= $this->EOF) {
            return false;
        }
        // Only pass a length when one was given (an explicit 0 counts).
        if ($max === 0 || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }
        return $this->consume($len);
    }

    /**
     * Returns the next $len bytes and advances the cursor past them.
     *
     * @param $len Number of bytes to consume.
     * @return The consumed bytes as a string (empty when $len is 0).
     */
    private function consume($len) {
        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;
        return $string;
    }

    /**
     * Unconsume one character.
     *
     * Does nothing when the cursor is past EOF (the end of input stays
     * consumed) or already at the start of the data, so the cursor never
     * becomes negative.
     */
    public function unget() {
        if ($this->char > 0 && $this->char <= $this->EOF) {
            $this->char--;
        }
    }
}