]>
Commit | Line | Data |
---|---|---|
c680a8e1 RS |
1 | // Copyright 2010 The Go Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style | |
3 | // license that can be found in the LICENSE file. | |
4 | ||
5 | package html | |
6 | ||
7 | import ( | |
8 | "errors" | |
9 | "fmt" | |
10 | "io" | |
11 | "strings" | |
12 | ||
13 | a "golang.org/x/net/html/atom" | |
14 | ) | |
15 | ||
16 | // A parser implements the HTML5 parsing algorithm: | |
17 | // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction | |
18 | type parser struct { | |
19 | // tokenizer provides the tokens for the parser. | |
20 | tokenizer *Tokenizer | |
21 | // tok is the most recently read token. | |
22 | tok Token | |
23 | // Self-closing tags like <hr/> are treated as start tags, except that | |
24 | // hasSelfClosingToken is set while they are being processed. | |
25 | hasSelfClosingToken bool | |
26 | // doc is the document root element. | |
27 | doc *Node | |
28 | // The stack of open elements (section 12.2.3.2) and active formatting | |
29 | // elements (section 12.2.3.3). | |
30 | oe, afe nodeStack | |
31 | // Element pointers (section 12.2.3.4). | |
32 | head, form *Node | |
33 | // Other parsing state flags (section 12.2.3.5). | |
34 | scripting, framesetOK bool | |
35 | // im is the current insertion mode. | |
36 | im insertionMode | |
37 | // originalIM is the insertion mode to go back to after completing a text | |
38 | // or inTableText insertion mode. | |
39 | originalIM insertionMode | |
40 | // fosterParenting is whether new elements should be inserted according to | |
41 | // the foster parenting rules (section 12.2.5.3). | |
42 | fosterParenting bool | |
43 | // quirks is whether the parser is operating in "quirks mode." | |
44 | quirks bool | |
45 | // fragment is whether the parser is parsing an HTML fragment. | |
46 | fragment bool | |
47 | // context is the context element when parsing an HTML fragment | |
48 | // (section 12.4). | |
49 | context *Node | |
50 | } | |
51 | ||
52 | func (p *parser) top() *Node { | |
53 | if n := p.oe.top(); n != nil { | |
54 | return n | |
55 | } | |
56 | return p.doc | |
57 | } | |
58 | ||
59 | // Stop tags for use in popUntil. These come from section 12.2.3.2. | |
60 | var ( | |
61 | defaultScopeStopTags = map[string][]a.Atom{ | |
62 | "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, | |
63 | "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, | |
64 | "svg": {a.Desc, a.ForeignObject, a.Title}, | |
65 | } | |
66 | ) | |
67 | ||
// scope selects which kind of boundary applies when the stack of open
// elements is searched or cleared.
type scope int

// The scope values, numbered consecutively from zero.
const (
	defaultScope = scope(iota)
	listItemScope
	buttonScope
	tableScope
	tableRowScope
	tableBodyScope
	selectScope
)
79 | ||
80 | // popUntil pops the stack of open elements at the highest element whose tag | |
81 | // is in matchTags, provided there is no higher element in the scope's stop | |
82 | // tags (as defined in section 12.2.3.2). It returns whether or not there was | |
83 | // such an element. If there was not, popUntil leaves the stack unchanged. | |
84 | // | |
85 | // For example, the set of stop tags for table scope is: "html", "table". If | |
86 | // the stack was: | |
87 | // ["html", "body", "font", "table", "b", "i", "u"] | |
88 | // then popUntil(tableScope, "font") would return false, but | |
89 | // popUntil(tableScope, "i") would return true and the stack would become: | |
90 | // ["html", "body", "font", "table", "b"] | |
91 | // | |
92 | // If an element's tag is in both the stop tags and matchTags, then the stack | |
93 | // will be popped and the function returns true (provided, of course, there was | |
94 | // no higher element in the stack that was also in the stop tags). For example, | |
95 | // popUntil(tableScope, "table") returns true and leaves: | |
96 | // ["html", "body", "font"] | |
97 | func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { | |
98 | if i := p.indexOfElementInScope(s, matchTags...); i != -1 { | |
99 | p.oe = p.oe[:i] | |
100 | return true | |
101 | } | |
102 | return false | |
103 | } | |
104 | ||
105 | // indexOfElementInScope returns the index in p.oe of the highest element whose | |
106 | // tag is in matchTags that is in scope. If no matching element is in scope, it | |
107 | // returns -1. | |
108 | func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { | |
109 | for i := len(p.oe) - 1; i >= 0; i-- { | |
110 | tagAtom := p.oe[i].DataAtom | |
111 | if p.oe[i].Namespace == "" { | |
112 | for _, t := range matchTags { | |
113 | if t == tagAtom { | |
114 | return i | |
115 | } | |
116 | } | |
117 | switch s { | |
118 | case defaultScope: | |
119 | // No-op. | |
120 | case listItemScope: | |
121 | if tagAtom == a.Ol || tagAtom == a.Ul { | |
122 | return -1 | |
123 | } | |
124 | case buttonScope: | |
125 | if tagAtom == a.Button { | |
126 | return -1 | |
127 | } | |
128 | case tableScope: | |
129 | if tagAtom == a.Html || tagAtom == a.Table { | |
130 | return -1 | |
131 | } | |
132 | case selectScope: | |
133 | if tagAtom != a.Optgroup && tagAtom != a.Option { | |
134 | return -1 | |
135 | } | |
136 | default: | |
137 | panic("unreachable") | |
138 | } | |
139 | } | |
140 | switch s { | |
141 | case defaultScope, listItemScope, buttonScope: | |
142 | for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { | |
143 | if t == tagAtom { | |
144 | return -1 | |
145 | } | |
146 | } | |
147 | } | |
148 | } | |
149 | return -1 | |
150 | } | |
151 | ||
152 | // elementInScope is like popUntil, except that it doesn't modify the stack of | |
153 | // open elements. | |
154 | func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { | |
155 | return p.indexOfElementInScope(s, matchTags...) != -1 | |
156 | } | |
157 | ||
158 | // clearStackToContext pops elements off the stack of open elements until a | |
159 | // scope-defined element is found. | |
160 | func (p *parser) clearStackToContext(s scope) { | |
161 | for i := len(p.oe) - 1; i >= 0; i-- { | |
162 | tagAtom := p.oe[i].DataAtom | |
163 | switch s { | |
164 | case tableScope: | |
165 | if tagAtom == a.Html || tagAtom == a.Table { | |
166 | p.oe = p.oe[:i+1] | |
167 | return | |
168 | } | |
169 | case tableRowScope: | |
170 | if tagAtom == a.Html || tagAtom == a.Tr { | |
171 | p.oe = p.oe[:i+1] | |
172 | return | |
173 | } | |
174 | case tableBodyScope: | |
175 | if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead { | |
176 | p.oe = p.oe[:i+1] | |
177 | return | |
178 | } | |
179 | default: | |
180 | panic("unreachable") | |
181 | } | |
182 | } | |
183 | } | |
184 | ||
185 | // generateImpliedEndTags pops nodes off the stack of open elements as long as | |
186 | // the top node has a tag name of dd, dt, li, option, optgroup, p, rp, or rt. | |
187 | // If exceptions are specified, nodes with that name will not be popped off. | |
188 | func (p *parser) generateImpliedEndTags(exceptions ...string) { | |
189 | var i int | |
190 | loop: | |
191 | for i = len(p.oe) - 1; i >= 0; i-- { | |
192 | n := p.oe[i] | |
193 | if n.Type == ElementNode { | |
194 | switch n.DataAtom { | |
195 | case a.Dd, a.Dt, a.Li, a.Option, a.Optgroup, a.P, a.Rp, a.Rt: | |
196 | for _, except := range exceptions { | |
197 | if n.Data == except { | |
198 | break loop | |
199 | } | |
200 | } | |
201 | continue | |
202 | } | |
203 | } | |
204 | break | |
205 | } | |
206 | ||
207 | p.oe = p.oe[:i+1] | |
208 | } | |
209 | ||
210 | // addChild adds a child node n to the top element, and pushes n onto the stack | |
211 | // of open elements if it is an element node. | |
212 | func (p *parser) addChild(n *Node) { | |
213 | if p.shouldFosterParent() { | |
214 | p.fosterParent(n) | |
215 | } else { | |
216 | p.top().AppendChild(n) | |
217 | } | |
218 | ||
219 | if n.Type == ElementNode { | |
220 | p.oe = append(p.oe, n) | |
221 | } | |
222 | } | |
223 | ||
224 | // shouldFosterParent returns whether the next node to be added should be | |
225 | // foster parented. | |
226 | func (p *parser) shouldFosterParent() bool { | |
227 | if p.fosterParenting { | |
228 | switch p.top().DataAtom { | |
229 | case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: | |
230 | return true | |
231 | } | |
232 | } | |
233 | return false | |
234 | } | |
235 | ||
236 | // fosterParent adds a child node according to the foster parenting rules. | |
237 | // Section 12.2.5.3, "foster parenting". | |
238 | func (p *parser) fosterParent(n *Node) { | |
239 | var table, parent, prev *Node | |
240 | var i int | |
241 | for i = len(p.oe) - 1; i >= 0; i-- { | |
242 | if p.oe[i].DataAtom == a.Table { | |
243 | table = p.oe[i] | |
244 | break | |
245 | } | |
246 | } | |
247 | ||
248 | if table == nil { | |
249 | // The foster parent is the html element. | |
250 | parent = p.oe[0] | |
251 | } else { | |
252 | parent = table.Parent | |
253 | } | |
254 | if parent == nil { | |
255 | parent = p.oe[i-1] | |
256 | } | |
257 | ||
258 | if table != nil { | |
259 | prev = table.PrevSibling | |
260 | } else { | |
261 | prev = parent.LastChild | |
262 | } | |
263 | if prev != nil && prev.Type == TextNode && n.Type == TextNode { | |
264 | prev.Data += n.Data | |
265 | return | |
266 | } | |
267 | ||
268 | parent.InsertBefore(n, table) | |
269 | } | |
270 | ||
271 | // addText adds text to the preceding node if it is a text node, or else it | |
272 | // calls addChild with a new text node. | |
273 | func (p *parser) addText(text string) { | |
274 | if text == "" { | |
275 | return | |
276 | } | |
277 | ||
278 | if p.shouldFosterParent() { | |
279 | p.fosterParent(&Node{ | |
280 | Type: TextNode, | |
281 | Data: text, | |
282 | }) | |
283 | return | |
284 | } | |
285 | ||
286 | t := p.top() | |
287 | if n := t.LastChild; n != nil && n.Type == TextNode { | |
288 | n.Data += text | |
289 | return | |
290 | } | |
291 | p.addChild(&Node{ | |
292 | Type: TextNode, | |
293 | Data: text, | |
294 | }) | |
295 | } | |
296 | ||
297 | // addElement adds a child element based on the current token. | |
298 | func (p *parser) addElement() { | |
299 | p.addChild(&Node{ | |
300 | Type: ElementNode, | |
301 | DataAtom: p.tok.DataAtom, | |
302 | Data: p.tok.Data, | |
303 | Attr: p.tok.Attr, | |
304 | }) | |
305 | } | |
306 | ||
307 | // Section 12.2.3.3. | |
308 | func (p *parser) addFormattingElement() { | |
309 | tagAtom, attr := p.tok.DataAtom, p.tok.Attr | |
310 | p.addElement() | |
311 | ||
312 | // Implement the Noah's Ark clause, but with three per family instead of two. | |
313 | identicalElements := 0 | |
314 | findIdenticalElements: | |
315 | for i := len(p.afe) - 1; i >= 0; i-- { | |
316 | n := p.afe[i] | |
317 | if n.Type == scopeMarkerNode { | |
318 | break | |
319 | } | |
320 | if n.Type != ElementNode { | |
321 | continue | |
322 | } | |
323 | if n.Namespace != "" { | |
324 | continue | |
325 | } | |
326 | if n.DataAtom != tagAtom { | |
327 | continue | |
328 | } | |
329 | if len(n.Attr) != len(attr) { | |
330 | continue | |
331 | } | |
332 | compareAttributes: | |
333 | for _, t0 := range n.Attr { | |
334 | for _, t1 := range attr { | |
335 | if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { | |
336 | // Found a match for this attribute, continue with the next attribute. | |
337 | continue compareAttributes | |
338 | } | |
339 | } | |
340 | // If we get here, there is no attribute that matches a. | |
341 | // Therefore the element is not identical to the new one. | |
342 | continue findIdenticalElements | |
343 | } | |
344 | ||
345 | identicalElements++ | |
346 | if identicalElements >= 3 { | |
347 | p.afe.remove(n) | |
348 | } | |
349 | } | |
350 | ||
351 | p.afe = append(p.afe, p.top()) | |
352 | } | |
353 | ||
354 | // Section 12.2.3.3. | |
355 | func (p *parser) clearActiveFormattingElements() { | |
356 | for { | |
357 | n := p.afe.pop() | |
358 | if len(p.afe) == 0 || n.Type == scopeMarkerNode { | |
359 | return | |
360 | } | |
361 | } | |
362 | } | |
363 | ||
364 | // Section 12.2.3.3. | |
365 | func (p *parser) reconstructActiveFormattingElements() { | |
366 | n := p.afe.top() | |
367 | if n == nil { | |
368 | return | |
369 | } | |
370 | if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { | |
371 | return | |
372 | } | |
373 | i := len(p.afe) - 1 | |
374 | for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { | |
375 | if i == 0 { | |
376 | i = -1 | |
377 | break | |
378 | } | |
379 | i-- | |
380 | n = p.afe[i] | |
381 | } | |
382 | for { | |
383 | i++ | |
384 | clone := p.afe[i].clone() | |
385 | p.addChild(clone) | |
386 | p.afe[i] = clone | |
387 | if i == len(p.afe)-1 { | |
388 | break | |
389 | } | |
390 | } | |
391 | } | |
392 | ||
393 | // Section 12.2.4. | |
394 | func (p *parser) acknowledgeSelfClosingTag() { | |
395 | p.hasSelfClosingToken = false | |
396 | } | |
397 | ||
398 | // An insertion mode (section 12.2.3.1) is the state transition function from | |
399 | // a particular state in the HTML5 parser's state machine. It updates the | |
400 | // parser's fields depending on parser.tok (where ErrorToken means EOF). | |
401 | // It returns whether the token was consumed. | |
402 | type insertionMode func(*parser) bool | |
403 | ||
404 | // setOriginalIM sets the insertion mode to return to after completing a text or | |
405 | // inTableText insertion mode. | |
406 | // Section 12.2.3.1, "using the rules for". | |
407 | func (p *parser) setOriginalIM() { | |
408 | if p.originalIM != nil { | |
409 | panic("html: bad parser state: originalIM was set twice") | |
410 | } | |
411 | p.originalIM = p.im | |
412 | } | |
413 | ||
414 | // Section 12.2.3.1, "reset the insertion mode". | |
415 | func (p *parser) resetInsertionMode() { | |
416 | for i := len(p.oe) - 1; i >= 0; i-- { | |
417 | n := p.oe[i] | |
418 | if i == 0 && p.context != nil { | |
419 | n = p.context | |
420 | } | |
421 | ||
422 | switch n.DataAtom { | |
423 | case a.Select: | |
424 | p.im = inSelectIM | |
425 | case a.Td, a.Th: | |
426 | p.im = inCellIM | |
427 | case a.Tr: | |
428 | p.im = inRowIM | |
429 | case a.Tbody, a.Thead, a.Tfoot: | |
430 | p.im = inTableBodyIM | |
431 | case a.Caption: | |
432 | p.im = inCaptionIM | |
433 | case a.Colgroup: | |
434 | p.im = inColumnGroupIM | |
435 | case a.Table: | |
436 | p.im = inTableIM | |
437 | case a.Head: | |
438 | p.im = inBodyIM | |
439 | case a.Body: | |
440 | p.im = inBodyIM | |
441 | case a.Frameset: | |
442 | p.im = inFramesetIM | |
443 | case a.Html: | |
444 | p.im = beforeHeadIM | |
445 | default: | |
446 | continue | |
447 | } | |
448 | return | |
449 | } | |
450 | p.im = inBodyIM | |
451 | } | |
452 | ||
// whitespace is the cutset of characters (space, tab, carriage return, line
// feed, form feed) that the insertion modes trim from the start of text
// tokens.
const whitespace = " \t\r\n\f"
454 | ||
455 | // Section 12.2.5.4.1. | |
456 | func initialIM(p *parser) bool { | |
457 | switch p.tok.Type { | |
458 | case TextToken: | |
459 | p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) | |
460 | if len(p.tok.Data) == 0 { | |
461 | // It was all whitespace, so ignore it. | |
462 | return true | |
463 | } | |
464 | case CommentToken: | |
465 | p.doc.AppendChild(&Node{ | |
466 | Type: CommentNode, | |
467 | Data: p.tok.Data, | |
468 | }) | |
469 | return true | |
470 | case DoctypeToken: | |
471 | n, quirks := parseDoctype(p.tok.Data) | |
472 | p.doc.AppendChild(n) | |
473 | p.quirks = quirks | |
474 | p.im = beforeHTMLIM | |
475 | return true | |
476 | } | |
477 | p.quirks = true | |
478 | p.im = beforeHTMLIM | |
479 | return false | |
480 | } | |
481 | ||
482 | // Section 12.2.5.4.2. | |
483 | func beforeHTMLIM(p *parser) bool { | |
484 | switch p.tok.Type { | |
485 | case DoctypeToken: | |
486 | // Ignore the token. | |
487 | return true | |
488 | case TextToken: | |
489 | p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) | |
490 | if len(p.tok.Data) == 0 { | |
491 | // It was all whitespace, so ignore it. | |
492 | return true | |
493 | } | |
494 | case StartTagToken: | |
495 | if p.tok.DataAtom == a.Html { | |
496 | p.addElement() | |
497 | p.im = beforeHeadIM | |
498 | return true | |
499 | } | |
500 | case EndTagToken: | |
501 | switch p.tok.DataAtom { | |
502 | case a.Head, a.Body, a.Html, a.Br: | |
503 | p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) | |
504 | return false | |
505 | default: | |
506 | // Ignore the token. | |
507 | return true | |
508 | } | |
509 | case CommentToken: | |
510 | p.doc.AppendChild(&Node{ | |
511 | Type: CommentNode, | |
512 | Data: p.tok.Data, | |
513 | }) | |
514 | return true | |
515 | } | |
516 | p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) | |
517 | return false | |
518 | } | |
519 | ||
520 | // Section 12.2.5.4.3. | |
521 | func beforeHeadIM(p *parser) bool { | |
522 | switch p.tok.Type { | |
523 | case TextToken: | |
524 | p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) | |
525 | if len(p.tok.Data) == 0 { | |
526 | // It was all whitespace, so ignore it. | |
527 | return true | |
528 | } | |
529 | case StartTagToken: | |
530 | switch p.tok.DataAtom { | |
531 | case a.Head: | |
532 | p.addElement() | |
533 | p.head = p.top() | |
534 | p.im = inHeadIM | |
535 | return true | |
536 | case a.Html: | |
537 | return inBodyIM(p) | |
538 | } | |
539 | case EndTagToken: | |
540 | switch p.tok.DataAtom { | |
541 | case a.Head, a.Body, a.Html, a.Br: | |
542 | p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) | |
543 | return false | |
544 | default: | |
545 | // Ignore the token. | |
546 | return true | |
547 | } | |
548 | case CommentToken: | |
549 | p.addChild(&Node{ | |
550 | Type: CommentNode, | |
551 | Data: p.tok.Data, | |
552 | }) | |
553 | return true | |
554 | case DoctypeToken: | |
555 | // Ignore the token. | |
556 | return true | |
557 | } | |
558 | ||
559 | p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) | |
560 | return false | |
561 | } | |
562 | ||
563 | // Section 12.2.5.4.4. | |
564 | func inHeadIM(p *parser) bool { | |
565 | switch p.tok.Type { | |
566 | case TextToken: | |
567 | s := strings.TrimLeft(p.tok.Data, whitespace) | |
568 | if len(s) < len(p.tok.Data) { | |
569 | // Add the initial whitespace to the current node. | |
570 | p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) | |
571 | if s == "" { | |
572 | return true | |
573 | } | |
574 | p.tok.Data = s | |
575 | } | |
576 | case StartTagToken: | |
577 | switch p.tok.DataAtom { | |
578 | case a.Html: | |
579 | return inBodyIM(p) | |
580 | case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta: | |
581 | p.addElement() | |
582 | p.oe.pop() | |
583 | p.acknowledgeSelfClosingTag() | |
584 | return true | |
585 | case a.Script, a.Title, a.Noscript, a.Noframes, a.Style: | |
586 | p.addElement() | |
587 | p.setOriginalIM() | |
588 | p.im = textIM | |
589 | return true | |
590 | case a.Head: | |
591 | // Ignore the token. | |
592 | return true | |
593 | } | |
594 | case EndTagToken: | |
595 | switch p.tok.DataAtom { | |
596 | case a.Head: | |
597 | n := p.oe.pop() | |
598 | if n.DataAtom != a.Head { | |
599 | panic("html: bad parser state: <head> element not found, in the in-head insertion mode") | |
600 | } | |
601 | p.im = afterHeadIM | |
602 | return true | |
603 | case a.Body, a.Html, a.Br: | |
604 | p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) | |
605 | return false | |
606 | default: | |
607 | // Ignore the token. | |
608 | return true | |
609 | } | |
610 | case CommentToken: | |
611 | p.addChild(&Node{ | |
612 | Type: CommentNode, | |
613 | Data: p.tok.Data, | |
614 | }) | |
615 | return true | |
616 | case DoctypeToken: | |
617 | // Ignore the token. | |
618 | return true | |
619 | } | |
620 | ||
621 | p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) | |
622 | return false | |
623 | } | |
624 | ||
625 | // Section 12.2.5.4.6. | |
626 | func afterHeadIM(p *parser) bool { | |
627 | switch p.tok.Type { | |
628 | case TextToken: | |
629 | s := strings.TrimLeft(p.tok.Data, whitespace) | |
630 | if len(s) < len(p.tok.Data) { | |
631 | // Add the initial whitespace to the current node. | |
632 | p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) | |
633 | if s == "" { | |
634 | return true | |
635 | } | |
636 | p.tok.Data = s | |
637 | } | |
638 | case StartTagToken: | |
639 | switch p.tok.DataAtom { | |
640 | case a.Html: | |
641 | return inBodyIM(p) | |
642 | case a.Body: | |
643 | p.addElement() | |
644 | p.framesetOK = false | |
645 | p.im = inBodyIM | |
646 | return true | |
647 | case a.Frameset: | |
648 | p.addElement() | |
649 | p.im = inFramesetIM | |
650 | return true | |
651 | case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title: | |
652 | p.oe = append(p.oe, p.head) | |
653 | defer p.oe.remove(p.head) | |
654 | return inHeadIM(p) | |
655 | case a.Head: | |
656 | // Ignore the token. | |
657 | return true | |
658 | } | |
659 | case EndTagToken: | |
660 | switch p.tok.DataAtom { | |
661 | case a.Body, a.Html, a.Br: | |
662 | // Drop down to creating an implied <body> tag. | |
663 | default: | |
664 | // Ignore the token. | |
665 | return true | |
666 | } | |
667 | case CommentToken: | |
668 | p.addChild(&Node{ | |
669 | Type: CommentNode, | |
670 | Data: p.tok.Data, | |
671 | }) | |
672 | return true | |
673 | case DoctypeToken: | |
674 | // Ignore the token. | |
675 | return true | |
676 | } | |
677 | ||
678 | p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) | |
679 | p.framesetOK = true | |
680 | return false | |
681 | } | |
682 | ||
683 | // copyAttributes copies attributes of src not found on dst to dst. | |
684 | func copyAttributes(dst *Node, src Token) { | |
685 | if len(src.Attr) == 0 { | |
686 | return | |
687 | } | |
688 | attr := map[string]string{} | |
689 | for _, t := range dst.Attr { | |
690 | attr[t.Key] = t.Val | |
691 | } | |
692 | for _, t := range src.Attr { | |
693 | if _, ok := attr[t.Key]; !ok { | |
694 | dst.Attr = append(dst.Attr, t) | |
695 | attr[t.Key] = t.Val | |
696 | } | |
697 | } | |
698 | } | |
699 | ||
700 | // Section 12.2.5.4.7. | |
701 | func inBodyIM(p *parser) bool { | |
702 | switch p.tok.Type { | |
703 | case TextToken: | |
704 | d := p.tok.Data | |
705 | switch n := p.oe.top(); n.DataAtom { | |
706 | case a.Pre, a.Listing: | |
707 | if n.FirstChild == nil { | |
708 | // Ignore a newline at the start of a <pre> block. | |
709 | if d != "" && d[0] == '\r' { | |
710 | d = d[1:] | |
711 | } | |
712 | if d != "" && d[0] == '\n' { | |
713 | d = d[1:] | |
714 | } | |
715 | } | |
716 | } | |
717 | d = strings.Replace(d, "\x00", "", -1) | |
718 | if d == "" { | |
719 | return true | |
720 | } | |
721 | p.reconstructActiveFormattingElements() | |
722 | p.addText(d) | |
723 | if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { | |
724 | // There were non-whitespace characters inserted. | |
725 | p.framesetOK = false | |
726 | } | |
727 | case StartTagToken: | |
728 | switch p.tok.DataAtom { | |
729 | case a.Html: | |
730 | copyAttributes(p.oe[0], p.tok) | |
731 | case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title: | |
732 | return inHeadIM(p) | |
733 | case a.Body: | |
734 | if len(p.oe) >= 2 { | |
735 | body := p.oe[1] | |
736 | if body.Type == ElementNode && body.DataAtom == a.Body { | |
737 | p.framesetOK = false | |
738 | copyAttributes(body, p.tok) | |
739 | } | |
740 | } | |
741 | case a.Frameset: | |
742 | if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { | |
743 | // Ignore the token. | |
744 | return true | |
745 | } | |
746 | body := p.oe[1] | |
747 | if body.Parent != nil { | |
748 | body.Parent.RemoveChild(body) | |
749 | } | |
750 | p.oe = p.oe[:1] | |
751 | p.addElement() | |
752 | p.im = inFramesetIM | |
753 | return true | |
754 | case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: | |
755 | p.popUntil(buttonScope, a.P) | |
756 | p.addElement() | |
757 | case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: | |
758 | p.popUntil(buttonScope, a.P) | |
759 | switch n := p.top(); n.DataAtom { | |
760 | case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: | |
761 | p.oe.pop() | |
762 | } | |
763 | p.addElement() | |
764 | case a.Pre, a.Listing: | |
765 | p.popUntil(buttonScope, a.P) | |
766 | p.addElement() | |
767 | // The newline, if any, will be dealt with by the TextToken case. | |
768 | p.framesetOK = false | |
769 | case a.Form: | |
770 | if p.form == nil { | |
771 | p.popUntil(buttonScope, a.P) | |
772 | p.addElement() | |
773 | p.form = p.top() | |
774 | } | |
775 | case a.Li: | |
776 | p.framesetOK = false | |
777 | for i := len(p.oe) - 1; i >= 0; i-- { | |
778 | node := p.oe[i] | |
779 | switch node.DataAtom { | |
780 | case a.Li: | |
781 | p.oe = p.oe[:i] | |
782 | case a.Address, a.Div, a.P: | |
783 | continue | |
784 | default: | |
785 | if !isSpecialElement(node) { | |
786 | continue | |
787 | } | |
788 | } | |
789 | break | |
790 | } | |
791 | p.popUntil(buttonScope, a.P) | |
792 | p.addElement() | |
793 | case a.Dd, a.Dt: | |
794 | p.framesetOK = false | |
795 | for i := len(p.oe) - 1; i >= 0; i-- { | |
796 | node := p.oe[i] | |
797 | switch node.DataAtom { | |
798 | case a.Dd, a.Dt: | |
799 | p.oe = p.oe[:i] | |
800 | case a.Address, a.Div, a.P: | |
801 | continue | |
802 | default: | |
803 | if !isSpecialElement(node) { | |
804 | continue | |
805 | } | |
806 | } | |
807 | break | |
808 | } | |
809 | p.popUntil(buttonScope, a.P) | |
810 | p.addElement() | |
811 | case a.Plaintext: | |
812 | p.popUntil(buttonScope, a.P) | |
813 | p.addElement() | |
814 | case a.Button: | |
815 | p.popUntil(defaultScope, a.Button) | |
816 | p.reconstructActiveFormattingElements() | |
817 | p.addElement() | |
818 | p.framesetOK = false | |
819 | case a.A: | |
820 | for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { | |
821 | if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { | |
822 | p.inBodyEndTagFormatting(a.A) | |
823 | p.oe.remove(n) | |
824 | p.afe.remove(n) | |
825 | break | |
826 | } | |
827 | } | |
828 | p.reconstructActiveFormattingElements() | |
829 | p.addFormattingElement() | |
830 | case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: | |
831 | p.reconstructActiveFormattingElements() | |
832 | p.addFormattingElement() | |
833 | case a.Nobr: | |
834 | p.reconstructActiveFormattingElements() | |
835 | if p.elementInScope(defaultScope, a.Nobr) { | |
836 | p.inBodyEndTagFormatting(a.Nobr) | |
837 | p.reconstructActiveFormattingElements() | |
838 | } | |
839 | p.addFormattingElement() | |
840 | case a.Applet, a.Marquee, a.Object: | |
841 | p.reconstructActiveFormattingElements() | |
842 | p.addElement() | |
843 | p.afe = append(p.afe, &scopeMarker) | |
844 | p.framesetOK = false | |
845 | case a.Table: | |
846 | if !p.quirks { | |
847 | p.popUntil(buttonScope, a.P) | |
848 | } | |
849 | p.addElement() | |
850 | p.framesetOK = false | |
851 | p.im = inTableIM | |
852 | return true | |
853 | case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: | |
854 | p.reconstructActiveFormattingElements() | |
855 | p.addElement() | |
856 | p.oe.pop() | |
857 | p.acknowledgeSelfClosingTag() | |
858 | if p.tok.DataAtom == a.Input { | |
859 | for _, t := range p.tok.Attr { | |
860 | if t.Key == "type" { | |
861 | if strings.ToLower(t.Val) == "hidden" { | |
862 | // Skip setting framesetOK = false | |
863 | return true | |
864 | } | |
865 | } | |
866 | } | |
867 | } | |
868 | p.framesetOK = false | |
869 | case a.Param, a.Source, a.Track: | |
870 | p.addElement() | |
871 | p.oe.pop() | |
872 | p.acknowledgeSelfClosingTag() | |
873 | case a.Hr: | |
874 | p.popUntil(buttonScope, a.P) | |
875 | p.addElement() | |
876 | p.oe.pop() | |
877 | p.acknowledgeSelfClosingTag() | |
878 | p.framesetOK = false | |
879 | case a.Image: | |
880 | p.tok.DataAtom = a.Img | |
881 | p.tok.Data = a.Img.String() | |
882 | return false | |
883 | case a.Isindex: | |
884 | if p.form != nil { | |
885 | // Ignore the token. | |
886 | return true | |
887 | } | |
888 | action := "" | |
889 | prompt := "This is a searchable index. Enter search keywords: " | |
890 | attr := []Attribute{{Key: "name", Val: "isindex"}} | |
891 | for _, t := range p.tok.Attr { | |
892 | switch t.Key { | |
893 | case "action": | |
894 | action = t.Val | |
895 | case "name": | |
896 | // Ignore the attribute. | |
897 | case "prompt": | |
898 | prompt = t.Val | |
899 | default: | |
900 | attr = append(attr, t) | |
901 | } | |
902 | } | |
903 | p.acknowledgeSelfClosingTag() | |
904 | p.popUntil(buttonScope, a.P) | |
905 | p.parseImpliedToken(StartTagToken, a.Form, a.Form.String()) | |
906 | if action != "" { | |
907 | p.form.Attr = []Attribute{{Key: "action", Val: action}} | |
908 | } | |
909 | p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) | |
910 | p.parseImpliedToken(StartTagToken, a.Label, a.Label.String()) | |
911 | p.addText(prompt) | |
912 | p.addChild(&Node{ | |
913 | Type: ElementNode, | |
914 | DataAtom: a.Input, | |
915 | Data: a.Input.String(), | |
916 | Attr: attr, | |
917 | }) | |
918 | p.oe.pop() | |
919 | p.parseImpliedToken(EndTagToken, a.Label, a.Label.String()) | |
920 | p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) | |
921 | p.parseImpliedToken(EndTagToken, a.Form, a.Form.String()) | |
922 | case a.Textarea: | |
923 | p.addElement() | |
924 | p.setOriginalIM() | |
925 | p.framesetOK = false | |
926 | p.im = textIM | |
927 | case a.Xmp: | |
928 | p.popUntil(buttonScope, a.P) | |
929 | p.reconstructActiveFormattingElements() | |
930 | p.framesetOK = false | |
931 | p.addElement() | |
932 | p.setOriginalIM() | |
933 | p.im = textIM | |
934 | case a.Iframe: | |
935 | p.framesetOK = false | |
936 | p.addElement() | |
937 | p.setOriginalIM() | |
938 | p.im = textIM | |
939 | case a.Noembed, a.Noscript: | |
940 | p.addElement() | |
941 | p.setOriginalIM() | |
942 | p.im = textIM | |
943 | case a.Select: | |
944 | p.reconstructActiveFormattingElements() | |
945 | p.addElement() | |
946 | p.framesetOK = false | |
947 | p.im = inSelectIM | |
948 | return true | |
949 | case a.Optgroup, a.Option: | |
950 | if p.top().DataAtom == a.Option { | |
951 | p.oe.pop() | |
952 | } | |
953 | p.reconstructActiveFormattingElements() | |
954 | p.addElement() | |
955 | case a.Rp, a.Rt: | |
956 | if p.elementInScope(defaultScope, a.Ruby) { | |
957 | p.generateImpliedEndTags() | |
958 | } | |
959 | p.addElement() | |
960 | case a.Math, a.Svg: | |
961 | p.reconstructActiveFormattingElements() | |
962 | if p.tok.DataAtom == a.Math { | |
963 | adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) | |
964 | } else { | |
965 | adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) | |
966 | } | |
967 | adjustForeignAttributes(p.tok.Attr) | |
968 | p.addElement() | |
969 | p.top().Namespace = p.tok.Data | |
970 | if p.hasSelfClosingToken { | |
971 | p.oe.pop() | |
972 | p.acknowledgeSelfClosingTag() | |
973 | } | |
974 | return true | |
975 | case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: | |
976 | // Ignore the token. | |
977 | default: | |
978 | p.reconstructActiveFormattingElements() | |
979 | p.addElement() | |
980 | } | |
981 | case EndTagToken: | |
982 | switch p.tok.DataAtom { | |
983 | case a.Body: | |
984 | if p.elementInScope(defaultScope, a.Body) { | |
985 | p.im = afterBodyIM | |
986 | } | |
987 | case a.Html: | |
988 | if p.elementInScope(defaultScope, a.Body) { | |
989 | p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) | |
990 | return false | |
991 | } | |
992 | return true | |
993 | case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: | |
994 | p.popUntil(defaultScope, p.tok.DataAtom) | |
995 | case a.Form: | |
996 | node := p.form | |
997 | p.form = nil | |
998 | i := p.indexOfElementInScope(defaultScope, a.Form) | |
999 | if node == nil || i == -1 || p.oe[i] != node { | |
1000 | // Ignore the token. | |
1001 | return true | |
1002 | } | |
1003 | p.generateImpliedEndTags() | |
1004 | p.oe.remove(node) | |
1005 | case a.P: | |
1006 | if !p.elementInScope(buttonScope, a.P) { | |
1007 | p.parseImpliedToken(StartTagToken, a.P, a.P.String()) | |
1008 | } | |
1009 | p.popUntil(buttonScope, a.P) | |
1010 | case a.Li: | |
1011 | p.popUntil(listItemScope, a.Li) | |
1012 | case a.Dd, a.Dt: | |
1013 | p.popUntil(defaultScope, p.tok.DataAtom) | |
1014 | case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: | |
1015 | p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) | |
1016 | case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: | |
1017 | p.inBodyEndTagFormatting(p.tok.DataAtom) | |
1018 | case a.Applet, a.Marquee, a.Object: | |
1019 | if p.popUntil(defaultScope, p.tok.DataAtom) { | |
1020 | p.clearActiveFormattingElements() | |
1021 | } | |
1022 | case a.Br: | |
1023 | p.tok.Type = StartTagToken | |
1024 | return false | |
1025 | default: | |
1026 | p.inBodyEndTagOther(p.tok.DataAtom) | |
1027 | } | |
1028 | case CommentToken: | |
1029 | p.addChild(&Node{ | |
1030 | Type: CommentNode, | |
1031 | Data: p.tok.Data, | |
1032 | }) | |
1033 | } | |
1034 | ||
1035 | return true | |
1036 | } | |
1037 | ||
1038 | func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) { | |
1039 | // This is the "adoption agency" algorithm, described at | |
1040 | // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency | |
1041 | ||
1042 | // TODO: this is a fairly literal line-by-line translation of that algorithm. | |
1043 | // Once the code successfully parses the comprehensive test suite, we should | |
1044 | // refactor this code to be more idiomatic. | |
1045 | ||
1046 | // Steps 1-4. The outer loop. | |
1047 | for i := 0; i < 8; i++ { | |
1048 | // Step 5. Find the formatting element. | |
1049 | var formattingElement *Node | |
1050 | for j := len(p.afe) - 1; j >= 0; j-- { | |
1051 | if p.afe[j].Type == scopeMarkerNode { | |
1052 | break | |
1053 | } | |
1054 | if p.afe[j].DataAtom == tagAtom { | |
1055 | formattingElement = p.afe[j] | |
1056 | break | |
1057 | } | |
1058 | } | |
1059 | if formattingElement == nil { | |
1060 | p.inBodyEndTagOther(tagAtom) | |
1061 | return | |
1062 | } | |
1063 | feIndex := p.oe.index(formattingElement) | |
1064 | if feIndex == -1 { | |
1065 | p.afe.remove(formattingElement) | |
1066 | return | |
1067 | } | |
1068 | if !p.elementInScope(defaultScope, tagAtom) { | |
1069 | // Ignore the tag. | |
1070 | return | |
1071 | } | |
1072 | ||
1073 | // Steps 9-10. Find the furthest block. | |
1074 | var furthestBlock *Node | |
1075 | for _, e := range p.oe[feIndex:] { | |
1076 | if isSpecialElement(e) { | |
1077 | furthestBlock = e | |
1078 | break | |
1079 | } | |
1080 | } | |
1081 | if furthestBlock == nil { | |
1082 | e := p.oe.pop() | |
1083 | for e != formattingElement { | |
1084 | e = p.oe.pop() | |
1085 | } | |
1086 | p.afe.remove(e) | |
1087 | return | |
1088 | } | |
1089 | ||
1090 | // Steps 11-12. Find the common ancestor and bookmark node. | |
1091 | commonAncestor := p.oe[feIndex-1] | |
1092 | bookmark := p.afe.index(formattingElement) | |
1093 | ||
1094 | // Step 13. The inner loop. Find the lastNode to reparent. | |
1095 | lastNode := furthestBlock | |
1096 | node := furthestBlock | |
1097 | x := p.oe.index(node) | |
1098 | // Steps 13.1-13.2 | |
1099 | for j := 0; j < 3; j++ { | |
1100 | // Step 13.3. | |
1101 | x-- | |
1102 | node = p.oe[x] | |
1103 | // Step 13.4 - 13.5. | |
1104 | if p.afe.index(node) == -1 { | |
1105 | p.oe.remove(node) | |
1106 | continue | |
1107 | } | |
1108 | // Step 13.6. | |
1109 | if node == formattingElement { | |
1110 | break | |
1111 | } | |
1112 | // Step 13.7. | |
1113 | clone := node.clone() | |
1114 | p.afe[p.afe.index(node)] = clone | |
1115 | p.oe[p.oe.index(node)] = clone | |
1116 | node = clone | |
1117 | // Step 13.8. | |
1118 | if lastNode == furthestBlock { | |
1119 | bookmark = p.afe.index(node) + 1 | |
1120 | } | |
1121 | // Step 13.9. | |
1122 | if lastNode.Parent != nil { | |
1123 | lastNode.Parent.RemoveChild(lastNode) | |
1124 | } | |
1125 | node.AppendChild(lastNode) | |
1126 | // Step 13.10. | |
1127 | lastNode = node | |
1128 | } | |
1129 | ||
1130 | // Step 14. Reparent lastNode to the common ancestor, | |
1131 | // or for misnested table nodes, to the foster parent. | |
1132 | if lastNode.Parent != nil { | |
1133 | lastNode.Parent.RemoveChild(lastNode) | |
1134 | } | |
1135 | switch commonAncestor.DataAtom { | |
1136 | case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: | |
1137 | p.fosterParent(lastNode) | |
1138 | default: | |
1139 | commonAncestor.AppendChild(lastNode) | |
1140 | } | |
1141 | ||
1142 | // Steps 15-17. Reparent nodes from the furthest block's children | |
1143 | // to a clone of the formatting element. | |
1144 | clone := formattingElement.clone() | |
1145 | reparentChildren(clone, furthestBlock) | |
1146 | furthestBlock.AppendChild(clone) | |
1147 | ||
1148 | // Step 18. Fix up the list of active formatting elements. | |
1149 | if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { | |
1150 | // Move the bookmark with the rest of the list. | |
1151 | bookmark-- | |
1152 | } | |
1153 | p.afe.remove(formattingElement) | |
1154 | p.afe.insert(bookmark, clone) | |
1155 | ||
1156 | // Step 19. Fix up the stack of open elements. | |
1157 | p.oe.remove(formattingElement) | |
1158 | p.oe.insert(p.oe.index(furthestBlock)+1, clone) | |
1159 | } | |
1160 | } | |
1161 | ||
1162 | // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. | |
1163 | // "Any other end tag" handling from 12.2.5.5 The rules for parsing tokens in foreign content | |
1164 | // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign | |
1165 | func (p *parser) inBodyEndTagOther(tagAtom a.Atom) { | |
1166 | for i := len(p.oe) - 1; i >= 0; i-- { | |
1167 | if p.oe[i].DataAtom == tagAtom { | |
1168 | p.oe = p.oe[:i] | |
1169 | break | |
1170 | } | |
1171 | if isSpecialElement(p.oe[i]) { | |
1172 | break | |
1173 | } | |
1174 | } | |
1175 | } | |
1176 | ||
1177 | // Section 12.2.5.4.8. | |
1178 | func textIM(p *parser) bool { | |
1179 | switch p.tok.Type { | |
1180 | case ErrorToken: | |
1181 | p.oe.pop() | |
1182 | case TextToken: | |
1183 | d := p.tok.Data | |
1184 | if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { | |
1185 | // Ignore a newline at the start of a <textarea> block. | |
1186 | if d != "" && d[0] == '\r' { | |
1187 | d = d[1:] | |
1188 | } | |
1189 | if d != "" && d[0] == '\n' { | |
1190 | d = d[1:] | |
1191 | } | |
1192 | } | |
1193 | if d == "" { | |
1194 | return true | |
1195 | } | |
1196 | p.addText(d) | |
1197 | return true | |
1198 | case EndTagToken: | |
1199 | p.oe.pop() | |
1200 | } | |
1201 | p.im = p.originalIM | |
1202 | p.originalIM = nil | |
1203 | return p.tok.Type == EndTagToken | |
1204 | } | |
1205 | ||
1206 | // Section 12.2.5.4.9. | |
1207 | func inTableIM(p *parser) bool { | |
1208 | switch p.tok.Type { | |
1209 | case ErrorToken: | |
1210 | // Stop parsing. | |
1211 | return true | |
1212 | case TextToken: | |
1213 | p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) | |
1214 | switch p.oe.top().DataAtom { | |
1215 | case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: | |
1216 | if strings.Trim(p.tok.Data, whitespace) == "" { | |
1217 | p.addText(p.tok.Data) | |
1218 | return true | |
1219 | } | |
1220 | } | |
1221 | case StartTagToken: | |
1222 | switch p.tok.DataAtom { | |
1223 | case a.Caption: | |
1224 | p.clearStackToContext(tableScope) | |
1225 | p.afe = append(p.afe, &scopeMarker) | |
1226 | p.addElement() | |
1227 | p.im = inCaptionIM | |
1228 | return true | |
1229 | case a.Colgroup: | |
1230 | p.clearStackToContext(tableScope) | |
1231 | p.addElement() | |
1232 | p.im = inColumnGroupIM | |
1233 | return true | |
1234 | case a.Col: | |
1235 | p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) | |
1236 | return false | |
1237 | case a.Tbody, a.Tfoot, a.Thead: | |
1238 | p.clearStackToContext(tableScope) | |
1239 | p.addElement() | |
1240 | p.im = inTableBodyIM | |
1241 | return true | |
1242 | case a.Td, a.Th, a.Tr: | |
1243 | p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) | |
1244 | return false | |
1245 | case a.Table: | |
1246 | if p.popUntil(tableScope, a.Table) { | |
1247 | p.resetInsertionMode() | |
1248 | return false | |
1249 | } | |
1250 | // Ignore the token. | |
1251 | return true | |
1252 | case a.Style, a.Script: | |
1253 | return inHeadIM(p) | |
1254 | case a.Input: | |
1255 | for _, t := range p.tok.Attr { | |
1256 | if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { | |
1257 | p.addElement() | |
1258 | p.oe.pop() | |
1259 | return true | |
1260 | } | |
1261 | } | |
1262 | // Otherwise drop down to the default action. | |
1263 | case a.Form: | |
1264 | if p.form != nil { | |
1265 | // Ignore the token. | |
1266 | return true | |
1267 | } | |
1268 | p.addElement() | |
1269 | p.form = p.oe.pop() | |
1270 | case a.Select: | |
1271 | p.reconstructActiveFormattingElements() | |
1272 | switch p.top().DataAtom { | |
1273 | case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: | |
1274 | p.fosterParenting = true | |
1275 | } | |
1276 | p.addElement() | |
1277 | p.fosterParenting = false | |
1278 | p.framesetOK = false | |
1279 | p.im = inSelectInTableIM | |
1280 | return true | |
1281 | } | |
1282 | case EndTagToken: | |
1283 | switch p.tok.DataAtom { | |
1284 | case a.Table: | |
1285 | if p.popUntil(tableScope, a.Table) { | |
1286 | p.resetInsertionMode() | |
1287 | return true | |
1288 | } | |
1289 | // Ignore the token. | |
1290 | return true | |
1291 | case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: | |
1292 | // Ignore the token. | |
1293 | return true | |
1294 | } | |
1295 | case CommentToken: | |
1296 | p.addChild(&Node{ | |
1297 | Type: CommentNode, | |
1298 | Data: p.tok.Data, | |
1299 | }) | |
1300 | return true | |
1301 | case DoctypeToken: | |
1302 | // Ignore the token. | |
1303 | return true | |
1304 | } | |
1305 | ||
1306 | p.fosterParenting = true | |
1307 | defer func() { p.fosterParenting = false }() | |
1308 | ||
1309 | return inBodyIM(p) | |
1310 | } | |
1311 | ||
1312 | // Section 12.2.5.4.11. | |
1313 | func inCaptionIM(p *parser) bool { | |
1314 | switch p.tok.Type { | |
1315 | case StartTagToken: | |
1316 | switch p.tok.DataAtom { | |
1317 | case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: | |
1318 | if p.popUntil(tableScope, a.Caption) { | |
1319 | p.clearActiveFormattingElements() | |
1320 | p.im = inTableIM | |
1321 | return false | |
1322 | } else { | |
1323 | // Ignore the token. | |
1324 | return true | |
1325 | } | |
1326 | case a.Select: | |
1327 | p.reconstructActiveFormattingElements() | |
1328 | p.addElement() | |
1329 | p.framesetOK = false | |
1330 | p.im = inSelectInTableIM | |
1331 | return true | |
1332 | } | |
1333 | case EndTagToken: | |
1334 | switch p.tok.DataAtom { | |
1335 | case a.Caption: | |
1336 | if p.popUntil(tableScope, a.Caption) { | |
1337 | p.clearActiveFormattingElements() | |
1338 | p.im = inTableIM | |
1339 | } | |
1340 | return true | |
1341 | case a.Table: | |
1342 | if p.popUntil(tableScope, a.Caption) { | |
1343 | p.clearActiveFormattingElements() | |
1344 | p.im = inTableIM | |
1345 | return false | |
1346 | } else { | |
1347 | // Ignore the token. | |
1348 | return true | |
1349 | } | |
1350 | case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: | |
1351 | // Ignore the token. | |
1352 | return true | |
1353 | } | |
1354 | } | |
1355 | return inBodyIM(p) | |
1356 | } | |
1357 | ||
1358 | // Section 12.2.5.4.12. | |
1359 | func inColumnGroupIM(p *parser) bool { | |
1360 | switch p.tok.Type { | |
1361 | case TextToken: | |
1362 | s := strings.TrimLeft(p.tok.Data, whitespace) | |
1363 | if len(s) < len(p.tok.Data) { | |
1364 | // Add the initial whitespace to the current node. | |
1365 | p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) | |
1366 | if s == "" { | |
1367 | return true | |
1368 | } | |
1369 | p.tok.Data = s | |
1370 | } | |
1371 | case CommentToken: | |
1372 | p.addChild(&Node{ | |
1373 | Type: CommentNode, | |
1374 | Data: p.tok.Data, | |
1375 | }) | |
1376 | return true | |
1377 | case DoctypeToken: | |
1378 | // Ignore the token. | |
1379 | return true | |
1380 | case StartTagToken: | |
1381 | switch p.tok.DataAtom { | |
1382 | case a.Html: | |
1383 | return inBodyIM(p) | |
1384 | case a.Col: | |
1385 | p.addElement() | |
1386 | p.oe.pop() | |
1387 | p.acknowledgeSelfClosingTag() | |
1388 | return true | |
1389 | } | |
1390 | case EndTagToken: | |
1391 | switch p.tok.DataAtom { | |
1392 | case a.Colgroup: | |
1393 | if p.oe.top().DataAtom != a.Html { | |
1394 | p.oe.pop() | |
1395 | p.im = inTableIM | |
1396 | } | |
1397 | return true | |
1398 | case a.Col: | |
1399 | // Ignore the token. | |
1400 | return true | |
1401 | } | |
1402 | } | |
1403 | if p.oe.top().DataAtom != a.Html { | |
1404 | p.oe.pop() | |
1405 | p.im = inTableIM | |
1406 | return false | |
1407 | } | |
1408 | return true | |
1409 | } | |
1410 | ||
1411 | // Section 12.2.5.4.13. | |
1412 | func inTableBodyIM(p *parser) bool { | |
1413 | switch p.tok.Type { | |
1414 | case StartTagToken: | |
1415 | switch p.tok.DataAtom { | |
1416 | case a.Tr: | |
1417 | p.clearStackToContext(tableBodyScope) | |
1418 | p.addElement() | |
1419 | p.im = inRowIM | |
1420 | return true | |
1421 | case a.Td, a.Th: | |
1422 | p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) | |
1423 | return false | |
1424 | case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: | |
1425 | if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { | |
1426 | p.im = inTableIM | |
1427 | return false | |
1428 | } | |
1429 | // Ignore the token. | |
1430 | return true | |
1431 | } | |
1432 | case EndTagToken: | |
1433 | switch p.tok.DataAtom { | |
1434 | case a.Tbody, a.Tfoot, a.Thead: | |
1435 | if p.elementInScope(tableScope, p.tok.DataAtom) { | |
1436 | p.clearStackToContext(tableBodyScope) | |
1437 | p.oe.pop() | |
1438 | p.im = inTableIM | |
1439 | } | |
1440 | return true | |
1441 | case a.Table: | |
1442 | if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { | |
1443 | p.im = inTableIM | |
1444 | return false | |
1445 | } | |
1446 | // Ignore the token. | |
1447 | return true | |
1448 | case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: | |
1449 | // Ignore the token. | |
1450 | return true | |
1451 | } | |
1452 | case CommentToken: | |
1453 | p.addChild(&Node{ | |
1454 | Type: CommentNode, | |
1455 | Data: p.tok.Data, | |
1456 | }) | |
1457 | return true | |
1458 | } | |
1459 | ||
1460 | return inTableIM(p) | |
1461 | } | |
1462 | ||
1463 | // Section 12.2.5.4.14. | |
1464 | func inRowIM(p *parser) bool { | |
1465 | switch p.tok.Type { | |
1466 | case StartTagToken: | |
1467 | switch p.tok.DataAtom { | |
1468 | case a.Td, a.Th: | |
1469 | p.clearStackToContext(tableRowScope) | |
1470 | p.addElement() | |
1471 | p.afe = append(p.afe, &scopeMarker) | |
1472 | p.im = inCellIM | |
1473 | return true | |
1474 | case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: | |
1475 | if p.popUntil(tableScope, a.Tr) { | |
1476 | p.im = inTableBodyIM | |
1477 | return false | |
1478 | } | |
1479 | // Ignore the token. | |
1480 | return true | |
1481 | } | |
1482 | case EndTagToken: | |
1483 | switch p.tok.DataAtom { | |
1484 | case a.Tr: | |
1485 | if p.popUntil(tableScope, a.Tr) { | |
1486 | p.im = inTableBodyIM | |
1487 | return true | |
1488 | } | |
1489 | // Ignore the token. | |
1490 | return true | |
1491 | case a.Table: | |
1492 | if p.popUntil(tableScope, a.Tr) { | |
1493 | p.im = inTableBodyIM | |
1494 | return false | |
1495 | } | |
1496 | // Ignore the token. | |
1497 | return true | |
1498 | case a.Tbody, a.Tfoot, a.Thead: | |
1499 | if p.elementInScope(tableScope, p.tok.DataAtom) { | |
1500 | p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) | |
1501 | return false | |
1502 | } | |
1503 | // Ignore the token. | |
1504 | return true | |
1505 | case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: | |
1506 | // Ignore the token. | |
1507 | return true | |
1508 | } | |
1509 | } | |
1510 | ||
1511 | return inTableIM(p) | |
1512 | } | |
1513 | ||
1514 | // Section 12.2.5.4.15. | |
1515 | func inCellIM(p *parser) bool { | |
1516 | switch p.tok.Type { | |
1517 | case StartTagToken: | |
1518 | switch p.tok.DataAtom { | |
1519 | case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: | |
1520 | if p.popUntil(tableScope, a.Td, a.Th) { | |
1521 | // Close the cell and reprocess. | |
1522 | p.clearActiveFormattingElements() | |
1523 | p.im = inRowIM | |
1524 | return false | |
1525 | } | |
1526 | // Ignore the token. | |
1527 | return true | |
1528 | case a.Select: | |
1529 | p.reconstructActiveFormattingElements() | |
1530 | p.addElement() | |
1531 | p.framesetOK = false | |
1532 | p.im = inSelectInTableIM | |
1533 | return true | |
1534 | } | |
1535 | case EndTagToken: | |
1536 | switch p.tok.DataAtom { | |
1537 | case a.Td, a.Th: | |
1538 | if !p.popUntil(tableScope, p.tok.DataAtom) { | |
1539 | // Ignore the token. | |
1540 | return true | |
1541 | } | |
1542 | p.clearActiveFormattingElements() | |
1543 | p.im = inRowIM | |
1544 | return true | |
1545 | case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: | |
1546 | // Ignore the token. | |
1547 | return true | |
1548 | case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: | |
1549 | if !p.elementInScope(tableScope, p.tok.DataAtom) { | |
1550 | // Ignore the token. | |
1551 | return true | |
1552 | } | |
1553 | // Close the cell and reprocess. | |
1554 | p.popUntil(tableScope, a.Td, a.Th) | |
1555 | p.clearActiveFormattingElements() | |
1556 | p.im = inRowIM | |
1557 | return false | |
1558 | } | |
1559 | } | |
1560 | return inBodyIM(p) | |
1561 | } | |
1562 | ||
1563 | // Section 12.2.5.4.16. | |
1564 | func inSelectIM(p *parser) bool { | |
1565 | switch p.tok.Type { | |
1566 | case ErrorToken: | |
1567 | // Stop parsing. | |
1568 | return true | |
1569 | case TextToken: | |
1570 | p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) | |
1571 | case StartTagToken: | |
1572 | switch p.tok.DataAtom { | |
1573 | case a.Html: | |
1574 | return inBodyIM(p) | |
1575 | case a.Option: | |
1576 | if p.top().DataAtom == a.Option { | |
1577 | p.oe.pop() | |
1578 | } | |
1579 | p.addElement() | |
1580 | case a.Optgroup: | |
1581 | if p.top().DataAtom == a.Option { | |
1582 | p.oe.pop() | |
1583 | } | |
1584 | if p.top().DataAtom == a.Optgroup { | |
1585 | p.oe.pop() | |
1586 | } | |
1587 | p.addElement() | |
1588 | case a.Select: | |
1589 | p.tok.Type = EndTagToken | |
1590 | return false | |
1591 | case a.Input, a.Keygen, a.Textarea: | |
1592 | if p.elementInScope(selectScope, a.Select) { | |
1593 | p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) | |
1594 | return false | |
1595 | } | |
1596 | // In order to properly ignore <textarea>, we need to change the tokenizer mode. | |
1597 | p.tokenizer.NextIsNotRawText() | |
1598 | // Ignore the token. | |
1599 | return true | |
1600 | case a.Script: | |
1601 | return inHeadIM(p) | |
1602 | } | |
1603 | case EndTagToken: | |
1604 | switch p.tok.DataAtom { | |
1605 | case a.Option: | |
1606 | if p.top().DataAtom == a.Option { | |
1607 | p.oe.pop() | |
1608 | } | |
1609 | case a.Optgroup: | |
1610 | i := len(p.oe) - 1 | |
1611 | if p.oe[i].DataAtom == a.Option { | |
1612 | i-- | |
1613 | } | |
1614 | if p.oe[i].DataAtom == a.Optgroup { | |
1615 | p.oe = p.oe[:i] | |
1616 | } | |
1617 | case a.Select: | |
1618 | if p.popUntil(selectScope, a.Select) { | |
1619 | p.resetInsertionMode() | |
1620 | } | |
1621 | } | |
1622 | case CommentToken: | |
1623 | p.addChild(&Node{ | |
1624 | Type: CommentNode, | |
1625 | Data: p.tok.Data, | |
1626 | }) | |
1627 | case DoctypeToken: | |
1628 | // Ignore the token. | |
1629 | return true | |
1630 | } | |
1631 | ||
1632 | return true | |
1633 | } | |
1634 | ||
1635 | // Section 12.2.5.4.17. | |
1636 | func inSelectInTableIM(p *parser) bool { | |
1637 | switch p.tok.Type { | |
1638 | case StartTagToken, EndTagToken: | |
1639 | switch p.tok.DataAtom { | |
1640 | case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: | |
1641 | if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.DataAtom) { | |
1642 | p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) | |
1643 | return false | |
1644 | } else { | |
1645 | // Ignore the token. | |
1646 | return true | |
1647 | } | |
1648 | } | |
1649 | } | |
1650 | return inSelectIM(p) | |
1651 | } | |
1652 | ||
1653 | // Section 12.2.5.4.18. | |
1654 | func afterBodyIM(p *parser) bool { | |
1655 | switch p.tok.Type { | |
1656 | case ErrorToken: | |
1657 | // Stop parsing. | |
1658 | return true | |
1659 | case TextToken: | |
1660 | s := strings.TrimLeft(p.tok.Data, whitespace) | |
1661 | if len(s) == 0 { | |
1662 | // It was all whitespace. | |
1663 | return inBodyIM(p) | |
1664 | } | |
1665 | case StartTagToken: | |
1666 | if p.tok.DataAtom == a.Html { | |
1667 | return inBodyIM(p) | |
1668 | } | |
1669 | case EndTagToken: | |
1670 | if p.tok.DataAtom == a.Html { | |
1671 | if !p.fragment { | |
1672 | p.im = afterAfterBodyIM | |
1673 | } | |
1674 | return true | |
1675 | } | |
1676 | case CommentToken: | |
1677 | // The comment is attached to the <html> element. | |
1678 | if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { | |
1679 | panic("html: bad parser state: <html> element not found, in the after-body insertion mode") | |
1680 | } | |
1681 | p.oe[0].AppendChild(&Node{ | |
1682 | Type: CommentNode, | |
1683 | Data: p.tok.Data, | |
1684 | }) | |
1685 | return true | |
1686 | } | |
1687 | p.im = inBodyIM | |
1688 | return false | |
1689 | } | |
1690 | ||
1691 | // Section 12.2.5.4.19. | |
1692 | func inFramesetIM(p *parser) bool { | |
1693 | switch p.tok.Type { | |
1694 | case CommentToken: | |
1695 | p.addChild(&Node{ | |
1696 | Type: CommentNode, | |
1697 | Data: p.tok.Data, | |
1698 | }) | |
1699 | case TextToken: | |
1700 | // Ignore all text but whitespace. | |
1701 | s := strings.Map(func(c rune) rune { | |
1702 | switch c { | |
1703 | case ' ', '\t', '\n', '\f', '\r': | |
1704 | return c | |
1705 | } | |
1706 | return -1 | |
1707 | }, p.tok.Data) | |
1708 | if s != "" { | |
1709 | p.addText(s) | |
1710 | } | |
1711 | case StartTagToken: | |
1712 | switch p.tok.DataAtom { | |
1713 | case a.Html: | |
1714 | return inBodyIM(p) | |
1715 | case a.Frameset: | |
1716 | p.addElement() | |
1717 | case a.Frame: | |
1718 | p.addElement() | |
1719 | p.oe.pop() | |
1720 | p.acknowledgeSelfClosingTag() | |
1721 | case a.Noframes: | |
1722 | return inHeadIM(p) | |
1723 | } | |
1724 | case EndTagToken: | |
1725 | switch p.tok.DataAtom { | |
1726 | case a.Frameset: | |
1727 | if p.oe.top().DataAtom != a.Html { | |
1728 | p.oe.pop() | |
1729 | if p.oe.top().DataAtom != a.Frameset { | |
1730 | p.im = afterFramesetIM | |
1731 | return true | |
1732 | } | |
1733 | } | |
1734 | } | |
1735 | default: | |
1736 | // Ignore the token. | |
1737 | } | |
1738 | return true | |
1739 | } | |
1740 | ||
1741 | // Section 12.2.5.4.20. | |
1742 | func afterFramesetIM(p *parser) bool { | |
1743 | switch p.tok.Type { | |
1744 | case CommentToken: | |
1745 | p.addChild(&Node{ | |
1746 | Type: CommentNode, | |
1747 | Data: p.tok.Data, | |
1748 | }) | |
1749 | case TextToken: | |
1750 | // Ignore all text but whitespace. | |
1751 | s := strings.Map(func(c rune) rune { | |
1752 | switch c { | |
1753 | case ' ', '\t', '\n', '\f', '\r': | |
1754 | return c | |
1755 | } | |
1756 | return -1 | |
1757 | }, p.tok.Data) | |
1758 | if s != "" { | |
1759 | p.addText(s) | |
1760 | } | |
1761 | case StartTagToken: | |
1762 | switch p.tok.DataAtom { | |
1763 | case a.Html: | |
1764 | return inBodyIM(p) | |
1765 | case a.Noframes: | |
1766 | return inHeadIM(p) | |
1767 | } | |
1768 | case EndTagToken: | |
1769 | switch p.tok.DataAtom { | |
1770 | case a.Html: | |
1771 | p.im = afterAfterFramesetIM | |
1772 | return true | |
1773 | } | |
1774 | default: | |
1775 | // Ignore the token. | |
1776 | } | |
1777 | return true | |
1778 | } | |
1779 | ||
1780 | // Section 12.2.5.4.21. | |
1781 | func afterAfterBodyIM(p *parser) bool { | |
1782 | switch p.tok.Type { | |
1783 | case ErrorToken: | |
1784 | // Stop parsing. | |
1785 | return true | |
1786 | case TextToken: | |
1787 | s := strings.TrimLeft(p.tok.Data, whitespace) | |
1788 | if len(s) == 0 { | |
1789 | // It was all whitespace. | |
1790 | return inBodyIM(p) | |
1791 | } | |
1792 | case StartTagToken: | |
1793 | if p.tok.DataAtom == a.Html { | |
1794 | return inBodyIM(p) | |
1795 | } | |
1796 | case CommentToken: | |
1797 | p.doc.AppendChild(&Node{ | |
1798 | Type: CommentNode, | |
1799 | Data: p.tok.Data, | |
1800 | }) | |
1801 | return true | |
1802 | case DoctypeToken: | |
1803 | return inBodyIM(p) | |
1804 | } | |
1805 | p.im = inBodyIM | |
1806 | return false | |
1807 | } | |
1808 | ||
1809 | // Section 12.2.5.4.22. | |
1810 | func afterAfterFramesetIM(p *parser) bool { | |
1811 | switch p.tok.Type { | |
1812 | case CommentToken: | |
1813 | p.doc.AppendChild(&Node{ | |
1814 | Type: CommentNode, | |
1815 | Data: p.tok.Data, | |
1816 | }) | |
1817 | case TextToken: | |
1818 | // Ignore all text but whitespace. | |
1819 | s := strings.Map(func(c rune) rune { | |
1820 | switch c { | |
1821 | case ' ', '\t', '\n', '\f', '\r': | |
1822 | return c | |
1823 | } | |
1824 | return -1 | |
1825 | }, p.tok.Data) | |
1826 | if s != "" { | |
1827 | p.tok.Data = s | |
1828 | return inBodyIM(p) | |
1829 | } | |
1830 | case StartTagToken: | |
1831 | switch p.tok.DataAtom { | |
1832 | case a.Html: | |
1833 | return inBodyIM(p) | |
1834 | case a.Noframes: | |
1835 | return inHeadIM(p) | |
1836 | } | |
1837 | case DoctypeToken: | |
1838 | return inBodyIM(p) | |
1839 | default: | |
1840 | // Ignore the token. | |
1841 | } | |
1842 | return true | |
1843 | } | |
1844 | ||
1845 | const whitespaceOrNUL = whitespace + "\x00" | |
1846 | ||
1847 | // Section 12.2.5.5. | |
1848 | func parseForeignContent(p *parser) bool { | |
1849 | switch p.tok.Type { | |
1850 | case TextToken: | |
1851 | if p.framesetOK { | |
1852 | p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" | |
1853 | } | |
1854 | p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) | |
1855 | p.addText(p.tok.Data) | |
1856 | case CommentToken: | |
1857 | p.addChild(&Node{ | |
1858 | Type: CommentNode, | |
1859 | Data: p.tok.Data, | |
1860 | }) | |
1861 | case StartTagToken: | |
1862 | b := breakout[p.tok.Data] | |
1863 | if p.tok.DataAtom == a.Font { | |
1864 | loop: | |
1865 | for _, attr := range p.tok.Attr { | |
1866 | switch attr.Key { | |
1867 | case "color", "face", "size": | |
1868 | b = true | |
1869 | break loop | |
1870 | } | |
1871 | } | |
1872 | } | |
1873 | if b { | |
1874 | for i := len(p.oe) - 1; i >= 0; i-- { | |
1875 | n := p.oe[i] | |
1876 | if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { | |
1877 | p.oe = p.oe[:i+1] | |
1878 | break | |
1879 | } | |
1880 | } | |
1881 | return false | |
1882 | } | |
1883 | switch p.top().Namespace { | |
1884 | case "math": | |
1885 | adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) | |
1886 | case "svg": | |
1887 | // Adjust SVG tag names. The tokenizer lower-cases tag names, but | |
1888 | // SVG wants e.g. "foreignObject" with a capital second "O". | |
1889 | if x := svgTagNameAdjustments[p.tok.Data]; x != "" { | |
1890 | p.tok.DataAtom = a.Lookup([]byte(x)) | |
1891 | p.tok.Data = x | |
1892 | } | |
1893 | adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) | |
1894 | default: | |
1895 | panic("html: bad parser state: unexpected namespace") | |
1896 | } | |
1897 | adjustForeignAttributes(p.tok.Attr) | |
1898 | namespace := p.top().Namespace | |
1899 | p.addElement() | |
1900 | p.top().Namespace = namespace | |
1901 | if namespace != "" { | |
1902 | // Don't let the tokenizer go into raw text mode in foreign content | |
1903 | // (e.g. in an SVG <title> tag). | |
1904 | p.tokenizer.NextIsNotRawText() | |
1905 | } | |
1906 | if p.hasSelfClosingToken { | |
1907 | p.oe.pop() | |
1908 | p.acknowledgeSelfClosingTag() | |
1909 | } | |
1910 | case EndTagToken: | |
1911 | for i := len(p.oe) - 1; i >= 0; i-- { | |
1912 | if p.oe[i].Namespace == "" { | |
1913 | return p.im(p) | |
1914 | } | |
1915 | if strings.EqualFold(p.oe[i].Data, p.tok.Data) { | |
1916 | p.oe = p.oe[:i] | |
1917 | break | |
1918 | } | |
1919 | } | |
1920 | return true | |
1921 | default: | |
1922 | // Ignore the token. | |
1923 | } | |
1924 | return true | |
1925 | } | |
1926 | ||
1927 | // Section 12.2.5. | |
1928 | func (p *parser) inForeignContent() bool { | |
1929 | if len(p.oe) == 0 { | |
1930 | return false | |
1931 | } | |
1932 | n := p.oe[len(p.oe)-1] | |
1933 | if n.Namespace == "" { | |
1934 | return false | |
1935 | } | |
1936 | if mathMLTextIntegrationPoint(n) { | |
1937 | if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { | |
1938 | return false | |
1939 | } | |
1940 | if p.tok.Type == TextToken { | |
1941 | return false | |
1942 | } | |
1943 | } | |
1944 | if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { | |
1945 | return false | |
1946 | } | |
1947 | if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { | |
1948 | return false | |
1949 | } | |
1950 | if p.tok.Type == ErrorToken { | |
1951 | return false | |
1952 | } | |
1953 | return true | |
1954 | } | |
1955 | ||
1956 | // parseImpliedToken parses a token as though it had appeared in the parser's | |
1957 | // input. | |
1958 | func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { | |
1959 | realToken, selfClosing := p.tok, p.hasSelfClosingToken | |
1960 | p.tok = Token{ | |
1961 | Type: t, | |
1962 | DataAtom: dataAtom, | |
1963 | Data: data, | |
1964 | } | |
1965 | p.hasSelfClosingToken = false | |
1966 | p.parseCurrentToken() | |
1967 | p.tok, p.hasSelfClosingToken = realToken, selfClosing | |
1968 | } | |
1969 | ||
1970 | // parseCurrentToken runs the current token through the parsing routines | |
1971 | // until it is consumed. | |
1972 | func (p *parser) parseCurrentToken() { | |
1973 | if p.tok.Type == SelfClosingTagToken { | |
1974 | p.hasSelfClosingToken = true | |
1975 | p.tok.Type = StartTagToken | |
1976 | } | |
1977 | ||
1978 | consumed := false | |
1979 | for !consumed { | |
1980 | if p.inForeignContent() { | |
1981 | consumed = parseForeignContent(p) | |
1982 | } else { | |
1983 | consumed = p.im(p) | |
1984 | } | |
1985 | } | |
1986 | ||
1987 | if p.hasSelfClosingToken { | |
1988 | // This is a parse error, but ignore it. | |
1989 | p.hasSelfClosingToken = false | |
1990 | } | |
1991 | } | |
1992 | ||
1993 | func (p *parser) parse() error { | |
1994 | // Iterate until EOF. Any other error will cause an early return. | |
1995 | var err error | |
1996 | for err != io.EOF { | |
1997 | // CDATA sections are allowed only in foreign content. | |
1998 | n := p.oe.top() | |
1999 | p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") | |
2000 | // Read and parse the next token. | |
2001 | p.tokenizer.Next() | |
2002 | p.tok = p.tokenizer.Token() | |
2003 | if p.tok.Type == ErrorToken { | |
2004 | err = p.tokenizer.Err() | |
2005 | if err != nil && err != io.EOF { | |
2006 | return err | |
2007 | } | |
2008 | } | |
2009 | p.parseCurrentToken() | |
2010 | } | |
2011 | return nil | |
2012 | } | |
2013 | ||
2014 | // Parse returns the parse tree for the HTML from the given Reader. | |
2015 | // The input is assumed to be UTF-8 encoded. | |
2016 | func Parse(r io.Reader) (*Node, error) { | |
2017 | p := &parser{ | |
2018 | tokenizer: NewTokenizer(r), | |
2019 | doc: &Node{ | |
2020 | Type: DocumentNode, | |
2021 | }, | |
2022 | scripting: true, | |
2023 | framesetOK: true, | |
2024 | im: initialIM, | |
2025 | } | |
2026 | err := p.parse() | |
2027 | if err != nil { | |
2028 | return nil, err | |
2029 | } | |
2030 | return p.doc, nil | |
2031 | } | |
2032 | ||
2033 | // ParseFragment parses a fragment of HTML and returns the nodes that were | |
2034 | // found. If the fragment is the InnerHTML for an existing element, pass that | |
2035 | // element in context. | |
2036 | func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { | |
2037 | contextTag := "" | |
2038 | if context != nil { | |
2039 | if context.Type != ElementNode { | |
2040 | return nil, errors.New("html: ParseFragment of non-element Node") | |
2041 | } | |
2042 | // The next check isn't just context.DataAtom.String() == context.Data because | |
2043 | // it is valid to pass an element whose tag isn't a known atom. For example, | |
2044 | // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. | |
2045 | if context.DataAtom != a.Lookup([]byte(context.Data)) { | |
2046 | return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) | |
2047 | } | |
2048 | contextTag = context.DataAtom.String() | |
2049 | } | |
2050 | p := &parser{ | |
2051 | tokenizer: NewTokenizerFragment(r, contextTag), | |
2052 | doc: &Node{ | |
2053 | Type: DocumentNode, | |
2054 | }, | |
2055 | scripting: true, | |
2056 | fragment: true, | |
2057 | context: context, | |
2058 | } | |
2059 | ||
2060 | root := &Node{ | |
2061 | Type: ElementNode, | |
2062 | DataAtom: a.Html, | |
2063 | Data: a.Html.String(), | |
2064 | } | |
2065 | p.doc.AppendChild(root) | |
2066 | p.oe = nodeStack{root} | |
2067 | p.resetInsertionMode() | |
2068 | ||
2069 | for n := context; n != nil; n = n.Parent { | |
2070 | if n.Type == ElementNode && n.DataAtom == a.Form { | |
2071 | p.form = n | |
2072 | break | |
2073 | } | |
2074 | } | |
2075 | ||
2076 | err := p.parse() | |
2077 | if err != nil { | |
2078 | return nil, err | |
2079 | } | |
2080 | ||
2081 | parent := p.doc | |
2082 | if context != nil { | |
2083 | parent = root | |
2084 | } | |
2085 | ||
2086 | var result []*Node | |
2087 | for c := parent.FirstChild; c != nil; { | |
2088 | next := c.NextSibling | |
2089 | parent.RemoveChild(c) | |
2090 | result = append(result, c) | |
2091 | c = next | |
2092 | } | |
2093 | return result, nil | |
2094 | } |