1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
13 a "golang.org/x/net/html/atom"
16 // A parser implements the HTML5 parsing algorithm:
17 // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
19 // tokenizer provides the tokens for the parser.
21 // tok is the most recently read token.
23 // Self-closing tags like <hr/> are treated as start tags, except that
24 // hasSelfClosingToken is set while they are being processed.
25 hasSelfClosingToken bool
26 // doc is the document root element.
28 // The stack of open elements (section 12.2.4.2) and active formatting
29 // elements (section 12.2.4.3).
31 // Element pointers (section 12.2.4.4).
33 // Other parsing state flags (section 12.2.4.5).
34 scripting, framesetOK bool
35 // The stack of template insertion modes
36 templateStack insertionModeStack
37 // im is the current insertion mode.
39 // originalIM is the insertion mode to go back to after completing a text
40 // or inTableText insertion mode.
41 originalIM insertionMode
42 // fosterParenting is whether new elements should be inserted according to
43 // the foster parenting rules (section 12.2.6.1).
45 // quirks is whether the parser is operating in "quirks mode."
47 // fragment is whether the parser is parsing an HTML fragment.
49 // context is the context element when parsing an HTML fragment
54 func (p *parser) top() *Node {
55 if n := p.oe.top(); n != nil {
61 // Stop tags for use in popUntil. These come from section 12.2.4.2.
63 defaultScopeStopTags = map[string][]a.Atom{
64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
66 "svg": {a.Desc, a.ForeignObject, a.Title},
73 defaultScope scope = iota
82 // popUntil pops the stack of open elements at the highest element whose tag
83 // is in matchTags, provided there is no higher element in the scope's stop
84 // tags (as defined in section 12.2.4.2). It returns whether or not there was
85 // such an element. If there was not, popUntil leaves the stack unchanged.
87 // For example, the set of stop tags for table scope is: "html", "table". If
89 // ["html", "body", "font", "table", "b", "i", "u"]
90 // then popUntil(tableScope, "font") would return false, but
91 // popUntil(tableScope, "i") would return true and the stack would become:
92 // ["html", "body", "font", "table", "b"]
94 // If an element's tag is in both the stop tags and matchTags, then the stack
95 // will be popped and the function returns true (provided, of course, there was
96 // no higher element in the stack that was also in the stop tags). For example,
97 // popUntil(tableScope, "table") returns true and leaves:
98 // ["html", "body", "font"]
99 func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
107 // indexOfElementInScope returns the index in p.oe of the highest element whose
108 // tag is in matchTags that is in scope. If no matching element is in scope, it
110 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
111 for i := len(p.oe) - 1; i >= 0; i-- {
112 tagAtom := p.oe[i].DataAtom
113 if p.oe[i].Namespace == "" {
114 for _, t := range matchTags {
123 if tagAtom == a.Ol || tagAtom == a.Ul {
127 if tagAtom == a.Button {
131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
135 if tagAtom != a.Optgroup && tagAtom != a.Option {
143 case defaultScope, listItemScope, buttonScope:
144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
154 // elementInScope is like popUntil, except that it doesn't modify the stack of
156 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
157 return p.indexOfElementInScope(s, matchTags...) != -1
160 // clearStackToContext pops elements off the stack of open elements until a
161 // scope-defined element is found.
162 func (p *parser) clearStackToContext(s scope) {
163 for i := len(p.oe) - 1; i >= 0; i-- {
164 tagAtom := p.oe[i].DataAtom
167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
187 // generateImpliedEndTags pops nodes off the stack of open elements as long as
188 // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
189 // If exceptions are specified, nodes with that name will not be popped off.
190 func (p *parser) generateImpliedEndTags(exceptions ...string) {
193 for i = len(p.oe) - 1; i >= 0; i-- {
195 if n.Type == ElementNode {
197 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
198 for _, except := range exceptions {
199 if n.Data == except {
212 // addChild adds a child node n to the top element, and pushes n onto the stack
213 // of open elements if it is an element node.
214 func (p *parser) addChild(n *Node) {
215 if p.shouldFosterParent() {
218 p.top().AppendChild(n)
221 if n.Type == ElementNode {
222 p.oe = append(p.oe, n)
226 // shouldFosterParent returns whether the next node to be added should be
228 func (p *parser) shouldFosterParent() bool {
229 if p.fosterParenting {
230 switch p.top().DataAtom {
231 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
238 // fosterParent adds a child node according to the foster parenting rules.
239 // Section 12.2.6.1, "foster parenting".
240 func (p *parser) fosterParent(n *Node) {
241 var table, parent, prev, template *Node
243 for i = len(p.oe) - 1; i >= 0; i-- {
244 if p.oe[i].DataAtom == a.Table {
251 for j = len(p.oe) - 1; j >= 0; j-- {
252 if p.oe[j].DataAtom == a.Template {
258 if template != nil && (table == nil || j > i) {
259 template.AppendChild(n)
264 // The foster parent is the html element.
267 parent = table.Parent
274 prev = table.PrevSibling
276 prev = parent.LastChild
278 if prev != nil && prev.Type == TextNode && n.Type == TextNode {
283 parent.InsertBefore(n, table)
286 // addText adds text to the preceding node if it is a text node, or else it
287 // calls addChild with a new text node.
288 func (p *parser) addText(text string) {
293 if p.shouldFosterParent() {
294 p.fosterParent(&Node{
302 if n := t.LastChild; n != nil && n.Type == TextNode {
312 // addElement adds a child element based on the current token.
313 func (p *parser) addElement() {
316 DataAtom: p.tok.DataAtom,
323 func (p *parser) addFormattingElement() {
324 tagAtom, attr := p.tok.DataAtom, p.tok.Attr
327 // Implement the Noah's Ark clause, but with three per family instead of two.
328 identicalElements := 0
329 findIdenticalElements:
330 for i := len(p.afe) - 1; i >= 0; i-- {
332 if n.Type == scopeMarkerNode {
335 if n.Type != ElementNode {
338 if n.Namespace != "" {
341 if n.DataAtom != tagAtom {
344 if len(n.Attr) != len(attr) {
348 for _, t0 := range n.Attr {
349 for _, t1 := range attr {
350 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
351 // Found a match for this attribute, continue with the next attribute.
352 continue compareAttributes
355 // If we get here, there is no attribute in attr that matches t0.
356 // Therefore the element is not identical to the new one.
357 continue findIdenticalElements
361 if identicalElements >= 3 {
366 p.afe = append(p.afe, p.top())
370 func (p *parser) clearActiveFormattingElements() {
373 if len(p.afe) == 0 || n.Type == scopeMarkerNode {
380 func (p *parser) reconstructActiveFormattingElements() {
385 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
389 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
399 clone := p.afe[i].clone()
402 if i == len(p.afe)-1 {
409 func (p *parser) acknowledgeSelfClosingTag() {
410 p.hasSelfClosingToken = false
413 // An insertion mode (section 12.2.4.1) is the state transition function from
414 // a particular state in the HTML5 parser's state machine. It updates the
415 // parser's fields depending on parser.tok (where ErrorToken means EOF).
416 // It returns whether the token was consumed.
417 type insertionMode func(*parser) bool
419 // setOriginalIM sets the insertion mode to return to after completing a text or
420 // inTableText insertion mode.
421 // Section 12.2.4.1, "using the rules for".
422 func (p *parser) setOriginalIM() {
423 if p.originalIM != nil {
424 panic("html: bad parser state: originalIM was set twice")
429 // Section 12.2.4.1, "reset the insertion mode".
430 func (p *parser) resetInsertionMode() {
431 for i := len(p.oe) - 1; i >= 0; i-- {
434 if last && p.context != nil {
441 for ancestor, first := n, p.oe[0]; ancestor != first; {
442 if ancestor == first {
445 ancestor = p.oe[p.oe.index(ancestor)-1]
446 switch ancestor.DataAtom {
451 p.im = inSelectInTableIM
458 // TODO: remove this divergence from the HTML5 spec.
460 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
464 case a.Tbody, a.Thead, a.Tfoot:
469 p.im = inColumnGroupIM
473 // TODO: remove this divergence from the HTML5 spec.
474 if n.Namespace != "" {
477 p.im = p.templateStack.top()
479 // TODO: remove this divergence from the HTML5 spec.
481 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
504 const whitespace = " \t\r\n\f"
506 // Section 12.2.6.4.1.
507 func initialIM(p *parser) bool {
510 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
511 if len(p.tok.Data) == 0 {
512 // It was all whitespace, so ignore it.
516 p.doc.AppendChild(&Node{
522 n, quirks := parseDoctype(p.tok.Data)
533 // Section 12.2.6.4.2.
534 func beforeHTMLIM(p *parser) bool {
540 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
541 if len(p.tok.Data) == 0 {
542 // It was all whitespace, so ignore it.
546 if p.tok.DataAtom == a.Html {
552 switch p.tok.DataAtom {
553 case a.Head, a.Body, a.Html, a.Br:
554 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
561 p.doc.AppendChild(&Node{
567 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
571 // Section 12.2.6.4.3.
572 func beforeHeadIM(p *parser) bool {
575 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
576 if len(p.tok.Data) == 0 {
577 // It was all whitespace, so ignore it.
581 switch p.tok.DataAtom {
591 switch p.tok.DataAtom {
592 case a.Head, a.Body, a.Html, a.Br:
593 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
610 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
614 // Section 12.2.6.4.4.
615 func inHeadIM(p *parser) bool {
618 s := strings.TrimLeft(p.tok.Data, whitespace)
619 if len(s) < len(p.tok.Data) {
620 // Add the initial whitespace to the current node.
621 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
628 switch p.tok.DataAtom {
631 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta:
634 p.acknowledgeSelfClosingTag()
636 case a.Script, a.Title, a.Noscript, a.Noframes, a.Style:
646 p.afe = append(p.afe, &scopeMarker)
649 p.templateStack = append(p.templateStack, inTemplateIM)
653 switch p.tok.DataAtom {
658 case a.Body, a.Html, a.Br:
659 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
662 if !p.oe.contains(a.Template) {
665 // TODO: remove this divergence from the HTML5 spec.
667 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
668 p.generateImpliedEndTags()
669 for i := len(p.oe) - 1; i >= 0; i-- {
670 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
675 p.clearActiveFormattingElements()
676 p.templateStack.pop()
677 p.resetInsertionMode()
694 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
698 // Section 12.2.6.4.6.
699 func afterHeadIM(p *parser) bool {
702 s := strings.TrimLeft(p.tok.Data, whitespace)
703 if len(s) < len(p.tok.Data) {
704 // Add the initial whitespace to the current node.
705 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
712 switch p.tok.DataAtom {
724 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
725 p.oe = append(p.oe, p.head)
726 defer p.oe.remove(p.head)
733 switch p.tok.DataAtom {
734 case a.Body, a.Html, a.Br:
735 // Drop down to creating an implied <body> tag.
753 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
758 // copyAttributes copies attributes of src not found on dst to dst.
759 func copyAttributes(dst *Node, src Token) {
760 if len(src.Attr) == 0 {
763 attr := map[string]string{}
764 for _, t := range dst.Attr {
767 for _, t := range src.Attr {
768 if _, ok := attr[t.Key]; !ok {
769 dst.Attr = append(dst.Attr, t)
775 // Section 12.2.6.4.7.
776 func inBodyIM(p *parser) bool {
780 switch n := p.oe.top(); n.DataAtom {
781 case a.Pre, a.Listing:
782 if n.FirstChild == nil {
783 // Ignore a newline at the start of a <pre> block.
784 if d != "" && d[0] == '\r' {
787 if d != "" && d[0] == '\n' {
792 d = strings.Replace(d, "\x00", "", -1)
796 p.reconstructActiveFormattingElements()
798 if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
799 // There were non-whitespace characters inserted.
803 switch p.tok.DataAtom {
805 if p.oe.contains(a.Template) {
808 copyAttributes(p.oe[0], p.tok)
809 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
812 if p.oe.contains(a.Template) {
817 if body.Type == ElementNode && body.DataAtom == a.Body {
819 copyAttributes(body, p.tok)
823 if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
828 if body.Parent != nil {
829 body.Parent.RemoveChild(body)
835 case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
836 p.popUntil(buttonScope, a.P)
838 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
839 p.popUntil(buttonScope, a.P)
840 switch n := p.top(); n.DataAtom {
841 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
845 case a.Pre, a.Listing:
846 p.popUntil(buttonScope, a.P)
848 // The newline, if any, will be dealt with by the TextToken case.
851 if p.form != nil && !p.oe.contains(a.Template) {
855 p.popUntil(buttonScope, a.P)
857 if !p.oe.contains(a.Template) {
862 for i := len(p.oe) - 1; i >= 0; i-- {
864 switch node.DataAtom {
867 case a.Address, a.Div, a.P:
870 if !isSpecialElement(node) {
876 p.popUntil(buttonScope, a.P)
880 for i := len(p.oe) - 1; i >= 0; i-- {
882 switch node.DataAtom {
885 case a.Address, a.Div, a.P:
888 if !isSpecialElement(node) {
894 p.popUntil(buttonScope, a.P)
897 p.popUntil(buttonScope, a.P)
900 p.popUntil(defaultScope, a.Button)
901 p.reconstructActiveFormattingElements()
905 for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
906 if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
907 p.inBodyEndTagFormatting(a.A)
913 p.reconstructActiveFormattingElements()
914 p.addFormattingElement()
915 case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
916 p.reconstructActiveFormattingElements()
917 p.addFormattingElement()
919 p.reconstructActiveFormattingElements()
920 if p.elementInScope(defaultScope, a.Nobr) {
921 p.inBodyEndTagFormatting(a.Nobr)
922 p.reconstructActiveFormattingElements()
924 p.addFormattingElement()
925 case a.Applet, a.Marquee, a.Object:
926 p.reconstructActiveFormattingElements()
928 p.afe = append(p.afe, &scopeMarker)
932 p.popUntil(buttonScope, a.P)
938 case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
939 p.reconstructActiveFormattingElements()
942 p.acknowledgeSelfClosingTag()
943 if p.tok.DataAtom == a.Input {
944 for _, t := range p.tok.Attr {
946 if strings.ToLower(t.Val) == "hidden" {
947 // Skip setting framesetOK = false
954 case a.Param, a.Source, a.Track:
957 p.acknowledgeSelfClosingTag()
959 p.popUntil(buttonScope, a.P)
962 p.acknowledgeSelfClosingTag()
965 p.tok.DataAtom = a.Img
966 p.tok.Data = a.Img.String()
974 prompt := "This is a searchable index. Enter search keywords: "
975 attr := []Attribute{{Key: "name", Val: "isindex"}}
976 for _, t := range p.tok.Attr {
981 // Ignore the attribute.
985 attr = append(attr, t)
988 p.acknowledgeSelfClosingTag()
989 p.popUntil(buttonScope, a.P)
990 p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
992 // NOTE: The 'isindex' element has been removed,
993 // and the 'template' element has not been designed to be
994 // collaborative with the 'isindex' element.
1000 p.form.Attr = []Attribute{{Key: "action", Val: action}}
1002 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
1003 p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
1008 Data: a.Input.String(),
1012 p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
1013 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
1014 p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
1018 p.framesetOK = false
1021 p.popUntil(buttonScope, a.P)
1022 p.reconstructActiveFormattingElements()
1023 p.framesetOK = false
1028 p.framesetOK = false
1032 case a.Noembed, a.Noscript:
1037 p.reconstructActiveFormattingElements()
1039 p.framesetOK = false
1042 case a.Optgroup, a.Option:
1043 if p.top().DataAtom == a.Option {
1046 p.reconstructActiveFormattingElements()
1049 if p.elementInScope(defaultScope, a.Ruby) {
1050 p.generateImpliedEndTags()
1054 if p.elementInScope(defaultScope, a.Ruby) {
1055 p.generateImpliedEndTags("rtc")
1059 p.reconstructActiveFormattingElements()
1060 if p.tok.DataAtom == a.Math {
1061 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
1063 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
1065 adjustForeignAttributes(p.tok.Attr)
1067 p.top().Namespace = p.tok.Data
1068 if p.hasSelfClosingToken {
1070 p.acknowledgeSelfClosingTag()
1073 case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1074 // Ignore the token.
1076 p.reconstructActiveFormattingElements()
1080 switch p.tok.DataAtom {
1082 if p.elementInScope(defaultScope, a.Body) {
1086 if p.elementInScope(defaultScope, a.Body) {
1087 p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
1091 case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
1092 p.popUntil(defaultScope, p.tok.DataAtom)
1094 if p.oe.contains(a.Template) {
1095 i := p.indexOfElementInScope(defaultScope, a.Form)
1097 // Ignore the token.
1100 p.generateImpliedEndTags()
1101 if p.oe[i].DataAtom != a.Form {
1102 // Ignore the token.
1105 p.popUntil(defaultScope, a.Form)
1109 i := p.indexOfElementInScope(defaultScope, a.Form)
1110 if node == nil || i == -1 || p.oe[i] != node {
1111 // Ignore the token.
1114 p.generateImpliedEndTags()
1118 if !p.elementInScope(buttonScope, a.P) {
1119 p.parseImpliedToken(StartTagToken, a.P, a.P.String())
1121 p.popUntil(buttonScope, a.P)
1123 p.popUntil(listItemScope, a.Li)
1125 p.popUntil(defaultScope, p.tok.DataAtom)
1126 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
1127 p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
1128 case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1129 p.inBodyEndTagFormatting(p.tok.DataAtom)
1130 case a.Applet, a.Marquee, a.Object:
1131 if p.popUntil(defaultScope, p.tok.DataAtom) {
1132 p.clearActiveFormattingElements()
1135 p.tok.Type = StartTagToken
1140 p.inBodyEndTagOther(p.tok.DataAtom)
1148 // TODO: remove this divergence from the HTML5 spec.
1149 if len(p.templateStack) > 0 {
1153 for _, e := range p.oe {
1155 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
1156 a.Thead, a.Tr, a.Body, a.Html:
1167 func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
1168 // This is the "adoption agency" algorithm, described at
1169 // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
1171 // TODO: this is a fairly literal line-by-line translation of that algorithm.
1172 // Once the code successfully parses the comprehensive test suite, we should
1173 // refactor this code to be more idiomatic.
1175 // Steps 1-4. The outer loop.
1176 for i := 0; i < 8; i++ {
1177 // Step 5. Find the formatting element.
1178 var formattingElement *Node
1179 for j := len(p.afe) - 1; j >= 0; j-- {
1180 if p.afe[j].Type == scopeMarkerNode {
1183 if p.afe[j].DataAtom == tagAtom {
1184 formattingElement = p.afe[j]
1188 if formattingElement == nil {
1189 p.inBodyEndTagOther(tagAtom)
1192 feIndex := p.oe.index(formattingElement)
1194 p.afe.remove(formattingElement)
1197 if !p.elementInScope(defaultScope, tagAtom) {
1202 // Steps 9-10. Find the furthest block.
1203 var furthestBlock *Node
1204 for _, e := range p.oe[feIndex:] {
1205 if isSpecialElement(e) {
1210 if furthestBlock == nil {
1212 for e != formattingElement {
1219 // Steps 11-12. Find the common ancestor and bookmark node.
1220 commonAncestor := p.oe[feIndex-1]
1221 bookmark := p.afe.index(formattingElement)
1223 // Step 13. The inner loop. Find the lastNode to reparent.
1224 lastNode := furthestBlock
1225 node := furthestBlock
1226 x := p.oe.index(node)
1228 for j := 0; j < 3; j++ {
1232 // Step 13.4 - 13.5.
1233 if p.afe.index(node) == -1 {
1238 if node == formattingElement {
1242 clone := node.clone()
1243 p.afe[p.afe.index(node)] = clone
1244 p.oe[p.oe.index(node)] = clone
1247 if lastNode == furthestBlock {
1248 bookmark = p.afe.index(node) + 1
1251 if lastNode.Parent != nil {
1252 lastNode.Parent.RemoveChild(lastNode)
1254 node.AppendChild(lastNode)
1259 // Step 14. Reparent lastNode to the common ancestor,
1260 // or for misnested table nodes, to the foster parent.
1261 if lastNode.Parent != nil {
1262 lastNode.Parent.RemoveChild(lastNode)
1264 switch commonAncestor.DataAtom {
1265 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1266 p.fosterParent(lastNode)
1268 // TODO: remove namespace checking
1269 if commonAncestor.Namespace == "html" {
1270 commonAncestor = commonAncestor.LastChild
1274 commonAncestor.AppendChild(lastNode)
1277 // Steps 15-17. Reparent nodes from the furthest block's children
1278 // to a clone of the formatting element.
1279 clone := formattingElement.clone()
1280 reparentChildren(clone, furthestBlock)
1281 furthestBlock.AppendChild(clone)
1283 // Step 18. Fix up the list of active formatting elements.
1284 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
1285 // Move the bookmark with the rest of the list.
1288 p.afe.remove(formattingElement)
1289 p.afe.insert(bookmark, clone)
1291 // Step 19. Fix up the stack of open elements.
1292 p.oe.remove(formattingElement)
1293 p.oe.insert(p.oe.index(furthestBlock)+1, clone)
1297 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
1298 // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
1299 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
1300 func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
1301 for i := len(p.oe) - 1; i >= 0; i-- {
1302 if p.oe[i].DataAtom == tagAtom {
1306 if isSpecialElement(p.oe[i]) {
1312 // Section 12.2.6.4.8.
1313 func textIM(p *parser) bool {
1319 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
1320 // Ignore a newline at the start of a <textarea> block.
1321 if d != "" && d[0] == '\r' {
1324 if d != "" && d[0] == '\n' {
1338 return p.tok.Type == EndTagToken
1341 // Section 12.2.6.4.9.
1342 func inTableIM(p *parser) bool {
1345 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
1346 switch p.oe.top().DataAtom {
1347 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1348 if strings.Trim(p.tok.Data, whitespace) == "" {
1349 p.addText(p.tok.Data)
1354 switch p.tok.DataAtom {
1356 p.clearStackToContext(tableScope)
1357 p.afe = append(p.afe, &scopeMarker)
1362 p.clearStackToContext(tableScope)
1364 p.im = inColumnGroupIM
1367 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
1369 case a.Tbody, a.Tfoot, a.Thead:
1370 p.clearStackToContext(tableScope)
1372 p.im = inTableBodyIM
1374 case a.Td, a.Th, a.Tr:
1375 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
1378 if p.popUntil(tableScope, a.Table) {
1379 p.resetInsertionMode()
1382 // Ignore the token.
1384 case a.Style, a.Script, a.Template:
1387 for _, t := range p.tok.Attr {
1388 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
1394 // Otherwise drop down to the default action.
1396 if p.oe.contains(a.Template) || p.form != nil {
1397 // Ignore the token.
1403 p.reconstructActiveFormattingElements()
1404 switch p.top().DataAtom {
1405 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1406 p.fosterParenting = true
1409 p.fosterParenting = false
1410 p.framesetOK = false
1411 p.im = inSelectInTableIM
1415 switch p.tok.DataAtom {
1417 if p.popUntil(tableScope, a.Table) {
1418 p.resetInsertionMode()
1421 // Ignore the token.
1423 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1424 // Ignore the token.
1436 // Ignore the token.
1442 p.fosterParenting = true
1443 defer func() { p.fosterParenting = false }()
1448 // Section 12.2.6.4.11.
1449 func inCaptionIM(p *parser) bool {
1452 switch p.tok.DataAtom {
1453 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
1454 if p.popUntil(tableScope, a.Caption) {
1455 p.clearActiveFormattingElements()
1459 // Ignore the token.
1463 p.reconstructActiveFormattingElements()
1465 p.framesetOK = false
1466 p.im = inSelectInTableIM
1470 switch p.tok.DataAtom {
1472 if p.popUntil(tableScope, a.Caption) {
1473 p.clearActiveFormattingElements()
1478 if p.popUntil(tableScope, a.Caption) {
1479 p.clearActiveFormattingElements()
1483 // Ignore the token.
1486 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1487 // Ignore the token.
1494 // Section 12.2.6.4.12.
1495 func inColumnGroupIM(p *parser) bool {
1498 s := strings.TrimLeft(p.tok.Data, whitespace)
1499 if len(s) < len(p.tok.Data) {
1500 // Add the initial whitespace to the current node.
1501 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
1514 // Ignore the token.
1517 switch p.tok.DataAtom {
1523 p.acknowledgeSelfClosingTag()
1529 switch p.tok.DataAtom {
1531 if p.oe.top().DataAtom == a.Colgroup {
1537 // Ignore the token.
1545 if p.oe.top().DataAtom != a.Colgroup {
1553 // Section 12.2.6.4.13.
1554 func inTableBodyIM(p *parser) bool {
1557 switch p.tok.DataAtom {
1559 p.clearStackToContext(tableBodyScope)
1564 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
1566 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1567 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1571 // Ignore the token.
1575 switch p.tok.DataAtom {
1576 case a.Tbody, a.Tfoot, a.Thead:
1577 if p.elementInScope(tableScope, p.tok.DataAtom) {
1578 p.clearStackToContext(tableBodyScope)
1584 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1588 // Ignore the token.
1590 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
1591 // Ignore the token.
1605 // Section 12.2.6.4.14.
1606 func inRowIM(p *parser) bool {
1609 switch p.tok.DataAtom {
1611 p.clearStackToContext(tableRowScope)
1613 p.afe = append(p.afe, &scopeMarker)
1616 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1617 if p.popUntil(tableScope, a.Tr) {
1618 p.im = inTableBodyIM
1621 // Ignore the token.
1625 switch p.tok.DataAtom {
1627 if p.popUntil(tableScope, a.Tr) {
1628 p.im = inTableBodyIM
1631 // Ignore the token.
1634 if p.popUntil(tableScope, a.Tr) {
1635 p.im = inTableBodyIM
1638 // Ignore the token.
1640 case a.Tbody, a.Tfoot, a.Thead:
1641 if p.elementInScope(tableScope, p.tok.DataAtom) {
1642 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String())
1645 // Ignore the token.
1647 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
1648 // Ignore the token.
1656 // Section 12.2.6.4.15.
1657 func inCellIM(p *parser) bool {
1660 switch p.tok.DataAtom {
1661 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1662 if p.popUntil(tableScope, a.Td, a.Th) {
1663 // Close the cell and reprocess.
1664 p.clearActiveFormattingElements()
1668 // Ignore the token.
1671 p.reconstructActiveFormattingElements()
1673 p.framesetOK = false
1674 p.im = inSelectInTableIM
1678 switch p.tok.DataAtom {
1680 if !p.popUntil(tableScope, p.tok.DataAtom) {
1681 // Ignore the token.
1684 p.clearActiveFormattingElements()
1687 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
1688 // Ignore the token.
1690 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1691 if !p.elementInScope(tableScope, p.tok.DataAtom) {
1692 // Ignore the token.
1695 // Close the cell and reprocess.
1696 p.popUntil(tableScope, a.Td, a.Th)
1697 p.clearActiveFormattingElements()
1705 // Section 12.2.6.4.16.
1706 func inSelectIM(p *parser) bool {
1709 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
1711 switch p.tok.DataAtom {
1715 if p.top().DataAtom == a.Option {
1720 if p.top().DataAtom == a.Option {
1723 if p.top().DataAtom == a.Optgroup {
1728 p.tok.Type = EndTagToken
1730 case a.Input, a.Keygen, a.Textarea:
1731 if p.elementInScope(selectScope, a.Select) {
1732 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1735 // In order to properly ignore <textarea>, we need to change the tokenizer mode.
1736 p.tokenizer.NextIsNotRawText()
1737 // Ignore the token.
1739 case a.Script, a.Template:
1743 switch p.tok.DataAtom {
1745 if p.top().DataAtom == a.Option {
1750 if p.oe[i].DataAtom == a.Option {
1753 if p.oe[i].DataAtom == a.Optgroup {
1757 if p.popUntil(selectScope, a.Select) {
1758 p.resetInsertionMode()
1769 // Ignore the token.
1778 // Section 12.2.6.4.17.
1779 func inSelectInTableIM(p *parser) bool {
1781 case StartTagToken, EndTagToken:
1782 switch p.tok.DataAtom {
1783 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
1784 if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.DataAtom) {
1785 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1788 // Ignore the token.
1793 return inSelectIM(p)
1796 // Section 12.2.6.4.18.
1797 func inTemplateIM(p *parser) bool {
1799 case TextToken, CommentToken, DoctypeToken:
1802 switch p.tok.DataAtom {
1803 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
1805 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1806 p.templateStack.pop()
1807 p.templateStack = append(p.templateStack, inTableIM)
1811 p.templateStack.pop()
1812 p.templateStack = append(p.templateStack, inColumnGroupIM)
1813 p.im = inColumnGroupIM
1816 p.templateStack.pop()
1817 p.templateStack = append(p.templateStack, inTableBodyIM)
1818 p.im = inTableBodyIM
1821 p.templateStack.pop()
1822 p.templateStack = append(p.templateStack, inRowIM)
1826 p.templateStack.pop()
1827 p.templateStack = append(p.templateStack, inBodyIM)
1832 switch p.tok.DataAtom {
1836 // Ignore the token.
1840 if !p.oe.contains(a.Template) {
1841 // Ignore the token.
1844 // TODO: remove this divergence from the HTML5 spec.
1846 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
1847 p.generateImpliedEndTags()
1848 for i := len(p.oe) - 1; i >= 0; i-- {
1849 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
1854 p.clearActiveFormattingElements()
1855 p.templateStack.pop()
1856 p.resetInsertionMode()
1862 // Section 12.2.6.4.19.
1863 func afterBodyIM(p *parser) bool {
1869 s := strings.TrimLeft(p.tok.Data, whitespace)
1871 // It was all whitespace.
1875 if p.tok.DataAtom == a.Html {
1879 if p.tok.DataAtom == a.Html {
1881 p.im = afterAfterBodyIM
1886 // The comment is attached to the <html> element.
1887 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
1888 panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
1890 p.oe[0].AppendChild(&Node{
1900 // Section 12.2.6.4.20.
// inFramesetIM is the insertion-mode handler named for frameset content.
// Text is filtered through strings.Map so that only whitespace characters
// are considered, and tagged tokens are dispatched on p.tok.DataAtom. One
// path inspects the top of the stack of open elements and can switch the
// parser to afterFramesetIM.
1901 func inFramesetIM(p *parser) bool {
1909 // Ignore all text but whitespace.
1910 s := strings.Map(func(c rune) rune {
// The whitespace set here is space, tab, line feed, form feed and
// carriage return.
1912 case ' ', '\t', '\n', '\f', '\r':
1921 switch p.tok.DataAtom {
1929 p.acknowledgeSelfClosingTag()
1934 switch p.tok.DataAtom {
1936 if p.oe.top().DataAtom != a.Html {
1938 if p.oe.top().DataAtom != a.Frameset {
1939 p.im = afterFramesetIM
1945 // Ignore the token.
1950 // Section 12.2.6.4.21.
// afterFramesetIM is the insertion-mode handler named for the state after
// a frameset. Text is filtered through strings.Map so that only whitespace
// characters are considered, and tagged tokens are dispatched on
// p.tok.DataAtom. One path switches the parser to afterAfterFramesetIM.
1951 func afterFramesetIM(p *parser) bool {
1959 // Ignore all text but whitespace.
1960 s := strings.Map(func(c rune) rune {
// The whitespace set here is space, tab, line feed, form feed and
// carriage return.
1962 case ' ', '\t', '\n', '\f', '\r':
1971 switch p.tok.DataAtom {
1978 switch p.tok.DataAtom {
1980 p.im = afterAfterFramesetIM
1984 // Ignore the token.
1989 // Section 12.2.6.4.22.
// afterAfterBodyIM is the insertion-mode handler that afterBodyIM can hand
// over to. Text data is checked for being all whitespace by trimming leading
// whitespace, tokens whose tag is html get special handling, and one path
// appends a new node directly to the document root p.doc.
1990 func afterAfterBodyIM(p *parser) bool {
1996 s := strings.TrimLeft(p.tok.Data, whitespace)
1998 // It was all whitespace.
2002 if p.tok.DataAtom == a.Html {
2006 p.doc.AppendChild(&Node{
2018 // Section 12.2.6.4.23.
// afterAfterFramesetIM is the insertion-mode handler that afterFramesetIM
// can hand over to. One path appends a new node directly to the document
// root p.doc, text is filtered through strings.Map so that only whitespace
// characters are considered, and tagged tokens are dispatched on
// p.tok.DataAtom.
2019 func afterAfterFramesetIM(p *parser) bool {
2022 p.doc.AppendChild(&Node{
2027 // Ignore all text but whitespace.
2028 s := strings.Map(func(c rune) rune {
// The whitespace set here is space, tab, line feed, form feed and
// carriage return.
2030 case ' ', '\t', '\n', '\f', '\r':
2040 switch p.tok.DataAtom {
2049 // Ignore the token.
// whitespaceOrNUL is the whitespace cutset extended with the NUL byte. It is
// used as a strings.TrimLeft cutset when deciding whether foreign-content
// text consists solely of whitespace and NUL characters.
2054 const whitespaceOrNUL = whitespace + "\x00"
// parseForeignContent handles the current token while the parser is in
// foreign (MathML or SVG) content. parseCurrentToken calls it when
// p.inForeignContent() reports true and uses the result as its consumed
// flag.
//
// Text: p.framesetOK is updated from whether the data is made up only of
// whitespace and NUL characters, NUL characters are replaced with U+FFFD,
// and the text is added to the tree.
//
// Start tags: the tag name is looked up in the breakout table (font tags
// are additionally examined for color, face and size attributes), and the
// stack of open elements is walked from the top looking for an HTML element
// or an integration point. Attribute names, and for SVG also tag names, are
// adjusted according to the namespace of the current top element. A
// namespace other than the handled ones is a parser-state bug and panics.
//
// End tags: the stack of open elements is walked from the top, comparing
// element names with the token's name case-insensitively.
2057 func parseForeignContent(p *parser) bool {
2061 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
// Replace every NUL with U+FFFD, the Unicode replacement character.
2063 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
2064 p.addText(p.tok.Data)
2071 b := breakout[p.tok.Data]
2072 if p.tok.DataAtom == a.Font {
2074 for _, attr := range p.tok.Attr {
2076 case "color", "face", "size":
2083 for i := len(p.oe) - 1; i >= 0; i-- {
2085 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
2092 switch p.top().Namespace {
2094 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
2096 // Adjust SVG tag names. The tokenizer lower-cases tag names, but
2097 // SVG wants e.g. "foreignObject" with a capital second "O".
2098 if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
2099 p.tok.DataAtom = a.Lookup([]byte(x))
2102 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
2104 panic("html: bad parser state: unexpected namespace")
2106 adjustForeignAttributes(p.tok.Attr)
// The namespace of the current top element is captured and then assigned
// to p.top() again. NOTE(review): presumably a new element is pushed in
// between so that it inherits its parent's namespace; confirm.
2107 namespace := p.top().Namespace
2109 p.top().Namespace = namespace
2110 if namespace != "" {
2111 // Don't let the tokenizer go into raw text mode in foreign content
2112 // (e.g. in an SVG <title> tag).
2113 p.tokenizer.NextIsNotRawText()
2115 if p.hasSelfClosingToken {
2117 p.acknowledgeSelfClosingTag()
2120 for i := len(p.oe) - 1; i >= 0; i-- {
2121 if p.oe[i].Namespace == "" {
// Names are matched without regard to case.
2124 if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
2131 // Ignore the token.
// inForeignContent reports whether the current token is to be handled by
// parseForeignContent; parseCurrentToken consults it for every token. The
// decision is based on the topmost open element n and on the type and tag
// of p.tok. The conditions examined, in order, are:
//   - n is in the empty namespace;
//   - n is a MathML text integration point and the token is a start tag
//     other than mglyph/malignmark, or a text token;
//   - n is a MathML annotation-xml element and the token is an svg start tag;
//   - n is an HTML integration point and the token is a start tag or text;
//   - the token is an ErrorToken.
func_placeholder_removed_see_below
2165 // parseImpliedToken parses a token as though it had appeared in the parser's
// input. The real current token and its self-closing flag are saved first
// and restored once the implied token has been run through
// parseCurrentToken, so the caller's token state is left untouched.
2167 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
2168 realToken, selfClosing := p.tok, p.hasSelfClosingToken
// The implied token starts with the self-closing flag cleared.
2174 p.hasSelfClosingToken = false
2175 p.parseCurrentToken()
2176 p.tok, p.hasSelfClosingToken = realToken, selfClosing
2179 // parseCurrentToken runs the current token through the parsing routines
2180 // until it is consumed.
2181 func (p *parser) parseCurrentToken() {
// A self-closing tag is processed as a start tag, with
// hasSelfClosingToken recording that it was self-closing.
2182 if p.tok.Type == SelfClosingTagToken {
2183 p.hasSelfClosingToken = true
2184 p.tok.Type = StartTagToken
// Tokens in foreign content go to parseForeignContent, whose result
// becomes the consumed flag.
2189 if p.inForeignContent() {
2190 consumed = parseForeignContent(p)
// Clear a self-closing flag that is still set after processing.
2196 if p.hasSelfClosingToken {
2197 // This is a parse error, but ignore it.
2198 p.hasSelfClosingToken = false
// parse drives the parser: it repeatedly reads the next token from
// p.tokenizer into p.tok and passes it to parseCurrentToken. Before each
// read it tells the tokenizer whether CDATA sections are allowed. A
// tokenizer error other than io.EOF is singled out for an early return.
2202 func (p *parser) parse() error {
2203 // Iterate until EOF. Any other error will cause an early return.
2206 // CDATA sections are allowed only in foreign content.
// CDATA is enabled only when n is non-nil and has a non-empty namespace.
2208 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
2209 // Read and parse the next token.
2211 p.tok = p.tokenizer.Token()
2212 if p.tok.Type == ErrorToken {
2213 err = p.tokenizer.Err()
2214 if err != nil && err != io.EOF {
2218 p.parseCurrentToken()
2223 // Parse returns the parse tree for the HTML from the given Reader.
2225 // It implements the HTML5 parsing algorithm
2226 // (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
2227 // which is very complicated. The resultant tree can contain implicitly created
2228 // nodes that have no explicit <tag> listed in r's data, and nodes' parents can
2229 // differ from the nesting implied by a naive processing of start and end
2230 // <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
2231 // with no corresponding node in the resulting tree.
2233 // The input is assumed to be UTF-8 encoded.
2234 func Parse(r io.Reader) (*Node, error) {
// The parser's tokenizer is built directly on top of r.
2236 tokenizer: NewTokenizer(r),
2251 // ParseFragment parses a fragment of HTML and returns the nodes that were
2252 // found. If the fragment is the InnerHTML for an existing element, pass that
2253 // element in context.
2255 // It has the same intricacies as Parse.
2256 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
2259 if context.Type != ElementNode {
2260 return nil, errors.New("html: ParseFragment of non-element Node")
2262 // The next check isn't just context.DataAtom.String() == context.Data because
2263 // it is valid to pass an element whose tag isn't a known atom. For example,
2264 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
2265 if context.DataAtom != a.Lookup([]byte(context.Data)) {
2266 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
2268 contextTag = context.DataAtom.String()
2271 tokenizer: NewTokenizerFragment(r, contextTag),
2283 Data: a.Html.String(),
2285 p.doc.AppendChild(root)
2286 p.oe = nodeStack{root}
2287 if context != nil && context.DataAtom == a.Template {
2288 p.templateStack = append(p.templateStack, inTemplateIM)
2290 p.resetInsertionMode()
2292 for n := context; n != nil; n = n.Parent {
2293 if n.Type == ElementNode && n.DataAtom == a.Form {
2310 for c := parent.FirstChild; c != nil; {
2311 next := c.NextSibling
2312 parent.RemoveChild(c)
2313 result = append(result, c)