Ver Fonte

html: fix parsing where nested tags of unknown types inadvertently close one another

The existing implementation behaves differently from all major browsers in the case where a self-closing element of an unknown tag type is the child of another element of an unknown tag type. The issue appears to be that nested tags of differing unknown types all have an atom value of 0, so `inBodyEndTagOther` incorrectly matches them to one another.

Fixes golang/go#30961

Change-Id: I62b0aa49c027c8432df7d077ffba135201b3b786
GitHub-Last-Rev: fb25181f9ae5ab9e74d0053cd322d507902b9054
GitHub-Pull-Request: golang/net#37
Reviewed-on: https://go-review.googlesource.com/c/net/+/168638
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Tom Anthony há 6 anos atrás
pai
commit
e3b2ff56ed
2 ficheiros alterados com 28 adições e 8 exclusões
  1. 16 8
      html/parse.go
  2. 12 0
      html/testdata/go/template.dat

+ 16 - 8
html/parse.go

@@ -901,7 +901,7 @@ func inBodyIM(p *parser) bool {
 		case a.A:
 		case a.A:
 			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
 			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
 				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
 				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
-					p.inBodyEndTagFormatting(a.A)
+					p.inBodyEndTagFormatting(a.A, "a")
 					p.oe.remove(n)
 					p.oe.remove(n)
 					p.afe.remove(n)
 					p.afe.remove(n)
 					break
 					break
@@ -915,7 +915,7 @@ func inBodyIM(p *parser) bool {
 		case a.Nobr:
 		case a.Nobr:
 			p.reconstructActiveFormattingElements()
 			p.reconstructActiveFormattingElements()
 			if p.elementInScope(defaultScope, a.Nobr) {
 			if p.elementInScope(defaultScope, a.Nobr) {
-				p.inBodyEndTagFormatting(a.Nobr)
+				p.inBodyEndTagFormatting(a.Nobr, "nobr")
 				p.reconstructActiveFormattingElements()
 				p.reconstructActiveFormattingElements()
 			}
 			}
 			p.addFormattingElement()
 			p.addFormattingElement()
@@ -1123,7 +1123,7 @@ func inBodyIM(p *parser) bool {
 		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
 		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
 			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
 			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
 		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
 		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
-			p.inBodyEndTagFormatting(p.tok.DataAtom)
+			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
 		case a.Applet, a.Marquee, a.Object:
 		case a.Applet, a.Marquee, a.Object:
 			if p.popUntil(defaultScope, p.tok.DataAtom) {
 			if p.popUntil(defaultScope, p.tok.DataAtom) {
 				p.clearActiveFormattingElements()
 				p.clearActiveFormattingElements()
@@ -1134,7 +1134,7 @@ func inBodyIM(p *parser) bool {
 		case a.Template:
 		case a.Template:
 			return inHeadIM(p)
 			return inHeadIM(p)
 		default:
 		default:
-			p.inBodyEndTagOther(p.tok.DataAtom)
+			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
 		}
 		}
 	case CommentToken:
 	case CommentToken:
 		p.addChild(&Node{
 		p.addChild(&Node{
@@ -1161,7 +1161,7 @@ func inBodyIM(p *parser) bool {
 	return true
 	return true
 }
 }
 
 
-func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
+func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
 	// This is the "adoption agency" algorithm, described at
 	// This is the "adoption agency" algorithm, described at
 	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
 	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
 
 
@@ -1183,7 +1183,7 @@ func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
 			}
 			}
 		}
 		}
 		if formattingElement == nil {
 		if formattingElement == nil {
-			p.inBodyEndTagOther(tagAtom)
+			p.inBodyEndTagOther(tagAtom, tagName)
 			return
 			return
 		}
 		}
 		feIndex := p.oe.index(formattingElement)
 		feIndex := p.oe.index(formattingElement)
@@ -1288,9 +1288,17 @@ func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
 // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
 // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
-func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
+func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
 	for i := len(p.oe) - 1; i >= 0; i-- {
 	for i := len(p.oe) - 1; i >= 0; i-- {
-		if p.oe[i].DataAtom == tagAtom {
+		// Two element nodes have the same tag if they have the same Data (a
+		// string-typed field). As an optimization, for common HTML tags, each
+		// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
+		// field), since integer comparison is faster than string comparison.
+		// Uncommon (custom) tags get a zero DataAtom.
+		//
+		// The if condition here is equivalent to (p.oe[i].Data == tagName).
+		if (p.oe[i].DataAtom == tagAtom) &&
+		    ((tagAtom != 0) || (p.oe[i].Data == tagName)) {
 			p.oe = p.oe[:i]
 			p.oe = p.oe[:i]
 			break
 			break
 		}
 		}

+ 12 - 0
html/testdata/go/template.dat

@@ -60,3 +60,15 @@
 |       <math template>
 |       <math template>
 |         <math mn>
 |         <math mn>
 |           <b>
 |           <b>
+
+#data
+<html><head></head><body><tag1><tag2 /><p></p></tag1><div></div></body></html>
+#errors
+#document
+| <html>
+|   <head>
+|   <body>
+|     <tag1>
+|       <tag2>
+|         <p>
+|     <div>