Browse Source

html: implement generic raw text element parsing algorithm

See: https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text

This follows up on golang.org/cl/205617

Change-Id: Id99054bc25e9ea90bb3f03b15c14c13573520997
Reviewed-on: https://go-review.googlesource.com/c/net/+/210318
Run-TryBot: Kunpei Sakai <namusyaka@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Kunpei Sakai 6 years ago
parent
commit
1ddd1de85c
5 changed files with 332 additions and 150 deletions
  1. 33 15
      html/parse.go
  2. 2 1
      html/parse_test.go
  3. 197 116
      html/testdata/webkit/tests16.dat
  4. 37 18
      html/testdata/webkit/tests5.dat
  5. 63 0
      html/testdata/webkit/webkit02.dat

+ 33 - 15
html/parse.go

@@ -184,6 +184,17 @@ func (p *parser) clearStackToContext(s scope) {
 	}
 }
 
+// parseGenericRawTextElement implements the generic raw text element parsing
+// algorithm defined in 12.2.6.2.
+// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
+// TODO: The spec treats both the RAWTEXT and RCDATA states as part of the
+// tokenizer, so the tokenizer needs to be made aware of both states.
+func (p *parser) parseGenericRawTextElement() {
+	p.addElement()
+	p.originalIM = p.im
+	p.im = textIM
+}
+
 // generateImpliedEndTags pops nodes off the stack of open elements as long as
 // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
 // If exceptions are specified, nodes with that name will not be popped off.
@@ -631,19 +642,23 @@ func inHeadIM(p *parser) bool {
 			p.acknowledgeSelfClosingTag()
 			return true
 		case a.Noscript:
-			p.addElement()
 			if p.scripting {
-				p.setOriginalIM()
-				p.im = textIM
-			} else {
-				p.im = inHeadNoscriptIM
+				p.parseGenericRawTextElement()
+				return true
 			}
+			p.addElement()
+			p.im = inHeadNoscriptIM
+			// Don't let the tokenizer go into raw text mode when scripting is disabled.
+			p.tokenizer.NextIsNotRawText()
 			return true
-		case a.Script, a.Title, a.Noframes, a.Style:
+		case a.Script, a.Title:
 			p.addElement()
 			p.setOriginalIM()
 			p.im = textIM
 			return true
+		case a.Noframes, a.Style:
+			p.parseGenericRawTextElement()
+			return true
 		case a.Head:
 			// Ignore the token.
 			return true
@@ -1023,18 +1038,21 @@ func inBodyIM(p *parser) bool {
 			p.popUntil(buttonScope, a.P)
 			p.reconstructActiveFormattingElements()
 			p.framesetOK = false
-			p.addElement()
-			p.setOriginalIM()
-			p.im = textIM
+			p.parseGenericRawTextElement()
 		case a.Iframe:
 			p.framesetOK = false
+			p.parseGenericRawTextElement()
+		case a.Noembed:
+			p.parseGenericRawTextElement()
+		case a.Noscript:
+			if p.scripting {
+				p.parseGenericRawTextElement()
+				return true
+			}
+			p.reconstructActiveFormattingElements()
 			p.addElement()
-			p.setOriginalIM()
-			p.im = textIM
-		case a.Noembed, a.Noscript:
-			p.addElement()
-			p.setOriginalIM()
-			p.im = textIM
+			// Don't let the tokenizer go into raw text mode when scripting is disabled.
+			p.tokenizer.NextIsNotRawText()
 		case a.Select:
 			p.reconstructActiveFormattingElements()
 			p.addElement()

+ 2 - 1
html/parse_test.go

@@ -272,7 +272,8 @@ func TestParserWithoutScripting(t *testing.T) {
 |   <head>
 |     <noscript>
 |   <body>
-|     "<img src='https://golang.org/doc/gopher/frontpage.png' />"
+|     <img>
+|       src="https://golang.org/doc/gopher/frontpage.png"
 |     <p>
 |       <img>
 |         src="https://golang.org/doc/gopher/doc.png"

File diff suppressed because it is too large
+ 197 - 116
html/testdata/webkit/tests16.dat


+ 37 - 18
html/testdata/webkit/tests5.dat

@@ -1,8 +1,7 @@
 #data
 <style> <!-- </style>x
 #errors
-Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
-Line: 1 Col: 22 Unexpected end of file. Expected end tag (style).
+(1,7): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -14,7 +13,8 @@ Line: 1 Col: 22 Unexpected end of file. Expected end tag (style).
 #data
 <style> <!-- </style> --> </style>x
 #errors
-Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
+(1,7): expected-doctype-but-got-start-tag
+(1,34): unexpected-end-tag
 #document
 | <html>
 |   <head>
@@ -27,7 +27,7 @@ Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #data
 <style> <!--> </style>x
 #errors
-Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
+(1,7): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -39,7 +39,7 @@ Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #data
 <style> <!---> </style>x
 #errors
-Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
+(1,7): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -51,7 +51,7 @@ Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #data
 <iframe> <!---> </iframe>x
 #errors
-Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE.
+(1,8): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -63,7 +63,9 @@ Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE.
 #data
 <iframe> <!--- </iframe>->x</iframe> --> </iframe>x
 #errors
-Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE.
+(1,8): expected-doctype-but-got-start-tag
+(1,36): unexpected-end-tag
+(1,50): unexpected-end-tag
 #document
 | <html>
 |   <head>
@@ -75,7 +77,8 @@ Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE.
 #data
 <script> <!-- </script> --> </script>x
 #errors
-Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE.
+(1,8): expected-doctype-but-got-start-tag
+(1,37): unexpected-end-tag
 #document
 | <html>
 |   <head>
@@ -88,7 +91,8 @@ Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE.
 #data
 <title> <!-- </title> --> </title>x
 #errors
-Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
+(1,7): expected-doctype-but-got-start-tag
+(1,34): unexpected-end-tag
 #document
 | <html>
 |   <head>
@@ -101,7 +105,9 @@ Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 #data
 <textarea> <!--- </textarea>->x</textarea> --> </textarea>x
 #errors
-Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE.
+(1,10): expected-doctype-but-got-start-tag
+(1,42): unexpected-end-tag
+(1,58): unexpected-end-tag
 #document
 | <html>
 |   <head>
@@ -113,7 +119,7 @@ Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE.
 #data
 <style> <!</-- </style>x
 #errors
-Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
+(1,7): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -125,7 +131,7 @@ Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #data
 <p><xmp></xmp>
 #errors
-XXX: Unknown
+(1,3): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -136,7 +142,7 @@ XXX: Unknown
 #data
 <xmp> <!-- > --> </xmp>
 #errors
-Line: 1 Col: 5 Unexpected start tag (xmp). Expected DOCTYPE.
+(1,5): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -147,7 +153,7 @@ Line: 1 Col: 5 Unexpected start tag (xmp). Expected DOCTYPE.
 #data
 <title>&amp;</title>
 #errors
-Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
+(1,7): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -158,7 +164,7 @@ Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 #data
 <title><!--&amp;--></title>
 #errors
-Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
+(1,7): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -169,8 +175,7 @@ Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 #data
 <title><!--</title>
 #errors
-Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
-Line: 1 Col: 19 Unexpected end of file. Expected end tag (title).
+(1,7): expected-doctype-but-got-start-tag
 #document
 | <html>
 |   <head>
@@ -181,7 +186,9 @@ Line: 1 Col: 19 Unexpected end of file. Expected end tag (title).
 #data
 <noscript><!--</noscript>--></noscript>
 #errors
-Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE.
+(1,10): expected-doctype-but-got-start-tag
+(1,39): unexpected-end-tag
+#script-on
 #document
 | <html>
 |   <head>
@@ -189,3 +196,15 @@ Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE.
 |       "<!--"
 |   <body>
 |     "-->"
+
+#data
+<noscript><!--</noscript>--></noscript>
+#errors
+(1,10): expected-doctype-but-got-start-tag
+#script-off
+#document
+| <html>
+|   <head>
+|     <noscript>
+|       <!-- </noscript> -->
+|   <body>

+ 63 - 0
html/testdata/webkit/webkit02.dat

@@ -26,6 +26,23 @@
 |       <span>
 |         "B"
 
+#data
+<p id="status"><noscript><strong>A</strong></noscript><span>B</span></p>
+#errors
+(1,15): expected-doctype-but-got-start-tag
+#script-off
+#document
+| <html>
+|   <head>
+|   <body>
+|     <p>
+|       id="status"
+|       <noscript>
+|         <strong>
+|           "A"
+|       <span>
+|         "B"
+
 #data
 <div><sarcasm><div></div></sarcasm></div>
 #errors
@@ -136,6 +153,52 @@
 |     <input>
 |     <table>
 
+#data
+<b><em><foo><foo><aside></b>
+#errors
+#document
+| <html>
+|   <head>
+|   <body>
+|     <b>
+|       <em>
+|         <foo>
+|           <foo>
+|     <em>
+|       <aside>
+|         <b>
+
+#data
+<b><em><foo><foo><aside></b></em>
+#errors
+#document
+| <html>
+|   <head>
+|   <body>
+|     <b>
+|       <em>
+|         <foo>
+|           <foo>
+|     <em>
+|     <aside>
+|       <em>
+|         <b>
+
+#data
+<b><em><foo><foo><foo><aside></b>
+#errors
+#document
+| <html>
+|   <head>
+|   <body>
+|     <b>
+|       <em>
+|         <foo>
+|           <foo>
+|             <foo>
+|     <aside>
+|       <b>
+
 #data
 <b><em><dcell><postfield><postfield><postfield><postfield><missing_glyph><missing_glyph><missing_glyph><missing_glyph><hkern><aside></b></em>
 #errors

Some files were not shown because too many files changed in this diff