parse_test.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package html
  5. import (
  6. "bufio"
  7. "bytes"
  8. "errors"
  9. "fmt"
  10. "io"
  11. "io/ioutil"
  12. "os"
  13. "path/filepath"
  14. "runtime"
  15. "sort"
  16. "strings"
  17. "testing"
  18. "golang.org/x/net/html/atom"
  19. )
  20. type testAttrs struct {
  21. text, want, context string
  22. scripting bool
  23. }
  24. // readParseTest reads a single test case from r.
  25. func readParseTest(r *bufio.Reader) (*testAttrs, error) {
  26. ta := &testAttrs{scripting: true}
  27. line, err := r.ReadSlice('\n')
  28. if err != nil {
  29. return nil, err
  30. }
  31. var b []byte
  32. // Read the HTML.
  33. if string(line) != "#data\n" {
  34. return nil, fmt.Errorf(`got %q want "#data\n"`, line)
  35. }
  36. for {
  37. line, err = r.ReadSlice('\n')
  38. if err != nil {
  39. return nil, err
  40. }
  41. if line[0] == '#' {
  42. break
  43. }
  44. b = append(b, line...)
  45. }
  46. ta.text = strings.TrimSuffix(string(b), "\n")
  47. b = b[:0]
  48. // Skip the error list.
  49. if string(line) != "#errors\n" {
  50. return nil, fmt.Errorf(`got %q want "#errors\n"`, line)
  51. }
  52. for {
  53. line, err = r.ReadSlice('\n')
  54. if err != nil {
  55. return nil, err
  56. }
  57. if line[0] == '#' {
  58. break
  59. }
  60. }
  61. if ls := string(line); strings.HasPrefix(ls, "#script-") {
  62. switch {
  63. case strings.HasSuffix(ls, "-on\n"):
  64. ta.scripting = true
  65. case strings.HasSuffix(ls, "-off\n"):
  66. ta.scripting = false
  67. default:
  68. return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line)
  69. }
  70. for {
  71. line, err = r.ReadSlice('\n')
  72. if err != nil {
  73. return nil, err
  74. }
  75. if line[0] == '#' {
  76. break
  77. }
  78. }
  79. }
  80. if string(line) == "#document-fragment\n" {
  81. line, err = r.ReadSlice('\n')
  82. if err != nil {
  83. return nil, err
  84. }
  85. ta.context = strings.TrimSpace(string(line))
  86. line, err = r.ReadSlice('\n')
  87. if err != nil {
  88. return nil, err
  89. }
  90. }
  91. // Read the dump of what the parse tree should be.
  92. if string(line) != "#document\n" {
  93. return nil, fmt.Errorf(`got %q want "#document\n"`, line)
  94. }
  95. inQuote := false
  96. for {
  97. line, err = r.ReadSlice('\n')
  98. if err != nil && err != io.EOF {
  99. return nil, err
  100. }
  101. trimmed := bytes.Trim(line, "| \n")
  102. if len(trimmed) > 0 {
  103. if line[0] == '|' && trimmed[0] == '"' {
  104. inQuote = true
  105. }
  106. if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
  107. inQuote = false
  108. }
  109. }
  110. if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
  111. break
  112. }
  113. b = append(b, line...)
  114. }
  115. ta.want = string(b)
  116. return ta, nil
  117. }
  118. func dumpIndent(w io.Writer, level int) {
  119. io.WriteString(w, "| ")
  120. for i := 0; i < level; i++ {
  121. io.WriteString(w, " ")
  122. }
  123. }
  124. type sortedAttributes []Attribute
  125. func (a sortedAttributes) Len() int {
  126. return len(a)
  127. }
  128. func (a sortedAttributes) Less(i, j int) bool {
  129. if a[i].Namespace != a[j].Namespace {
  130. return a[i].Namespace < a[j].Namespace
  131. }
  132. return a[i].Key < a[j].Key
  133. }
  134. func (a sortedAttributes) Swap(i, j int) {
  135. a[i], a[j] = a[j], a[i]
  136. }
  137. func dumpLevel(w io.Writer, n *Node, level int) error {
  138. dumpIndent(w, level)
  139. level++
  140. switch n.Type {
  141. case ErrorNode:
  142. return errors.New("unexpected ErrorNode")
  143. case DocumentNode:
  144. return errors.New("unexpected DocumentNode")
  145. case ElementNode:
  146. if n.Namespace != "" {
  147. fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
  148. } else {
  149. fmt.Fprintf(w, "<%s>", n.Data)
  150. }
  151. attr := sortedAttributes(n.Attr)
  152. sort.Sort(attr)
  153. for _, a := range attr {
  154. io.WriteString(w, "\n")
  155. dumpIndent(w, level)
  156. if a.Namespace != "" {
  157. fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
  158. } else {
  159. fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
  160. }
  161. }
  162. if n.Namespace == "" && n.DataAtom == atom.Template {
  163. io.WriteString(w, "\n")
  164. dumpIndent(w, level)
  165. level++
  166. io.WriteString(w, "content")
  167. }
  168. case TextNode:
  169. fmt.Fprintf(w, `"%s"`, n.Data)
  170. case CommentNode:
  171. fmt.Fprintf(w, "<!-- %s -->", n.Data)
  172. case DoctypeNode:
  173. fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
  174. if n.Attr != nil {
  175. var p, s string
  176. for _, a := range n.Attr {
  177. switch a.Key {
  178. case "public":
  179. p = a.Val
  180. case "system":
  181. s = a.Val
  182. }
  183. }
  184. if p != "" || s != "" {
  185. fmt.Fprintf(w, ` "%s"`, p)
  186. fmt.Fprintf(w, ` "%s"`, s)
  187. }
  188. }
  189. io.WriteString(w, ">")
  190. case scopeMarkerNode:
  191. return errors.New("unexpected scopeMarkerNode")
  192. default:
  193. return errors.New("unknown node type")
  194. }
  195. io.WriteString(w, "\n")
  196. for c := n.FirstChild; c != nil; c = c.NextSibling {
  197. if err := dumpLevel(w, c, level); err != nil {
  198. return err
  199. }
  200. }
  201. return nil
  202. }
  203. func dump(n *Node) (string, error) {
  204. if n == nil || n.FirstChild == nil {
  205. return "", nil
  206. }
  207. var b bytes.Buffer
  208. for c := n.FirstChild; c != nil; c = c.NextSibling {
  209. if err := dumpLevel(&b, c, 0); err != nil {
  210. return "", err
  211. }
  212. }
  213. return b.String(), nil
  214. }
  215. var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
  216. func TestParser(t *testing.T) {
  217. for _, testDataDir := range testDataDirs {
  218. testFiles, err := filepath.Glob(testDataDir + "*.dat")
  219. if err != nil {
  220. t.Fatal(err)
  221. }
  222. for _, tf := range testFiles {
  223. f, err := os.Open(tf)
  224. if err != nil {
  225. t.Fatal(err)
  226. }
  227. defer f.Close()
  228. r := bufio.NewReader(f)
  229. for i := 0; ; i++ {
  230. ta, err := readParseTest(r)
  231. if err == io.EOF {
  232. break
  233. }
  234. if err != nil {
  235. t.Fatal(err)
  236. }
  237. err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
  238. if err != nil {
  239. t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
  240. }
  241. }
  242. }
  243. }
  244. }
  245. // Issue 16318
  246. func TestParserWithoutScripting(t *testing.T) {
  247. text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
  248. want := `| <html>
  249. | <head>
  250. | <noscript>
  251. | <body>
  252. | <img>
  253. | src="https://golang.org/doc/gopher/frontpage.png"
  254. | <p>
  255. | <img>
  256. | src="https://golang.org/doc/gopher/doc.png"
  257. `
  258. if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
  259. t.Errorf("test with scripting is disabled, %q, %s", text, err)
  260. }
  261. }
  262. // testParseCase tests one test case from the test files. If the test does not
  263. // pass, it returns an error that explains the failure.
  264. // text is the HTML to be parsed, want is a dump of the correct parse tree,
  265. // and context is the name of the context node, if any.
  266. func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
  267. defer func() {
  268. if x := recover(); x != nil {
  269. switch e := x.(type) {
  270. case error:
  271. err = e
  272. default:
  273. err = fmt.Errorf("%v", e)
  274. }
  275. }
  276. }()
  277. var doc *Node
  278. if context == "" {
  279. doc, err = ParseWithOptions(strings.NewReader(text), opts...)
  280. if err != nil {
  281. return err
  282. }
  283. } else {
  284. namespace := ""
  285. if i := strings.IndexByte(context, ' '); i >= 0 {
  286. namespace, context = context[:i], context[i+1:]
  287. }
  288. contextNode := &Node{
  289. Data: context,
  290. DataAtom: atom.Lookup([]byte(context)),
  291. Namespace: namespace,
  292. Type: ElementNode,
  293. }
  294. nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
  295. if err != nil {
  296. return err
  297. }
  298. doc = &Node{
  299. Type: DocumentNode,
  300. }
  301. for _, n := range nodes {
  302. doc.AppendChild(n)
  303. }
  304. }
  305. if err := checkTreeConsistency(doc); err != nil {
  306. return err
  307. }
  308. got, err := dump(doc)
  309. if err != nil {
  310. return err
  311. }
  312. // Compare the parsed tree to the #document section.
  313. if got != want {
  314. return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
  315. }
  316. if renderTestBlacklist[text] || context != "" {
  317. return nil
  318. }
  319. // Check that rendering and re-parsing results in an identical tree.
  320. pr, pw := io.Pipe()
  321. go func() {
  322. pw.CloseWithError(Render(pw, doc))
  323. }()
  324. doc1, err := ParseWithOptions(pr, opts...)
  325. if err != nil {
  326. return err
  327. }
  328. got1, err := dump(doc1)
  329. if err != nil {
  330. return err
  331. }
  332. if got != got1 {
  333. return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
  334. }
  335. return nil
  336. }
// Some test inputs result in parse trees that are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test.
//
// The map is keyed by the exact input text of the test case.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// The same thing with a <p>:
	`<p><table></p>`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`: true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`: true,
	`<template><a><table><a>`: true,
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`: true,
	`<!doctype html><table><plaintext></plaintext>`: true,
	`<!doctype html><table><tbody><plaintext></plaintext>`: true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `: true,
	`<!doctype html><script><!--<script <`: true,
	`<!doctype html><script><!--<script <a`: true,
	`<!doctype html><script><!--<script </`: true,
	`<!doctype html><script><!--<script </s`: true,
	`<!doctype html><script><!--<script </script`: true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`: true,
	`<!doctype html><script><!--<script -a`: true,
	`<!doctype html><script><!--<script -<`: true,
	`<!doctype html><script><!--<script --`: true,
	`<!doctype html><script><!--<script --a`: true,
	`<!doctype html><script><!--<script --<`: true,
	`<script><!--<script `: true,
	`<script><!--<script <a`: true,
	`<script><!--<script </script`: true,
	`<script><!--<script </scripta`: true,
	`<script><!--<script -`: true,
	`<script><!--<script -a`: true,
	`<script><!--<script --`: true,
	`<script><!--<script --a`: true,
	`<script><!--<script <`: true,
	`<script><!--<script </`: true,
	`<script><!--<script </s`: true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`: true,
	`<table><math><select><mi><select></table>`: true,
}
  392. func TestNodeConsistency(t *testing.T) {
  393. // inconsistentNode is a Node whose DataAtom and Data do not agree.
  394. inconsistentNode := &Node{
  395. Type: ElementNode,
  396. DataAtom: atom.Frameset,
  397. Data: "table",
  398. }
  399. if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
  400. t.Errorf("got nil error, want non-nil")
  401. }
  402. }
  403. func TestParseFragmentWithNilContext(t *testing.T) {
  404. // This shouldn't panic.
  405. ParseFragment(strings.NewReader("<p>hello</p>"), nil)
  406. }
  407. func BenchmarkParser(b *testing.B) {
  408. buf, err := ioutil.ReadFile("testdata/go1.html")
  409. if err != nil {
  410. b.Fatalf("could not read testdata/go1.html: %v", err)
  411. }
  412. b.SetBytes(int64(len(buf)))
  413. runtime.GC()
  414. b.ReportAllocs()
  415. b.ResetTimer()
  416. for i := 0; i < b.N; i++ {
  417. Parse(bytes.NewBuffer(buf))
  418. }
  419. }