parse_test.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package html
  5. import (
  6. "bufio"
  7. "bytes"
  8. "errors"
  9. "fmt"
  10. "io"
  11. "io/ioutil"
  12. "os"
  13. "path/filepath"
  14. "runtime"
  15. "sort"
  16. "strings"
  17. "testing"
  18. "golang.org/x/net/html/atom"
  19. )
  20. // readParseTest reads a single test case from r.
  21. func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
  22. line, err := r.ReadSlice('\n')
  23. if err != nil {
  24. return "", "", "", err
  25. }
  26. var b []byte
  27. // Read the HTML.
  28. if string(line) != "#data\n" {
  29. return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
  30. }
  31. for {
  32. line, err = r.ReadSlice('\n')
  33. if err != nil {
  34. return "", "", "", err
  35. }
  36. if line[0] == '#' {
  37. break
  38. }
  39. b = append(b, line...)
  40. }
  41. text = strings.TrimSuffix(string(b), "\n")
  42. b = b[:0]
  43. // Skip the error list.
  44. if string(line) != "#errors\n" {
  45. return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
  46. }
  47. for {
  48. line, err = r.ReadSlice('\n')
  49. if err != nil {
  50. return "", "", "", err
  51. }
  52. if line[0] == '#' {
  53. break
  54. }
  55. }
  56. if string(line) == "#document-fragment\n" {
  57. line, err = r.ReadSlice('\n')
  58. if err != nil {
  59. return "", "", "", err
  60. }
  61. context = strings.TrimSpace(string(line))
  62. line, err = r.ReadSlice('\n')
  63. if err != nil {
  64. return "", "", "", err
  65. }
  66. }
  67. // Read the dump of what the parse tree should be.
  68. if string(line) != "#document\n" {
  69. return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
  70. }
  71. inQuote := false
  72. for {
  73. line, err = r.ReadSlice('\n')
  74. if err != nil && err != io.EOF {
  75. return "", "", "", err
  76. }
  77. trimmed := bytes.Trim(line, "| \n")
  78. if len(trimmed) > 0 {
  79. if line[0] == '|' && trimmed[0] == '"' {
  80. inQuote = true
  81. }
  82. if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
  83. inQuote = false
  84. }
  85. }
  86. if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
  87. break
  88. }
  89. b = append(b, line...)
  90. }
  91. return text, string(b), context, nil
  92. }
  93. func dumpIndent(w io.Writer, level int) {
  94. io.WriteString(w, "| ")
  95. for i := 0; i < level; i++ {
  96. io.WriteString(w, " ")
  97. }
  98. }
  99. type sortedAttributes []Attribute
  100. func (a sortedAttributes) Len() int {
  101. return len(a)
  102. }
  103. func (a sortedAttributes) Less(i, j int) bool {
  104. if a[i].Namespace != a[j].Namespace {
  105. return a[i].Namespace < a[j].Namespace
  106. }
  107. return a[i].Key < a[j].Key
  108. }
  109. func (a sortedAttributes) Swap(i, j int) {
  110. a[i], a[j] = a[j], a[i]
  111. }
  112. func dumpLevel(w io.Writer, n *Node, level int) error {
  113. dumpIndent(w, level)
  114. level++
  115. switch n.Type {
  116. case ErrorNode:
  117. return errors.New("unexpected ErrorNode")
  118. case DocumentNode:
  119. return errors.New("unexpected DocumentNode")
  120. case ElementNode:
  121. if n.Namespace != "" {
  122. fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
  123. } else {
  124. fmt.Fprintf(w, "<%s>", n.Data)
  125. }
  126. attr := sortedAttributes(n.Attr)
  127. sort.Sort(attr)
  128. for _, a := range attr {
  129. io.WriteString(w, "\n")
  130. dumpIndent(w, level)
  131. if a.Namespace != "" {
  132. fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
  133. } else {
  134. fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
  135. }
  136. }
  137. if n.Namespace == "" && n.DataAtom == atom.Template {
  138. io.WriteString(w, "\n")
  139. dumpIndent(w, level)
  140. level++
  141. io.WriteString(w, "content")
  142. }
  143. case TextNode:
  144. fmt.Fprintf(w, `"%s"`, n.Data)
  145. case CommentNode:
  146. fmt.Fprintf(w, "<!-- %s -->", n.Data)
  147. case DoctypeNode:
  148. fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
  149. if n.Attr != nil {
  150. var p, s string
  151. for _, a := range n.Attr {
  152. switch a.Key {
  153. case "public":
  154. p = a.Val
  155. case "system":
  156. s = a.Val
  157. }
  158. }
  159. if p != "" || s != "" {
  160. fmt.Fprintf(w, ` "%s"`, p)
  161. fmt.Fprintf(w, ` "%s"`, s)
  162. }
  163. }
  164. io.WriteString(w, ">")
  165. case scopeMarkerNode:
  166. return errors.New("unexpected scopeMarkerNode")
  167. default:
  168. return errors.New("unknown node type")
  169. }
  170. io.WriteString(w, "\n")
  171. for c := n.FirstChild; c != nil; c = c.NextSibling {
  172. if err := dumpLevel(w, c, level); err != nil {
  173. return err
  174. }
  175. }
  176. return nil
  177. }
  178. func dump(n *Node) (string, error) {
  179. if n == nil || n.FirstChild == nil {
  180. return "", nil
  181. }
  182. var b bytes.Buffer
  183. for c := n.FirstChild; c != nil; c = c.NextSibling {
  184. if err := dumpLevel(&b, c, 0); err != nil {
  185. return "", err
  186. }
  187. }
  188. return b.String(), nil
  189. }
  190. var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
  191. func TestParser(t *testing.T) {
  192. for _, testDataDir := range testDataDirs {
  193. testFiles, err := filepath.Glob(testDataDir + "*.dat")
  194. if err != nil {
  195. t.Fatal(err)
  196. }
  197. for _, tf := range testFiles {
  198. f, err := os.Open(tf)
  199. if err != nil {
  200. t.Fatal(err)
  201. }
  202. defer f.Close()
  203. r := bufio.NewReader(f)
  204. for i := 0; ; i++ {
  205. text, want, context, err := readParseTest(r)
  206. if err == io.EOF {
  207. break
  208. }
  209. if err != nil {
  210. t.Fatal(err)
  211. }
  212. err = testParseCase(text, want, context)
  213. if err != nil {
  214. t.Errorf("%s test #%d %q, %s", tf, i, text, err)
  215. }
  216. }
  217. }
  218. }
  219. }
  220. // testParseCase tests one test case from the test files. If the test does not
  221. // pass, it returns an error that explains the failure.
  222. // text is the HTML to be parsed, want is a dump of the correct parse tree,
  223. // and context is the name of the context node, if any.
  224. func testParseCase(text, want, context string) (err error) {
  225. defer func() {
  226. if x := recover(); x != nil {
  227. switch e := x.(type) {
  228. case error:
  229. err = e
  230. default:
  231. err = fmt.Errorf("%v", e)
  232. }
  233. }
  234. }()
  235. var doc *Node
  236. if context == "" {
  237. doc, err = Parse(strings.NewReader(text))
  238. if err != nil {
  239. return err
  240. }
  241. } else {
  242. contextNode := &Node{
  243. Type: ElementNode,
  244. DataAtom: atom.Lookup([]byte(context)),
  245. Data: context,
  246. }
  247. nodes, err := ParseFragment(strings.NewReader(text), contextNode)
  248. if err != nil {
  249. return err
  250. }
  251. doc = &Node{
  252. Type: DocumentNode,
  253. }
  254. for _, n := range nodes {
  255. doc.AppendChild(n)
  256. }
  257. }
  258. if err := checkTreeConsistency(doc); err != nil {
  259. return err
  260. }
  261. got, err := dump(doc)
  262. if err != nil {
  263. return err
  264. }
  265. // Compare the parsed tree to the #document section.
  266. if got != want {
  267. return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
  268. }
  269. if renderTestBlacklist[text] || context != "" {
  270. return nil
  271. }
  272. // Check that rendering and re-parsing results in an identical tree.
  273. pr, pw := io.Pipe()
  274. go func() {
  275. pw.CloseWithError(Render(pw, doc))
  276. }()
  277. doc1, err := Parse(pr)
  278. if err != nil {
  279. return err
  280. }
  281. got1, err := dump(doc1)
  282. if err != nil {
  283. return err
  284. }
  285. if got != got1 {
  286. return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
  287. }
  288. return nil
  289. }
  290. // Some test input result in parse trees are not 'well-formed' despite
  291. // following the HTML5 recovery algorithms. Rendering and re-parsing such a
  292. // tree will not result in an exact clone of that tree. We blacklist such
  293. // inputs from the render test.
  294. var renderTestBlacklist = map[string]bool{
  295. // The second <a> will be reparented to the first <table>'s parent. This
  296. // results in an <a> whose parent is an <a>, which is not 'well-formed'.
  297. `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
  298. // The same thing with a <p>:
  299. `<p><table></p>`: true,
  300. // More cases of <a> being reparented:
  301. `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
  302. `<a><table><a></table><p><a><div><a>`: true,
  303. `<a><table><td><a><table></table><a></tr><a></table><a>`: true,
  304. `<template><a><table><a>`: true,
  305. // A similar reparenting situation involving <nobr>:
  306. `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
  307. // A <plaintext> element is reparented, putting it before a table.
  308. // A <plaintext> element can't have anything after it in HTML.
  309. `<table><plaintext><td>`: true,
  310. `<!doctype html><table><plaintext></plaintext>`: true,
  311. `<!doctype html><table><tbody><plaintext></plaintext>`: true,
  312. `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
  313. // A form inside a table inside a form doesn't work either.
  314. `<!doctype html><form><table></form><form></table></form>`: true,
  315. // A script that ends at EOF may escape its own closing tag when rendered.
  316. `<!doctype html><script><!--<script `: true,
  317. `<!doctype html><script><!--<script <`: true,
  318. `<!doctype html><script><!--<script <a`: true,
  319. `<!doctype html><script><!--<script </`: true,
  320. `<!doctype html><script><!--<script </s`: true,
  321. `<!doctype html><script><!--<script </script`: true,
  322. `<!doctype html><script><!--<script </scripta`: true,
  323. `<!doctype html><script><!--<script -`: true,
  324. `<!doctype html><script><!--<script -a`: true,
  325. `<!doctype html><script><!--<script -<`: true,
  326. `<!doctype html><script><!--<script --`: true,
  327. `<!doctype html><script><!--<script --a`: true,
  328. `<!doctype html><script><!--<script --<`: true,
  329. `<script><!--<script `: true,
  330. `<script><!--<script <a`: true,
  331. `<script><!--<script </script`: true,
  332. `<script><!--<script </scripta`: true,
  333. `<script><!--<script -`: true,
  334. `<script><!--<script -a`: true,
  335. `<script><!--<script --`: true,
  336. `<script><!--<script --a`: true,
  337. `<script><!--<script <`: true,
  338. `<script><!--<script </`: true,
  339. `<script><!--<script </s`: true,
  340. // Reconstructing the active formatting elements results in a <plaintext>
  341. // element that contains an <a> element.
  342. `<!doctype html><p><a><plaintext>b`: true,
  343. `<table><math><select><mi><select></table>`: true,
  344. }
  345. func TestNodeConsistency(t *testing.T) {
  346. // inconsistentNode is a Node whose DataAtom and Data do not agree.
  347. inconsistentNode := &Node{
  348. Type: ElementNode,
  349. DataAtom: atom.Frameset,
  350. Data: "table",
  351. }
  352. _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode)
  353. if err == nil {
  354. t.Errorf("got nil error, want non-nil")
  355. }
  356. }
  357. func TestParseFragmentWithNilContext(t *testing.T) {
  358. // This shouldn't panic.
  359. ParseFragment(strings.NewReader("<p>hello</p>"), nil)
  360. }
  361. func BenchmarkParser(b *testing.B) {
  362. buf, err := ioutil.ReadFile("testdata/go1.html")
  363. if err != nil {
  364. b.Fatalf("could not read testdata/go1.html: %v", err)
  365. }
  366. b.SetBytes(int64(len(buf)))
  367. runtime.GC()
  368. b.ReportAllocs()
  369. b.ResetTimer()
  370. for i := 0; i < b.N; i++ {
  371. Parse(bytes.NewBuffer(buf))
  372. }
  373. }