|
- // Copyright 2010 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package html
- import (
- "bufio"
- "bytes"
- "errors"
- "fmt"
- "io"
- "io/ioutil"
- "os"
- "path/filepath"
- "runtime"
- "sort"
- "strings"
- "testing"
- "golang.org/x/net/html/atom"
- )
// testAttrs describes a single parser test case as produced by readParseTest.
type testAttrs struct {
	// text is the HTML input: the "#data" section without its final newline.
	// want is the expected tree dump: the "#document" section.
	// context is the line following "#document-fragment", with surrounding
	// white space trimmed, or "" when that section is absent.
	text, want, context string
	// scripting starts out true and is overridden by a "#script-on" or
	// "#script-off" section.
	scripting bool
}

// readParseTest reads a single test case from r.
//
// A test case is a sequence of sections, each introduced by a header line
// that starts with '#':
//
//	#data               the HTML to parse
//	#errors             skipped
//	#script-on/-off     optional; sets the scripting flag, body skipped
//	#document-fragment  optional; followed by one line naming the context
//	#document           the expected tree dump
//
// The tree dump ends at the first blank line that is not inside a quoted
// text node, or at the end of the input.
//
// Lines are read with ReadBytes rather than ReadSlice, so a line longer than
// the reader's buffer is returned whole instead of failing with
// bufio.ErrBufferFull. Read errors are returned unchanged; in particular an
// exhausted reader yields io.EOF from the very first read.
func readParseTest(r *bufio.Reader) (*testAttrs, error) {
	ta := &testAttrs{scripting: true}
	line, err := r.ReadBytes('\n')
	if err != nil {
		return nil, err
	}
	var b []byte

	// Read the HTML.
	if string(line) != "#data\n" {
		return nil, fmt.Errorf(`got %q want "#data\n"`, line)
	}
	for {
		line, err = r.ReadBytes('\n')
		if err != nil {
			return nil, err
		}
		// A nil error guarantees the line ends in '\n', so line[0] exists.
		if line[0] == '#' {
			break
		}
		b = append(b, line...)
	}
	ta.text = strings.TrimSuffix(string(b), "\n")
	b = b[:0]

	// Skip the error list.
	if string(line) != "#errors\n" {
		return nil, fmt.Errorf(`got %q want "#errors\n"`, line)
	}
	for {
		line, err = r.ReadBytes('\n')
		if err != nil {
			return nil, err
		}
		if line[0] == '#' {
			break
		}
	}

	// Optional scripting flag; anything up to the next header is skipped.
	if ls := string(line); strings.HasPrefix(ls, "#script-") {
		switch {
		case strings.HasSuffix(ls, "-on\n"):
			ta.scripting = true
		case strings.HasSuffix(ls, "-off\n"):
			ta.scripting = false
		default:
			return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line)
		}
		for {
			line, err = r.ReadBytes('\n')
			if err != nil {
				return nil, err
			}
			if line[0] == '#' {
				break
			}
		}
	}

	// Optional fragment context: exactly one line, then the next header.
	if string(line) == "#document-fragment\n" {
		line, err = r.ReadBytes('\n')
		if err != nil {
			return nil, err
		}
		ta.context = strings.TrimSpace(string(line))
		line, err = r.ReadBytes('\n')
		if err != nil {
			return nil, err
		}
	}

	// Read the dump of what the parse tree should be.
	if string(line) != "#document\n" {
		return nil, fmt.Errorf(`got %q want "#document\n"`, line)
	}
	inQuote := false
	for {
		line, err = r.ReadBytes('\n')
		// io.EOF is tolerated here: the last test case in a file may end
		// without a trailing blank line.
		if err != nil && err != io.EOF {
			return nil, err
		}
		trimmed := bytes.Trim(line, "| \n")
		if len(trimmed) > 0 {
			// A dump line whose content starts with '"' opens a text node,
			// which may continue over several lines, blank ones included.
			if line[0] == '|' && trimmed[0] == '"' {
				inQuote = true
			}
			// A trailing '"' closes the text node, unless the line is a dump
			// line holding nothing but the opening quote.
			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
				inQuote = false
			}
		}
		// Stop at end of input, or at a blank line outside a text node.
		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
			break
		}
		b = append(b, line...)
	}
	ta.want = string(b)
	return ta, nil
}
// dumpIndent writes the prefix of one tree-dump line to w: "| " followed by
// two spaces for each level of nesting, which is the indentation the
// html5lib tree-construction test format prescribes. Write errors are
// ignored.
func dumpIndent(w io.Writer, level int) {
	io.WriteString(w, "| ")
	for i := 0; i < level; i++ {
		io.WriteString(w, "  ")
	}
}
- type sortedAttributes []Attribute
- func (a sortedAttributes) Len() int {
- return len(a)
- }
- func (a sortedAttributes) Less(i, j int) bool {
- if a[i].Namespace != a[j].Namespace {
- return a[i].Namespace < a[j].Namespace
- }
- return a[i].Key < a[j].Key
- }
- func (a sortedAttributes) Swap(i, j int) {
- a[i], a[j] = a[j], a[i]
- }
- func dumpLevel(w io.Writer, n *Node, level int) error {
- dumpIndent(w, level)
- level++
- switch n.Type {
- case ErrorNode:
- return errors.New("unexpected ErrorNode")
- case DocumentNode:
- return errors.New("unexpected DocumentNode")
- case ElementNode:
- if n.Namespace != "" {
- fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
- } else {
- fmt.Fprintf(w, "<%s>", n.Data)
- }
- attr := sortedAttributes(n.Attr)
- sort.Sort(attr)
- for _, a := range attr {
- io.WriteString(w, "\n")
- dumpIndent(w, level)
- if a.Namespace != "" {
- fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
- } else {
- fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
- }
- }
- if n.Namespace == "" && n.DataAtom == atom.Template {
- io.WriteString(w, "\n")
- dumpIndent(w, level)
- level++
- io.WriteString(w, "content")
- }
- case TextNode:
- fmt.Fprintf(w, `"%s"`, n.Data)
- case CommentNode:
- fmt.Fprintf(w, "<!-- %s -->", n.Data)
- case DoctypeNode:
- fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
- if n.Attr != nil {
- var p, s string
- for _, a := range n.Attr {
- switch a.Key {
- case "public":
- p = a.Val
- case "system":
- s = a.Val
- }
- }
- if p != "" || s != "" {
- fmt.Fprintf(w, ` "%s"`, p)
- fmt.Fprintf(w, ` "%s"`, s)
- }
- }
- io.WriteString(w, ">")
- case scopeMarkerNode:
- return errors.New("unexpected scopeMarkerNode")
- default:
- return errors.New("unknown node type")
- }
- io.WriteString(w, "\n")
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- if err := dumpLevel(w, c, level); err != nil {
- return err
- }
- }
- return nil
- }
- func dump(n *Node) (string, error) {
- if n == nil || n.FirstChild == nil {
- return "", nil
- }
- var b bytes.Buffer
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- if err := dumpLevel(&b, c, 0); err != nil {
- return "", err
- }
- }
- return b.String(), nil
- }
- var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
- func TestParser(t *testing.T) {
- for _, testDataDir := range testDataDirs {
- testFiles, err := filepath.Glob(testDataDir + "*.dat")
- if err != nil {
- t.Fatal(err)
- }
- for _, tf := range testFiles {
- f, err := os.Open(tf)
- if err != nil {
- t.Fatal(err)
- }
- defer f.Close()
- r := bufio.NewReader(f)
- for i := 0; ; i++ {
- ta, err := readParseTest(r)
- if err == io.EOF {
- break
- }
- if err != nil {
- t.Fatal(err)
- }
- err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
- if err != nil {
- t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
- }
- }
- }
- }
- }
- // Issue 16318
- func TestParserWithoutScripting(t *testing.T) {
- text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
- want := `| <html>
- | <head>
- | <noscript>
- | <body>
- | <img>
- | src="https://golang.org/doc/gopher/frontpage.png"
- | <p>
- | <img>
- | src="https://golang.org/doc/gopher/doc.png"
- `
- if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
- t.Errorf("test with scripting is disabled, %q, %s", text, err)
- }
- }
- // testParseCase tests one test case from the test files. If the test does not
- // pass, it returns an error that explains the failure.
- // text is the HTML to be parsed, want is a dump of the correct parse tree,
- // and context is the name of the context node, if any.
- func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
- defer func() {
- if x := recover(); x != nil {
- switch e := x.(type) {
- case error:
- err = e
- default:
- err = fmt.Errorf("%v", e)
- }
- }
- }()
- var doc *Node
- if context == "" {
- doc, err = ParseWithOptions(strings.NewReader(text), opts...)
- if err != nil {
- return err
- }
- } else {
- namespace := ""
- if i := strings.IndexByte(context, ' '); i >= 0 {
- namespace, context = context[:i], context[i+1:]
- }
- contextNode := &Node{
- Data: context,
- DataAtom: atom.Lookup([]byte(context)),
- Namespace: namespace,
- Type: ElementNode,
- }
- nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
- if err != nil {
- return err
- }
- doc = &Node{
- Type: DocumentNode,
- }
- for _, n := range nodes {
- doc.AppendChild(n)
- }
- }
- if err := checkTreeConsistency(doc); err != nil {
- return err
- }
- got, err := dump(doc)
- if err != nil {
- return err
- }
- // Compare the parsed tree to the #document section.
- if got != want {
- return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
- }
- if renderTestBlacklist[text] || context != "" {
- return nil
- }
- // Check that rendering and re-parsing results in an identical tree.
- pr, pw := io.Pipe()
- go func() {
- pw.CloseWithError(Render(pw, doc))
- }()
- doc1, err := ParseWithOptions(pr, opts...)
- if err != nil {
- return err
- }
- got1, err := dump(doc1)
- if err != nil {
- return err
- }
- if got != got1 {
- return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
- }
- return nil
- }
// Some test inputs result in parse trees that are not 'well-formed', despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test.
//
// The map is keyed by the exact HTML text of the test case.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// The same thing with a <p>:
	`<p><table></p>`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`: true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`: true,
	`<template><a><table><a>`: true,
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`: true,
	`<!doctype html><table><plaintext></plaintext>`: true,
	`<!doctype html><table><tbody><plaintext></plaintext>`: true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `: true,
	`<!doctype html><script><!--<script <`: true,
	`<!doctype html><script><!--<script <a`: true,
	`<!doctype html><script><!--<script </`: true,
	`<!doctype html><script><!--<script </s`: true,
	`<!doctype html><script><!--<script </script`: true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`: true,
	`<!doctype html><script><!--<script -a`: true,
	`<!doctype html><script><!--<script -<`: true,
	`<!doctype html><script><!--<script --`: true,
	`<!doctype html><script><!--<script --a`: true,
	`<!doctype html><script><!--<script --<`: true,
	`<script><!--<script `: true,
	`<script><!--<script <a`: true,
	`<script><!--<script </script`: true,
	`<script><!--<script </scripta`: true,
	`<script><!--<script -`: true,
	`<script><!--<script -a`: true,
	`<script><!--<script --`: true,
	`<script><!--<script --a`: true,
	`<script><!--<script <`: true,
	`<script><!--<script </`: true,
	`<script><!--<script </s`: true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`: true,
	`<table><math><select><mi><select></table>`: true,
}
- func TestNodeConsistency(t *testing.T) {
- // inconsistentNode is a Node whose DataAtom and Data do not agree.
- inconsistentNode := &Node{
- Type: ElementNode,
- DataAtom: atom.Frameset,
- Data: "table",
- }
- if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
- t.Errorf("got nil error, want non-nil")
- }
- }
- func TestParseFragmentWithNilContext(t *testing.T) {
- // This shouldn't panic.
- ParseFragment(strings.NewReader("<p>hello</p>"), nil)
- }
- func BenchmarkParser(b *testing.B) {
- buf, err := ioutil.ReadFile("testdata/go1.html")
- if err != nil {
- b.Fatalf("could not read testdata/go1.html: %v", err)
- }
- b.SetBytes(int64(len(buf)))
- runtime.GC()
- b.ReportAllocs()
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- Parse(bytes.NewBuffer(buf))
- }
- }
|