// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}
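
// For example, the "tags" case below tokenizes "<a>b<c/>d</e>" into five
// tokens whose String forms are "<a>", "b", "<c/>", "d" and "</e>", so its
// golden value is "<a>$b$<c/>$d$</e>".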
var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo  bar",
		"foo  bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
- },
- // A start, self-closing and end tag. The tokenizer does not care if the start
- // and end tokens don't match; that is the job of the parser.
- {
- "tags",
- "<a>b<c/>d</e>",
- "<a>$b$<c/>$d$</e>",
- },
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	{
		"not a tag #11",
		"<<p>",
		"&lt;$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0</p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	{
		"title with trailing '<' entity",
		"<title>foobar&lt;</title>",
		"<title>$foobar&lt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}
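
// The example below is an illustrative sketch added alongside the tests, not
// part of the upstream suite: it shows the same NewTokenizer/Next/Token loop
// that TestTokenizer drives with the tokenTests table, on one small input.
func ExampleTokenizer_minimal() {
	z := NewTokenizer(strings.NewReader("<p>Links</p>"))
	for z.Next() != ErrorToken {
		// Token returns the current token, whose String method
		// re-serializes it ("<p>", "Links" and "</p>" here).
		fmt.Println(z.Token().String())
	}
	// Output:
	// <p>
	// Links
	// </p>
}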

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Anything tokenized, plus any untokenized input and any data left
			// in the reader, should reassemble the original input.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF indicates that we completed tokenization and hence found the
			// smallest maxBuf that does not generate ErrBufferExceeded, so
			// continue to the next test.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
	u := "14567"
	v := string(result.Bytes())
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":         "",
		"\n":       "\n",
		"\n\r":     "\n\n",
		"\r":       "\n",
		"\r\n":     "\n",
		"\r\n\n":   "\n\n",
		"\r\n\r":   "\n\n",
		"\r\n\r\n": "\n\n",
		"\r\r":     "\n\n",
		"\r\r\n":   "\n\n",
		"\r\r\n\n": "\n\n\n",
		"\r\r\r\n": "\n\n\n",
		"\r \n":    "\n \n",
		"xyz":      "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute
				// keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose
				// validity extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
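
// The helper below is an illustrative sketch added to this file, not part of
// the upstream tests: it demonstrates the lifetime contract documented in
// benchmarkTokenizer, namely that the []byte returned by z.Text is only valid
// until the next call to z.Next, so retained text must be copied first.
func collectText(r io.Reader) []string {
	z := NewTokenizer(r)
	var texts []string
	for {
		tt := z.Next()
		if tt == ErrorToken {
			// io.EOF here means the input was fully tokenized.
			return texts
		}
		if tt == TextToken {
			// string(...) copies the bytes out of the tokenizer's
			// buffer, so the value survives the next call to z.Next.
			texts = append(texts, string(z.Text()))
		}
	}
}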