// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

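// For example, the input "<a>b<c/>d</e>" is expected to tokenize into five
// tokens, so its golden string is "<a>$b$<c/>$d$</e>". A test with an empty
// golden string expects no tokens at all.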
var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo bar",
		"foo bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
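	// "&not" is one of the named entities that may be recognized even without
	// a trailing semicolon, so in "&notit;" the "&not" prefix decodes to "¬",
	// while "&notin;" matches the longer entity for "∉".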
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/spec/Overview.html#attributes-0
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
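		// strings.Split("", "$") returns a single empty string rather than an
		// empty slice, so skip the per-token check when no tokens are expected.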
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
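	// With a maximum buffer size of 5, the tokenizer can hold at most the five
	// bytes "<tttt" before giving up.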
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
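		// Try every buffer size from 1 upward until one is large enough to
		// tokenize the entire input without ErrBufferExceeded.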
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Anything tokenized, along with any input still buffered in the
			// tokenizer or left unread in the reader, should reassemble into
			// the original input.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF means the entire input was tokenized without exceeding this
			// buffer size, so continue to the next test case.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
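	// Only text inside at least one open <a> element was recorded: "1", "4",
	// "5", "6" and "7". Note that "<a/>" is a SelfClosingTagToken, so it does
	// not change depth.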
	u := "14567"
	v := result.String()
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

func TestConvertNewlines(t *testing.T) {
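	// convertNewlines normalizes "\r" and "\r\n" to "\n". Each map key is an
	// input and its value is the expected normalized output.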
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":                      "",
		"\n":                    "\n",
		"\n\r":                  "\n\n",
		"\r":                    "\n",
		"\r\n":                  "\n",
		"\r\n\n":                "\n\n",
		"\r\n\r":                "\n\n",
		"\r\n\r\n":              "\n\n",
		"\r\r":                  "\n\n",
		"\r\r\n":                "\n\n",
		"\r\r\n\n":              "\n\n\n",
		"\r\r\r\n":              "\n\n\n",
		"\r \n":                 "\n \n",
		"xyz":                   "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
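		// A reader that never makes progress (stuckReader) may surface
		// io.ErrNoProgress instead of io.EOF; tolerate that and skip the
		// token comparison for it.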
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

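// The benchmark levels exercise increasingly expensive tokenizer APIs:
// rawLevel only looks at raw bytes, lowLevel uses the zero-copy []byte
// accessors, and highLevel materializes full Token values.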
const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
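	// SetBytes tells the testing package how many bytes each iteration
	// processes, so the benchmark can report throughput in MB/s.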
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }