token_test.go
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo bar",
		"foo bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/spec/Overview.html#attributes-0
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}
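
// tokenLoopSketch is a minimal sketch of typical high-level Tokenizer usage,
// for comparison with the table-driven test above. It uses only identifiers
// that already appear in this file (NewTokenizer, Next, Token, Err); the
// function itself is illustrative and is not called by any test.
func tokenLoopSketch(r io.Reader) ([]string, error) {
	var tokens []string
	z := NewTokenizer(r)
	for {
		if z.Next() == ErrorToken {
			// io.EOF from Err means the input was tokenized to completion.
			if err := z.Err(); err != io.EOF {
				return nil, err
			}
			return tokens, nil
		}
		// Token copies the underlying bytes, so its strings remain valid
		// after the next call to Next.
		tokens = append(tokens, z.Token().String())
	}
}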

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
	u := "14567"
	v := string(result.Bytes())
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}
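
// tagAttrSketch is an illustrative sketch of the low-level attribute API,
// complementing TestBufAPI above, which only uses TagName. It relies solely
// on methods already exercised in this file (Next, TagName, TagAttr) and is
// not called by any test. Note that the []byte values returned by TagName
// and TagAttr may be invalidated by the next call to Next.
func tagAttrSketch(html string) [][2]string {
	var attrs [][2]string
	z := NewTokenizer(strings.NewReader(html))
	for {
		tt := z.Next()
		if tt == ErrorToken {
			return attrs
		}
		if tt == StartTagToken || tt == SelfClosingTagToken {
			_, more := z.TagName()
			for more {
				var key, val []byte
				key, val, more = z.TagAttr()
				// Copy out of the tokenizer's buffer before the next Next call.
				attrs = append(attrs, [2]string{string(key), string(val)})
			}
		}
	}
}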

func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":         "",
		"\n":       "\n",
		"\n\r":     "\n\n",
		"\r":       "\n",
		"\r\n":     "\n",
		"\r\n\n":   "\n\n",
		"\r\n\r":   "\n\n",
		"\r\n\r\n": "\n\n",
		"\r\r":     "\n\n",
		"\r\r\n":   "\n\n",
		"\r\r\n\n": "\n\n\n",
		"\r\r\r\n": "\n\n\n",
		"\r \n":    "\n \n",
		"xyz":      "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}
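
// convertNewlinesSketch illustrates the transformation exercised above: each
// "\r\n" pair and each lone '\r' becomes a single '\n', rewriting the slice
// in place. This is only a sketch inferred from the test cases; the
// convertNewlines under test is defined elsewhere in this package.
func convertNewlinesSketch(s []byte) []byte {
	w := 0
	for r := 0; r < len(s); r++ {
		c := s[r]
		if c == '\r' {
			if r+1 < len(s) && s[r+1] == '\n' {
				r++ // collapse "\r\n" into a single '\n'
			}
			c = '\n'
		}
		s[w] = c
		w++
	}
	return s[:w]
}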

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
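
// The three benchmarks above tokenize the same input (testdata/go1.html) at
// increasing levels of convenience: raw token bytes, zero-copy []byte
// accessors, and fully materialized Tokens. They can be run with a standard
// invocation such as "go test -bench=Tokenizer" (the regexp here is just an
// example that happens to match all three).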