// token_test.go
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
  4. package html
  5. import (
  6. "bytes"
  7. "io"
  8. "io/ioutil"
  9. "runtime"
  10. "strings"
  11. "testing"
  12. )
// tokenTest describes a single tokenizer test case: an HTML input
// fragment and the token stream it is expected to produce.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}
  21. var tokenTests = []tokenTest{
  22. {
  23. "empty",
  24. "",
  25. "",
  26. },
  27. // A single text node. The tokenizer should not break text nodes on whitespace,
  28. // nor should it normalize whitespace within a text node.
  29. {
  30. "text",
  31. "foo bar",
  32. "foo bar",
  33. },
  34. // An entity.
  35. {
  36. "entity",
  37. "one < two",
  38. "one < two",
  39. },
  40. // A start, self-closing and end tag. The tokenizer does not care if the start
  41. // and end tokens don't match; that is the job of the parser.
  42. {
  43. "tags",
  44. "<a>b<c/>d</e>",
  45. "<a>$b$<c/>$d$</e>",
  46. },
  47. // Angle brackets that aren't a tag.
  48. {
  49. "not a tag #0",
  50. "<",
  51. "&lt;",
  52. },
  53. {
  54. "not a tag #1",
  55. "</",
  56. "&lt;/",
  57. },
  58. {
  59. "not a tag #2",
  60. "</>",
  61. "",
  62. },
  63. {
  64. "not a tag #3",
  65. "a</>b",
  66. "a$b",
  67. },
  68. {
  69. "not a tag #4",
  70. "</ >",
  71. "<!-- -->",
  72. },
  73. {
  74. "not a tag #5",
  75. "</.",
  76. "<!--.-->",
  77. },
  78. {
  79. "not a tag #6",
  80. "</.>",
  81. "<!--.-->",
  82. },
  83. {
  84. "not a tag #7",
  85. "a < b",
  86. "a &lt; b",
  87. },
  88. {
  89. "not a tag #8",
  90. "<.>",
  91. "&lt;.&gt;",
  92. },
  93. {
  94. "not a tag #9",
  95. "a<<<b>>>c",
  96. "a&lt;&lt;$<b>$&gt;&gt;c",
  97. },
  98. {
  99. "not a tag #10",
  100. "if x<0 and y < 0 then x*y>0",
  101. "if x&lt;0 and y &lt; 0 then x*y&gt;0",
  102. },
  103. // EOF in a tag name.
  104. {
  105. "tag name eof #0",
  106. "<a",
  107. "",
  108. },
  109. {
  110. "tag name eof #1",
  111. "<a ",
  112. "",
  113. },
  114. {
  115. "tag name eof #2",
  116. "a<b",
  117. "a",
  118. },
  119. {
  120. "tag name eof #3",
  121. "<a><b",
  122. "<a>",
  123. },
  124. {
  125. "tag name eof #4",
  126. `<a x`,
  127. ``,
  128. },
  129. // Some malformed tags that are missing a '>'.
  130. {
  131. "malformed tag #0",
  132. `<p</p>`,
  133. `<p< p="">`,
  134. },
  135. {
  136. "malformed tag #1",
  137. `<p </p>`,
  138. `<p <="" p="">`,
  139. },
  140. {
  141. "malformed tag #2",
  142. `<p id`,
  143. ``,
  144. },
  145. {
  146. "malformed tag #3",
  147. `<p id=`,
  148. ``,
  149. },
  150. {
  151. "malformed tag #4",
  152. `<p id=>`,
  153. `<p id="">`,
  154. },
  155. {
  156. "malformed tag #5",
  157. `<p id=0`,
  158. ``,
  159. },
  160. {
  161. "malformed tag #6",
  162. `<p id=0</p>`,
  163. `<p id="0&lt;/p">`,
  164. },
  165. {
  166. "malformed tag #7",
  167. `<p id="0</p>`,
  168. ``,
  169. },
  170. {
  171. "malformed tag #8",
  172. `<p id="0"</p>`,
  173. `<p id="0" <="" p="">`,
  174. },
  175. {
  176. "malformed tag #9",
  177. `<p></p id`,
  178. `<p>`,
  179. },
  180. // Raw text and RCDATA.
  181. {
  182. "basic raw text",
  183. "<script><a></b></script>",
  184. "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
  185. },
  186. {
  187. "unfinished script end tag",
  188. "<SCRIPT>a</SCR",
  189. "<script>$a&lt;/SCR",
  190. },
  191. {
  192. "broken script end tag",
  193. "<SCRIPT>a</SCR ipt>",
  194. "<script>$a&lt;/SCR ipt&gt;",
  195. },
  196. {
  197. "EOF in script end tag",
  198. "<SCRIPT>a</SCRipt",
  199. "<script>$a&lt;/SCRipt",
  200. },
  201. {
  202. "scriptx end tag",
  203. "<SCRIPT>a</SCRiptx",
  204. "<script>$a&lt;/SCRiptx",
  205. },
  206. {
  207. "' ' completes script end tag",
  208. "<SCRIPT>a</SCRipt ",
  209. "<script>$a",
  210. },
  211. {
  212. "'>' completes script end tag",
  213. "<SCRIPT>a</SCRipt>",
  214. "<script>$a$</script>",
  215. },
  216. {
  217. "self-closing script end tag",
  218. "<SCRIPT>a</SCRipt/>",
  219. "<script>$a$</script>",
  220. },
  221. {
  222. "nested script tag",
  223. "<SCRIPT>a</SCRipt<script>",
  224. "<script>$a&lt;/SCRipt&lt;script&gt;",
  225. },
  226. {
  227. "script end tag after unfinished",
  228. "<SCRIPT>a</SCRipt</script>",
  229. "<script>$a&lt;/SCRipt$</script>",
  230. },
  231. {
  232. "script/style mismatched tags",
  233. "<script>a</style>",
  234. "<script>$a&lt;/style&gt;",
  235. },
  236. {
  237. "style element with entity",
  238. "<style>&apos;",
  239. "<style>$&amp;apos;",
  240. },
  241. {
  242. "textarea with tag",
  243. "<textarea><div></textarea>",
  244. "<textarea>$&lt;div&gt;$</textarea>",
  245. },
  246. {
  247. "title with tag and entity",
  248. "<title><b>K&amp;R C</b></title>",
  249. "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
  250. },
  251. // DOCTYPE tests.
  252. {
  253. "Proper DOCTYPE",
  254. "<!DOCTYPE html>",
  255. "<!DOCTYPE html>",
  256. },
  257. {
  258. "DOCTYPE with no space",
  259. "<!doctypehtml>",
  260. "<!DOCTYPE html>",
  261. },
  262. {
  263. "DOCTYPE with two spaces",
  264. "<!doctype html>",
  265. "<!DOCTYPE html>",
  266. },
  267. {
  268. "looks like DOCTYPE but isn't",
  269. "<!DOCUMENT html>",
  270. "<!--DOCUMENT html-->",
  271. },
  272. {
  273. "DOCTYPE at EOF",
  274. "<!DOCtype",
  275. "<!DOCTYPE >",
  276. },
  277. // XML processing instructions.
  278. {
  279. "XML processing instruction",
  280. "<?xml?>",
  281. "<!--?xml?-->",
  282. },
  283. // Comments.
  284. {
  285. "comment0",
  286. "abc<b><!-- skipme --></b>def",
  287. "abc$<b>$<!-- skipme -->$</b>$def",
  288. },
  289. {
  290. "comment1",
  291. "a<!-->z",
  292. "a$<!---->$z",
  293. },
  294. {
  295. "comment2",
  296. "a<!--->z",
  297. "a$<!---->$z",
  298. },
  299. {
  300. "comment3",
  301. "a<!--x>-->z",
  302. "a$<!--x>-->$z",
  303. },
  304. {
  305. "comment4",
  306. "a<!--x->-->z",
  307. "a$<!--x->-->$z",
  308. },
  309. {
  310. "comment5",
  311. "a<!>z",
  312. "a$<!---->$z",
  313. },
  314. {
  315. "comment6",
  316. "a<!->z",
  317. "a$<!----->$z",
  318. },
  319. {
  320. "comment7",
  321. "a<!---<>z",
  322. "a$<!---<>z-->",
  323. },
  324. {
  325. "comment8",
  326. "a<!--z",
  327. "a$<!--z-->",
  328. },
  329. {
  330. "comment9",
  331. "a<!--z-",
  332. "a$<!--z-->",
  333. },
  334. {
  335. "comment10",
  336. "a<!--z--",
  337. "a$<!--z-->",
  338. },
  339. {
  340. "comment11",
  341. "a<!--z---",
  342. "a$<!--z--->",
  343. },
  344. {
  345. "comment12",
  346. "a<!--z----",
  347. "a$<!--z---->",
  348. },
  349. {
  350. "comment13",
  351. "a<!--x--!>z",
  352. "a$<!--x-->$z",
  353. },
  354. // An attribute with a backslash.
  355. {
  356. "backslash",
  357. `<p id="a\"b">`,
  358. `<p id="a\" b"="">`,
  359. },
  360. // Entities, tag name and attribute key lower-casing, and whitespace
  361. // normalization within a tag.
  362. {
  363. "tricky",
  364. "<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
  365. `<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
  366. },
  367. // A nonexistent entity. Tokenizing and converting back to a string should
  368. // escape the "&" to become "&amp;".
  369. {
  370. "noSuchEntity",
  371. `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
  372. `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
  373. },
  374. {
  375. "entity without semicolon",
  376. `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
  377. `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
  378. },
  379. {
  380. "entity with digits",
  381. "&frac12;",
  382. "½",
  383. },
  384. // Attribute tests:
  385. // http://dev.w3.org/html5/spec/Overview.html#attributes-0
  386. {
  387. "Empty attribute",
  388. `<input disabled FOO>`,
  389. `<input disabled="" foo="">`,
  390. },
  391. {
  392. "Empty attribute, whitespace",
  393. `<input disabled FOO >`,
  394. `<input disabled="" foo="">`,
  395. },
  396. {
  397. "Unquoted attribute value",
  398. `<input value=yes FOO=BAR>`,
  399. `<input value="yes" foo="BAR">`,
  400. },
  401. {
  402. "Unquoted attribute value, spaces",
  403. `<input value = yes FOO = BAR>`,
  404. `<input value="yes" foo="BAR">`,
  405. },
  406. {
  407. "Unquoted attribute value, trailing space",
  408. `<input value=yes FOO=BAR >`,
  409. `<input value="yes" foo="BAR">`,
  410. },
  411. {
  412. "Single-quoted attribute value",
  413. `<input value='yes' FOO='BAR'>`,
  414. `<input value="yes" foo="BAR">`,
  415. },
  416. {
  417. "Single-quoted attribute value, trailing space",
  418. `<input value='yes' FOO='BAR' >`,
  419. `<input value="yes" foo="BAR">`,
  420. },
  421. {
  422. "Double-quoted attribute value",
  423. `<input value="I'm an attribute" FOO="BAR">`,
  424. `<input value="I&#39;m an attribute" foo="BAR">`,
  425. },
  426. {
  427. "Attribute name characters",
  428. `<meta http-equiv="content-type">`,
  429. `<meta http-equiv="content-type">`,
  430. },
  431. {
  432. "Mixed attributes",
  433. `a<P V="0 1" w='2' X=3 y>z`,
  434. `a$<p v="0 1" w="2" x="3" y="">$z`,
  435. },
  436. {
  437. "Attributes with a solitary single quote",
  438. `<p id=can't><p id=won't>`,
  439. `<p id="can&#39;t">$<p id="won&#39;t">`,
  440. },
  441. }
  442. func TestTokenizer(t *testing.T) {
  443. loop:
  444. for _, tt := range tokenTests {
  445. z := NewTokenizer(strings.NewReader(tt.html))
  446. if tt.golden != "" {
  447. for i, s := range strings.Split(tt.golden, "$") {
  448. if z.Next() == ErrorToken {
  449. t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
  450. continue loop
  451. }
  452. actual := z.Token().String()
  453. if s != actual {
  454. t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
  455. continue loop
  456. }
  457. }
  458. }
  459. z.Next()
  460. if z.Err() != io.EOF {
  461. t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
  462. }
  463. }
  464. }
  465. func TestBufAPI(t *testing.T) {
  466. s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
  467. z := NewTokenizer(bytes.NewBufferString(s))
  468. var result bytes.Buffer
  469. depth := 0
  470. loop:
  471. for {
  472. tt := z.Next()
  473. switch tt {
  474. case ErrorToken:
  475. if z.Err() != io.EOF {
  476. t.Error(z.Err())
  477. }
  478. break loop
  479. case TextToken:
  480. if depth > 0 {
  481. result.Write(z.Text())
  482. }
  483. case StartTagToken, EndTagToken:
  484. tn, _ := z.TagName()
  485. if len(tn) == 1 && tn[0] == 'a' {
  486. if tt == StartTagToken {
  487. depth++
  488. } else {
  489. depth--
  490. }
  491. }
  492. }
  493. }
  494. u := "14567"
  495. v := string(result.Bytes())
  496. if u != v {
  497. t.Errorf("TestBufAPI: want %q got %q", u, v)
  498. }
  499. }
  500. func TestConvertNewlines(t *testing.T) {
  501. testCases := map[string]string{
  502. "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
  503. "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
  504. "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
  505. "": "",
  506. "\n": "\n",
  507. "\n\r": "\n\n",
  508. "\r": "\n",
  509. "\r\n": "\n",
  510. "\r\n\n": "\n\n",
  511. "\r\n\r": "\n\n",
  512. "\r\n\r\n": "\n\n",
  513. "\r\r": "\n\n",
  514. "\r\r\n": "\n\n",
  515. "\r\r\n\n": "\n\n\n",
  516. "\r\r\r\n": "\n\n\n",
  517. "\r \n": "\n \n",
  518. "xyz": "xyz",
  519. }
  520. for in, want := range testCases {
  521. if got := string(convertNewlines([]byte(in))); got != want {
  522. t.Errorf("input %q: got %q, want %q", in, got, want)
  523. }
  524. }
  525. }
// Benchmark levels controlling how much per-token work benchmarkTokenizer does.
const (
	rawLevel  = iota // only z.Raw: no unescaping or lower-casing
	lowLevel         // z.Text/TagName/TagAttr: transient []byte values
	highLevel        // z.Token: allocates strings that outlive Next
)
// benchmarkTokenizer tokenizes testdata/go1.html b.N times, doing the
// amount of per-token work selected by level (rawLevel, lowLevel or
// highLevel). Throughput is reported via b.SetBytes.
func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	// GC before timing so leftover garbage doesn't skew the first iterations.
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				// io.EOF is the expected way out of the token loop.
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extend beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}
  577. func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) }
  578. func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) }
  579. func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }