token_test.go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

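// tokenTests is the table of tokenizer test cases.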
var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo bar",
		"foo bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	{
		"not a tag #11",
		"<<p>",
		"&lt;$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	{
		"title with trailing '&lt;' entity",
		"<title>foobar<</title>",
		"<title>$foobar&lt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
  375. "<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}

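// TestTokenizer runs every tokenTest case through a Tokenizer, comparing the
// String form of each token against the '$'-separated golden string, and then
// checks that tokenization ends with io.EOF.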
func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Everything tokenized so far, plus anything still buffered in the
			// tokenizer or left unread in the reader, should reassemble into
			// the original input.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF indicates that we completed tokenization and hence found the max
			// maxBuf that generates ErrBufferExceeded, so continue to the next test.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

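// TestBufAPI exercises the []byte-level API (Next, Text, TagName): it walks
// the token stream, accumulating the text that appears inside <a> elements,
// including nested ones.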
func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
	u := "14567"
	v := string(result.Bytes())
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

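// TestConvertNewlines checks that convertNewlines normalizes "\r" and "\r\n"
// line endings to "\n".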
func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":                      "",
		"\n":                    "\n",
		"\n\r":                  "\n\n",
		"\r":                    "\n",
		"\r\n":                  "\n",
		"\r\n\n":                "\n\n",
		"\r\n\r":                "\n\n",
		"\r\n\r\n":              "\n\n",
		"\r\r":                  "\n\n",
		"\r\r\n":                "\n\n",
		"\r\r\n\n":              "\n\n\n",
		"\r\r\r\n":              "\n\n\n",
		"\r \n":                 "\n \n",
		"xyz":                   "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

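// TestReaderEdgeCases feeds the tokenizer from io.Readers with legal but
// unusual behavior: returning (0, nil), returning n > 0 bytes together with
// io.EOF, and never making progress at all.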
func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

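// The API levels at which benchmarkTokenizer consumes each token.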
const (
	rawLevel = iota
	lowLevel
	highLevel
)

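// benchmarkTokenizer tokenizes testdata/go1.html b.N times, consuming token
// data at the given API level (rawLevel, lowLevel or highLevel).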
func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }