lex.go
// Copyright 2016 José Santos <henrique_1609@me.com>
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package jet

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)
// item represents a token or text string returned from the scanner.
type item struct {
	typ itemType // The type of this item.
	pos Pos      // The starting position, in bytes, of this item in the input string.
	val string   // The value of this item.
}

func (i item) String() string {
	switch {
	case i.typ == itemEOF:
		return "EOF"
	case i.typ == itemError:
		return i.val
	case i.typ > itemKeyword:
		return fmt.Sprintf("<%s>", i.val)
	case len(i.val) > 10:
		return fmt.Sprintf("%.10q...", i.val)
	}
	return fmt.Sprintf("%q", i.val)
}
// itemType identifies the type of lex items.
type itemType int

const (
	itemError        itemType = iota // error occurred; value is text of error
	itemBool                         // boolean constant
	itemChar                         // printable ASCII character; grab bag for comma etc.
	itemCharConstant                 // character constant
	itemComplex                      // complex constant (1+2i); imaginary is just a number
	itemEOF
	itemField      // alphanumeric identifier starting with '.'
	itemIdentifier // alphanumeric identifier not starting with '.'
	itemLeftDelim  // left action delimiter
	itemLeftParen  // '(' inside action
	itemNumber     // simple number, including imaginary
	itemPipe       // pipe symbol
	itemRawString  // raw quoted string (includes quotes)
	itemRightDelim // right action delimiter
	itemRightParen // ')' inside action
	itemSpace      // run of spaces separating arguments
	itemString     // quoted string (includes quotes)
	itemText       // plain text
	itemAssign     // '=' or ':=' assignment
	itemEquals     // '==' comparison
	itemNotEquals  // '!=' comparison
	itemGreat      // '>' comparison
	itemGreatEquals // '>=' comparison
	itemLess        // '<' comparison
	itemLessEquals  // '<=' comparison
	itemComma       // ','
	itemColonComma  // ';'
	itemAdd         // '+'
	itemMinus       // '-'
	itemMul         // '*'
	itemDiv         // '/'
	itemMod         // '%'
	itemColon       // ':'
	itemTernary     // '?'
	itemLeftBrackets  // '['
	itemRightBrackets // ']'
	// Keywords appear after all the rest.
	itemKeyword // used only to delimit the keywords
	itemExtends
	itemBlock
	itemYield
	itemContent
	itemInclude
	itemElse
	itemEnd
	itemIf
	itemNil
	itemRange
	itemImport
	itemAnd
	itemOr
	itemNot
	itemMSG
	itemTrans
)
var key = map[string]itemType{
	"extends": itemExtends,
	"import":  itemImport,
	"include": itemInclude,
	"block":   itemBlock,
	"yield":   itemYield,
	"else":    itemElse,
	"end":     itemEnd,
	"if":      itemIf,
	"range":   itemRange,
	"nil":     itemNil,
	"and":     itemAnd,
	"or":      itemOr,
	"not":     itemNot,
	"content": itemContent,
	"msg":     itemMSG,
	"trans":   itemTrans,
}

const eof = -1

// stateFn represents the state of the scanner as a function that returns the next state.
type stateFn func(*lexer) stateFn

// lexer holds the state of the scanner.
type lexer struct {
	name       string    // the name of the input; used only for error reports
	input      string    // the string being scanned
	state      stateFn   // the next lexing function to enter
	pos        Pos       // current position in the input
	start      Pos       // start position of this item
	width      Pos       // width of last rune read from input
	lastPos    Pos       // position of most recent item returned by nextItem
	items      chan item // channel of scanned items
	parenDepth int       // nesting depth of ( ) exprs
	lastType   itemType
}
// next returns the next rune in the input.
func (l *lexer) next() rune {
	if int(l.pos) >= len(l.input) {
		l.width = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = Pos(w)
	l.pos += l.width
	return r
}

// peek returns but does not consume the next rune in the input.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// backup steps back one rune. Can only be called once per call of next.
func (l *lexer) backup() {
	l.pos -= l.width
}

// emit passes an item back to the client.
func (l *lexer) emit(t itemType) {
	l.lastType = t
	l.items <- item{t, l.start, l.input[l.start:l.pos]}
	l.start = l.pos
}

// ignore skips over the pending input before this point.
func (l *lexer) ignore() {
	l.start = l.pos
}

// accept consumes the next rune if it's from the valid set.
func (l *lexer) accept(valid string) bool {
	if strings.IndexRune(valid, l.next()) >= 0 {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes a run of runes from the valid set.
func (l *lexer) acceptRun(valid string) {
	for strings.IndexRune(valid, l.next()) >= 0 {
	}
	l.backup()
}

// lineNumber reports which line we're on, based on the position of
// the previous item returned by nextItem. Doing it this way
// means we don't have to worry about peek double counting.
func (l *lexer) lineNumber() int {
	return 1 + strings.Count(l.input[:l.lastPos], "\n")
}

// errorf returns an error token and terminates the scan by passing
// back a nil pointer that will be the next state, terminating l.nextItem.
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
	return nil
}

// nextItem returns the next item from the input.
// Called by the parser, not in the lexing goroutine.
func (l *lexer) nextItem() item {
	item := <-l.items
	l.lastPos = item.pos
	return item
}

// drain drains the output so the lexing goroutine will exit.
// Called by the parser, not in the lexing goroutine.
func (l *lexer) drain() {
	for range l.items {
	}
}

// lex creates a new scanner for the input string.
func lex(name, input string) *lexer {
	l := &lexer{
		name:  name,
		input: input,
		items: make(chan item),
	}
	go l.run()
	return l
}

// run runs the state machine for the lexer.
func (l *lexer) run() {
	for l.state = lexText; l.state != nil; {
		l.state = l.state(l)
	}
	close(l.items)
}

const (
	leftDelim    = "{{"
	rightDelim   = "}}"
	leftComment  = "{*"
	rightComment = "*}"
)
// state functions

// lexText scans plain text until an action delimiter, "{{", or a comment
// marker, "{*", is found.
func lexText(l *lexer) stateFn {
	for {
		if i := strings.IndexByte(l.input[l.pos:], '{'); i == -1 {
			l.pos = Pos(len(l.input))
			break
		} else {
			l.pos += Pos(i)
			if strings.HasPrefix(l.input[l.pos:], leftDelim) {
				if l.pos > l.start {
					l.emit(itemText)
				}
				return lexLeftDelim
			}
			if strings.HasPrefix(l.input[l.pos:], leftComment) {
				if l.pos > l.start {
					l.emit(itemText)
				}
				return lexComment
			}
		}
		if l.next() == eof {
			break
		}
	}
	// Correctly reached EOF.
	if l.pos > l.start {
		l.emit(itemText)
	}
	l.emit(itemEOF)
	return nil
}
// lexLeftDelim scans the left delimiter, which is known to be present.
func lexLeftDelim(l *lexer) stateFn {
	l.pos += Pos(len(leftDelim))
	l.emit(itemLeftDelim)
	l.parenDepth = 0
	return lexInsideAction
}

// lexComment scans a comment. The left comment marker is known to be present.
func lexComment(l *lexer) stateFn {
	l.pos += Pos(len(leftComment))
	i := strings.Index(l.input[l.pos:], rightComment)
	if i < 0 {
		return l.errorf("unclosed comment")
	}
	l.pos += Pos(i + len(rightComment))
	l.ignore()
	return lexText
}

// lexRightDelim scans the right delimiter, which is known to be present.
func lexRightDelim(l *lexer) stateFn {
	l.pos += Pos(len(rightDelim))
	l.emit(itemRightDelim)
	return lexText
}
// lexInsideAction scans the elements inside action delimiters.
func lexInsideAction(l *lexer) stateFn {
	// Either number, quoted string, or identifier.
	// Spaces separate arguments; runs of spaces turn into itemSpace.
	// Pipe symbols separate and are emitted.
	if strings.HasPrefix(l.input[l.pos:], rightDelim) {
		if l.parenDepth == 0 {
			return lexRightDelim
		}
		return l.errorf("unclosed left paren")
	}
	switch r := l.next(); {
	case r == eof || isEndOfLine(r):
		return l.errorf("unclosed action")
	case isSpace(r):
		return lexSpace
	case r == ',':
		l.emit(itemComma)
	case r == ';':
		l.emit(itemColonComma)
	case r == '*':
		l.emit(itemMul)
	case r == '/':
		l.emit(itemDiv)
	case r == '%':
		l.emit(itemMod)
	case r == '-':
		if r := l.peek(); '0' <= r && r <= '9' &&
			itemAdd != l.lastType &&
			itemMinus != l.lastType &&
			itemNumber != l.lastType &&
			itemIdentifier != l.lastType &&
			itemString != l.lastType &&
			itemRawString != l.lastType &&
			itemCharConstant != l.lastType &&
			itemBool != l.lastType &&
			itemField != l.lastType &&
			itemChar != l.lastType &&
			itemTrans != l.lastType {
			l.backup()
			return lexNumber
		}
		l.emit(itemMinus)
	case r == '+':
		if r := l.peek(); '0' <= r && r <= '9' &&
			itemAdd != l.lastType &&
			itemMinus != l.lastType &&
			itemNumber != l.lastType &&
			itemIdentifier != l.lastType &&
			itemString != l.lastType &&
			itemRawString != l.lastType &&
			itemCharConstant != l.lastType &&
			itemBool != l.lastType &&
			itemField != l.lastType &&
			itemChar != l.lastType &&
			itemTrans != l.lastType {
			l.backup()
			return lexNumber
		}
		l.emit(itemAdd)
	case r == '?':
		l.emit(itemTernary)
	case r == '&':
		if l.next() == '&' {
			l.emit(itemAnd)
		} else {
			l.backup()
		}
	case r == '<':
		if l.next() == '=' {
			l.emit(itemLessEquals)
		} else {
			l.backup()
			l.emit(itemLess)
		}
	case r == '>':
		if l.next() == '=' {
			l.emit(itemGreatEquals)
		} else {
			l.backup()
			l.emit(itemGreat)
		}
	case r == '!':
		if l.next() == '=' {
			l.emit(itemNotEquals)
		} else {
			l.backup()
			l.emit(itemNot)
		}
	case r == '=':
		if l.next() == '=' {
			l.emit(itemEquals)
		} else {
			l.backup()
			l.emit(itemAssign)
		}
	case r == ':':
		if l.next() == '=' {
			l.emit(itemAssign)
		} else {
			l.backup()
			l.emit(itemColon)
		}
	case r == '|':
		if l.next() == '|' {
			l.emit(itemOr)
		} else {
			l.backup()
			l.emit(itemPipe)
		}
	case r == '"':
		return lexQuote
	case r == '`':
		return lexRawQuote
	case r == '\'':
		return lexChar
	case r == '.':
		// special look-ahead for ".field" so we don't break l.backup().
		if l.pos < Pos(len(l.input)) {
			r := l.input[l.pos]
			if r < '0' || '9' < r {
				return lexField
			}
		}
		fallthrough // '.' can start a number.
	case '0' <= r && r <= '9':
		l.backup()
		return lexNumber
	case isAlphaNumeric(r):
		l.backup()
		return lexIdentifier
	case r == '[':
		l.emit(itemLeftBrackets)
	case r == ']':
		l.emit(itemRightBrackets)
	case r == '(':
		l.emit(itemLeftParen)
		l.parenDepth++
	case r == ')':
		l.emit(itemRightParen)
		l.parenDepth--
		if l.parenDepth < 0 {
			return l.errorf("unexpected right paren %#U", r)
		}
	case r <= unicode.MaxASCII && unicode.IsPrint(r):
		l.emit(itemChar)
		return lexInsideAction
	default:
		return l.errorf("unrecognized character in action: %#U", r)
	}
	return lexInsideAction
}
// lexSpace scans a run of space characters.
// One space has already been seen.
func lexSpace(l *lexer) stateFn {
	for isSpace(l.peek()) {
		l.next()
	}
	l.emit(itemSpace)
	return lexInsideAction
}

// lexIdentifier scans an alphanumeric.
func lexIdentifier(l *lexer) stateFn {
Loop:
	for {
		switch r := l.next(); {
		case isAlphaNumeric(r):
			// absorb.
		default:
			l.backup()
			word := l.input[l.start:l.pos]
			if !l.atTerminator() {
				return l.errorf("bad character %#U", r)
			}
			switch {
			case key[word] > itemKeyword:
				l.emit(key[word])
			case word[0] == '.':
				l.emit(itemField)
			case word == "true", word == "false":
				l.emit(itemBool)
			default:
				l.emit(itemIdentifier)
			}
			break Loop
		}
	}
	return lexInsideAction
}

// lexField scans a field: .Alphanumeric.
// The . has been scanned.
func lexField(l *lexer) stateFn {
	if l.atTerminator() {
		// Nothing interesting follows -> "." or "$".
		l.emit(itemIdentifier)
		return lexInsideAction
	}
	var r rune
	for {
		r = l.next()
		if !isAlphaNumeric(r) {
			l.backup()
			break
		}
	}
	if !l.atTerminator() {
		return l.errorf("bad character %#U", r)
	}
	l.emit(itemField)
	return lexInsideAction
}
// atTerminator reports whether the input is at a valid termination character
// to appear after an identifier. Breaks .X.Y into two pieces. Also catches
// cases like "$x+2" not being acceptable without a space, in case we decide
// one day to implement arithmetic.
func (l *lexer) atTerminator() bool {
	r := l.peek()
	if isSpace(r) || isEndOfLine(r) {
		return true
	}
	switch r {
	case eof, '.', ',', '|', ':', ')', '=', '(', ';', '?', '[', ']', '+', '-', '/', '%', '*', '&', '!', '<', '>':
		return true
	}
	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
	// succeed but should fail) but only in extremely rare cases caused by willfully
	// bad choice of delimiter.
	if rd, _ := utf8.DecodeRuneInString(rightDelim); rd == r {
		return true
	}
	return false
}

// lexChar scans a character constant. The initial quote is already
// scanned. Syntax checking is done by the parser.
func lexChar(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated character constant")
		case '\'':
			break Loop
		}
	}
	l.emit(itemCharConstant)
	return lexInsideAction
}
// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
// and "089" - but when it's wrong the input is invalid and the parser (via
// strconv) will notice.
func lexNumber(l *lexer) stateFn {
	if !l.scanNumber() {
		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
	}
	l.emit(itemNumber)
	return lexInsideAction
}

func (l *lexer) scanNumber() bool {
	// Optional leading sign.
	l.accept("+-")
	// Is it hex?
	digits := "0123456789"
	if l.accept("0") && l.accept("xX") {
		digits = "0123456789abcdefABCDEF"
	}
	l.acceptRun(digits)
	if l.accept(".") {
		l.acceptRun(digits)
	}
	if l.accept("eE") {
		l.accept("+-")
		l.acceptRun("0123456789")
	}
	// Is it imaginary?
	l.accept("i")
	// Next thing mustn't be alphanumeric.
	if isAlphaNumeric(l.peek()) {
		l.next()
		return false
	}
	return true
}
// lexQuote scans a quoted string.
func lexQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case '\\':
			if r := l.next(); r != eof && r != '\n' {
				break
			}
			fallthrough
		case eof, '\n':
			return l.errorf("unterminated quoted string")
		case '"':
			break Loop
		}
	}
	l.emit(itemString)
	return lexInsideAction
}

// lexRawQuote scans a raw quoted string.
func lexRawQuote(l *lexer) stateFn {
Loop:
	for {
		switch l.next() {
		case eof:
			return l.errorf("unterminated raw quoted string")
		case '`':
			break Loop
		}
	}
	l.emit(itemRawString)
	return lexInsideAction
}

// isSpace reports whether r is a space character.
func isSpace(r rune) bool {
	return r == ' ' || r == '\t'
}

// isEndOfLine reports whether r is an end-of-line character.
func isEndOfLine(r rune) bool {
	return r == '\r' || r == '\n'
}

// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
func isAlphaNumeric(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}