lexer.go 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. package jsoniter
  2. import (
  3. "io"
  4. "errors"
  5. "fmt"
  6. "unicode/utf16"
  7. )
  8. type Lexer struct {
  9. reader io.Reader
  10. buf []byte
  11. head int
  12. tail int
  13. }
  14. func NewLexer(reader io.Reader, bufSize int) *Lexer {
  15. return &Lexer{
  16. reader: reader,
  17. buf: make([]byte, bufSize),
  18. head: 0,
  19. tail: 0,
  20. }
  21. }
  22. func NewLexerWithArray(input []byte) *Lexer {
  23. return &Lexer{
  24. reader: nil,
  25. buf: input,
  26. head: 0,
  27. tail: len(input),
  28. }
  29. }
  30. func (lexer *Lexer) readByte() (byte, error) {
  31. if lexer.head == lexer.tail {
  32. if lexer.reader == nil {
  33. return 0, io.EOF
  34. }
  35. n, err := lexer.reader.Read(lexer.buf)
  36. if err != nil {
  37. return 0, err
  38. }
  39. if n == 0 {
  40. return 0, io.EOF
  41. }
  42. lexer.head = 0
  43. lexer.tail = n
  44. }
  45. b := lexer.buf[lexer.head]
  46. lexer.head += 1
  47. return b, nil
  48. }
  49. func (lexer *Lexer) unreadByte() error {
  50. if lexer.head == 0 {
  51. return errors.New("unread too many bytes")
  52. }
  53. lexer.head -= 1
  54. return nil
  55. }
  56. const maxUint64 = (1 << 64 - 1)
  57. const cutoffUint64 = maxUint64 / 10 + 1
  58. const maxUint32 = (1 << 32 - 1)
  59. const cutoffUint32 = maxUint32 / 10 + 1
  60. func (lexer *Lexer) LexUin64() (uint64, error) {
  61. var n uint64
  62. c, err := lexer.readByte()
  63. if err != nil {
  64. return 0, err
  65. }
  66. /* a single zero, or a series of integers */
  67. if c == '0' {
  68. c, err = lexer.readByte()
  69. if err != nil && err != io.EOF {
  70. return 0, err
  71. }
  72. } else if c >= '1' && c <= '9' {
  73. for c >= '0' && c <= '9' {
  74. var v byte
  75. v = c - '0'
  76. if n >= cutoffUint64 {
  77. return 0, errors.New("overflow")
  78. }
  79. n = n * uint64(10) + uint64(v)
  80. c, err = lexer.readByte()
  81. if err != nil && err != io.EOF {
  82. return 0, err
  83. }
  84. }
  85. lexer.unreadByte()
  86. } else {
  87. lexer.unreadByte()
  88. return 0, errors.New("unexpected")
  89. }
  90. return n, nil
  91. }
  92. func (lexer *Lexer) LexInt64() (int64, error) {
  93. c, err := lexer.readByte()
  94. if err != nil {
  95. return 0, err
  96. }
  97. /* optional leading minus */
  98. if c == '-' {
  99. n, err := lexer.LexUin64()
  100. if err != nil {
  101. return 0, err
  102. }
  103. return -int64(n), nil
  104. } else {
  105. lexer.unreadByte()
  106. n, err := lexer.LexUin64()
  107. if err != nil {
  108. return 0, err
  109. }
  110. return int64(n), nil
  111. }
  112. }
  113. func (lexer *Lexer) LexString() (string, error) {
  114. str := make([]byte, 0, 10)
  115. c, err := lexer.readByte()
  116. if err != nil {
  117. return "", err
  118. }
  119. if c != '"' {
  120. return "", errors.New("unexpected")
  121. }
  122. for {
  123. c, err = lexer.readByte()
  124. if err != nil {
  125. return "", err
  126. }
  127. switch c {
  128. case '\\':
  129. c, err = lexer.readByte()
  130. if err != nil {
  131. return "", err
  132. }
  133. switch c {
  134. case 'u':
  135. r, err := lexer.readU4()
  136. if err != nil {
  137. return "", err
  138. }
  139. if utf16.IsSurrogate(r) {
  140. c, err = lexer.readByte()
  141. if err != nil {
  142. return "", err
  143. }
  144. if c != '\\' {
  145. return "", fmt.Errorf("unexpected: %v", c)
  146. }
  147. c, err = lexer.readByte()
  148. if err != nil {
  149. return "", err
  150. }
  151. if c != 'u' {
  152. return "", fmt.Errorf("unexpected: %v", c)
  153. }
  154. r2, err := lexer.readU4()
  155. if err != nil {
  156. return "", err
  157. }
  158. combined := utf16.DecodeRune(r, r2)
  159. str = appendRune(str, combined)
  160. } else {
  161. str = appendRune(str, r)
  162. }
  163. case '"':
  164. str = append(str, '"')
  165. case '\\':
  166. str = append(str, '\\')
  167. case '/':
  168. str = append(str, '/')
  169. case 'b':
  170. str = append(str, '\b')
  171. case 'f':
  172. str = append(str, '\f')
  173. case 'n':
  174. str = append(str, '\n')
  175. case 'r':
  176. str = append(str, '\r')
  177. case 't':
  178. str = append(str, '\t')
  179. default:
  180. return "", errors.New("unexpected")
  181. }
  182. case '"':
  183. return string(str), nil
  184. default:
  185. str = append(str, c)
  186. }
  187. }
  188. }
  189. func (lexer *Lexer) readU4() (rune, error) {
  190. var u4 rune
  191. for i := 0; i < 4; i++ {
  192. c, err := lexer.readByte()
  193. if err != nil {
  194. return 0, err
  195. }
  196. if (c >= '0' && c <= '9') {
  197. if u4 >= cutoffUint32 {
  198. return 0, errors.New("overflow")
  199. }
  200. u4 = u4 * 16 + rune(c - '0')
  201. } else if ((c >= 'a' && c <= 'f') ) {
  202. if u4 >= cutoffUint32 {
  203. return 0, errors.New("overflow")
  204. }
  205. u4 = u4 * 16 + rune(c - 'a' + 10)
  206. } else {
  207. return 0, fmt.Errorf("unexpected: %v", c)
  208. }
  209. }
  210. return u4, nil
  211. }
  212. const (
  213. t1 = 0x00 // 0000 0000
  214. tx = 0x80 // 1000 0000
  215. t2 = 0xC0 // 1100 0000
  216. t3 = 0xE0 // 1110 0000
  217. t4 = 0xF0 // 1111 0000
  218. t5 = 0xF8 // 1111 1000
  219. maskx = 0x3F // 0011 1111
  220. mask2 = 0x1F // 0001 1111
  221. mask3 = 0x0F // 0000 1111
  222. mask4 = 0x07 // 0000 0111
  223. rune1Max = 1 << 7 - 1
  224. rune2Max = 1 << 11 - 1
  225. rune3Max = 1 << 16 - 1
  226. surrogateMin = 0xD800
  227. surrogateMax = 0xDFFF
  228. MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
  229. RuneError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
  230. )
  231. func appendRune(p []byte, r rune) []byte {
  232. // Negative values are erroneous. Making it unsigned addresses the problem.
  233. switch i := uint32(r); {
  234. case i <= rune1Max:
  235. p = append(p, byte(r))
  236. return p
  237. case i <= rune2Max:
  238. p = append(p, t2 | byte(r >> 6))
  239. p = append(p, tx | byte(r) & maskx)
  240. return p
  241. case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
  242. r = RuneError
  243. fallthrough
  244. case i <= rune3Max:
  245. p = append(p, t3 | byte(r >> 12))
  246. p = append(p, tx | byte(r >> 6) & maskx)
  247. p = append(p, tx | byte(r) & maskx)
  248. return p
  249. default:
  250. p = append(p, t4 | byte(r >> 18))
  251. p = append(p, tx | byte(r >> 12) & maskx)
  252. p = append(p, tx | byte(r >> 6) & maskx)
  253. p = append(p, tx | byte(r) & maskx)
  254. return p
  255. }
  256. }