feature_iter_string.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. package jsoniter
  2. import (
  3. "unicode/utf16"
  4. "unsafe"
  5. )
  6. // TODO: avoid append
  7. func (iter *Iterator) ReadString() (ret string) {
  8. c := iter.nextToken()
  9. if c == '"' {
  10. copied := make([]byte, 32)
  11. j := 0
  12. fast_loop:
  13. for {
  14. i := iter.head
  15. for ; i < iter.tail && j < len(copied); i++ {
  16. c := iter.buf[i]
  17. if c == '"' {
  18. iter.head = i + 1
  19. copied = copied[:j]
  20. return *(*string)(unsafe.Pointer(&copied))
  21. } else if c == '\\' {
  22. iter.head = i
  23. break fast_loop
  24. }
  25. copied[j] = c
  26. j++
  27. }
  28. if i == iter.tail {
  29. if iter.loadMore() {
  30. i = iter.head
  31. continue
  32. } else {
  33. iter.reportError("ReadString", "incomplete string")
  34. return
  35. }
  36. }
  37. iter.head = i
  38. if j == len(copied) {
  39. newBuf := make([]byte, len(copied) * 2)
  40. copy(newBuf, copied)
  41. copied = newBuf
  42. }
  43. }
  44. return iter.readStringSlowPath(copied[:j])
  45. }
  46. iter.reportError("ReadString", `expects " or n`)
  47. return
  48. }
  49. func (iter *Iterator) readStringSlowPath(str []byte) (ret string) {
  50. var c byte
  51. for iter.Error == nil {
  52. c = iter.readByte()
  53. if c == '"' {
  54. return *(*string)(unsafe.Pointer(&str))
  55. }
  56. if c == '\\' {
  57. c = iter.readByte()
  58. switch c {
  59. case 'u':
  60. r := iter.readU4()
  61. if utf16.IsSurrogate(r) {
  62. c = iter.readByte()
  63. if iter.Error != nil {
  64. return
  65. }
  66. if c != '\\' {
  67. iter.reportError("ReadString",
  68. `expects \u after utf16 surrogate, but \ not found`)
  69. return
  70. }
  71. c = iter.readByte()
  72. if iter.Error != nil {
  73. return
  74. }
  75. if c != 'u' {
  76. iter.reportError("ReadString",
  77. `expects \u after utf16 surrogate, but \u not found`)
  78. return
  79. }
  80. r2 := iter.readU4()
  81. if iter.Error != nil {
  82. return
  83. }
  84. combined := utf16.DecodeRune(r, r2)
  85. str = appendRune(str, combined)
  86. } else {
  87. str = appendRune(str, r)
  88. }
  89. case '"':
  90. str = append(str, '"')
  91. case '\\':
  92. str = append(str, '\\')
  93. case '/':
  94. str = append(str, '/')
  95. case 'b':
  96. str = append(str, '\b')
  97. case 'f':
  98. str = append(str, '\f')
  99. case 'n':
  100. str = append(str, '\n')
  101. case 'r':
  102. str = append(str, '\r')
  103. case 't':
  104. str = append(str, '\t')
  105. default:
  106. iter.reportError("ReadString",
  107. `invalid escape char after \`)
  108. return
  109. }
  110. } else {
  111. str = append(str, c)
  112. }
  113. }
  114. return
  115. }
  116. func (iter *Iterator) ReadStringAsSlice() (ret []byte) {
  117. c := iter.nextToken()
  118. if c == '"' {
  119. for i := iter.head; i < iter.tail; i++ {
  120. // require ascii string and no escape
  121. // for: field name, base64, number
  122. if iter.buf[i] == '"' {
  123. // fast path: reuse the underlying buffer
  124. ret = iter.buf[iter.head : i]
  125. iter.head = i + 1
  126. return ret
  127. }
  128. }
  129. readLen := iter.tail - iter.head
  130. copied := make([]byte, readLen, readLen * 2)
  131. copy(copied, iter.buf[iter.head:iter.tail])
  132. iter.head = iter.tail
  133. for iter.Error == nil {
  134. c := iter.readByte()
  135. if c == '"' {
  136. return copied
  137. }
  138. copied = append(copied, c)
  139. }
  140. return copied
  141. }
  142. iter.reportError("ReadString", `expects " or n`)
  143. return
  144. }
  145. func (iter *Iterator) readU4() (ret rune) {
  146. for i := 0; i < 4; i++ {
  147. c := iter.readByte()
  148. if iter.Error != nil {
  149. return
  150. }
  151. if c >= '0' && c <= '9' {
  152. ret = ret * 16 + rune(c - '0')
  153. } else if c >= 'a' && c <= 'f' {
  154. ret = ret * 16 + rune(c - 'a' + 10)
  155. } else {
  156. iter.reportError("readU4", "expects 0~9 or a~f")
  157. return
  158. }
  159. }
  160. return ret
  161. }
  162. const (
  163. t1 = 0x00 // 0000 0000
  164. tx = 0x80 // 1000 0000
  165. t2 = 0xC0 // 1100 0000
  166. t3 = 0xE0 // 1110 0000
  167. t4 = 0xF0 // 1111 0000
  168. t5 = 0xF8 // 1111 1000
  169. maskx = 0x3F // 0011 1111
  170. mask2 = 0x1F // 0001 1111
  171. mask3 = 0x0F // 0000 1111
  172. mask4 = 0x07 // 0000 0111
  173. rune1Max = 1 << 7 - 1
  174. rune2Max = 1 << 11 - 1
  175. rune3Max = 1 << 16 - 1
  176. surrogateMin = 0xD800
  177. surrogateMax = 0xDFFF
  178. maxRune = '\U0010FFFF' // Maximum valid Unicode code point.
  179. runeError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
  180. )
  181. func appendRune(p []byte, r rune) []byte {
  182. // Negative values are erroneous. Making it unsigned addresses the problem.
  183. switch i := uint32(r); {
  184. case i <= rune1Max:
  185. p = append(p, byte(r))
  186. return p
  187. case i <= rune2Max:
  188. p = append(p, t2 | byte(r >> 6))
  189. p = append(p, tx | byte(r) & maskx)
  190. return p
  191. case i > maxRune, surrogateMin <= i && i <= surrogateMax:
  192. r = runeError
  193. fallthrough
  194. case i <= rune3Max:
  195. p = append(p, t3 | byte(r >> 12))
  196. p = append(p, tx | byte(r >> 6) & maskx)
  197. p = append(p, tx | byte(r) & maskx)
  198. return p
  199. default:
  200. p = append(p, t4 | byte(r >> 18))
  201. p = append(p, tx | byte(r >> 12) & maskx)
  202. p = append(p, tx | byte(r >> 6) & maskx)
  203. p = append(p, tx | byte(r) & maskx)
  204. return p
  205. }
  206. }