feature_iter_string.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. package jsoniter
  2. import (
  3. "unicode/utf16"
  4. )
  5. func (iter *Iterator) ReadString() (ret string) {
  6. c := iter.nextToken()
  7. if c == '"' {
  8. for i := iter.head ; i < iter.tail; i++ {
  9. c := iter.buf[i]
  10. if c == '"' {
  11. ret = string(iter.buf[iter.head:i])
  12. iter.head = i + 1
  13. return ret
  14. } else if c == '\\' {
  15. break
  16. }
  17. }
  18. return iter.readStringSlowPath()
  19. } else if c == 'n' {
  20. iter.skipFixedBytes(3)
  21. return ""
  22. }
  23. iter.reportError("ReadString", `expects " or n`)
  24. return
  25. }
  26. func (iter *Iterator) readStringSlowPath() (ret string) {
  27. var str []byte
  28. var c byte
  29. for iter.Error == nil {
  30. c = iter.readByte()
  31. if c == '"' {
  32. return string(str)
  33. }
  34. if c == '\\' {
  35. c = iter.readByte()
  36. switch c {
  37. case 'u', 'U':
  38. r := iter.readU4()
  39. if utf16.IsSurrogate(r) {
  40. c = iter.readByte()
  41. if iter.Error != nil {
  42. return
  43. }
  44. if c != '\\' {
  45. iter.reportError("ReadString",
  46. `expects \u after utf16 surrogate, but \ not found`)
  47. return
  48. }
  49. c = iter.readByte()
  50. if iter.Error != nil {
  51. return
  52. }
  53. if c != 'u' && c != 'U' {
  54. iter.reportError("ReadString",
  55. `expects \u after utf16 surrogate, but \u not found`)
  56. return
  57. }
  58. r2 := iter.readU4()
  59. if iter.Error != nil {
  60. return
  61. }
  62. combined := utf16.DecodeRune(r, r2)
  63. str = appendRune(str, combined)
  64. } else {
  65. str = appendRune(str, r)
  66. }
  67. case '"':
  68. str = append(str, '"')
  69. case '\\':
  70. str = append(str, '\\')
  71. case '/':
  72. str = append(str, '/')
  73. case 'b':
  74. str = append(str, '\b')
  75. case 'f':
  76. str = append(str, '\f')
  77. case 'n':
  78. str = append(str, '\n')
  79. case 'r':
  80. str = append(str, '\r')
  81. case 't':
  82. str = append(str, '\t')
  83. default:
  84. iter.reportError("ReadString",
  85. `invalid escape char after \`)
  86. return
  87. }
  88. } else {
  89. str = append(str, c)
  90. }
  91. }
  92. return
  93. }
  94. func (iter *Iterator) ReadStringAsSlice() (ret []byte) {
  95. c := iter.nextToken()
  96. if c == '"' {
  97. for i := iter.head; i < iter.tail; i++ {
  98. // require ascii string and no escape
  99. // for: field name, base64, number
  100. if iter.buf[i] == '"' {
  101. // fast path: reuse the underlying buffer
  102. ret = iter.buf[iter.head : i]
  103. iter.head = i + 1
  104. return ret
  105. }
  106. }
  107. readLen := iter.tail - iter.head
  108. copied := make([]byte, readLen, readLen * 2)
  109. copy(copied, iter.buf[iter.head:iter.tail])
  110. iter.head = iter.tail
  111. for iter.Error == nil {
  112. c := iter.readByte()
  113. if c == '"' {
  114. return copied
  115. }
  116. copied = append(copied, c)
  117. }
  118. return copied
  119. }
  120. iter.reportError("ReadString", `expects " or n`)
  121. return
  122. }
  123. func (iter *Iterator) readU4() (ret rune) {
  124. for i := 0; i < 4; i++ {
  125. c := iter.readByte()
  126. if iter.Error != nil {
  127. return
  128. }
  129. if c >= '0' && c <= '9' {
  130. ret = ret * 16 + rune(c - '0')
  131. } else if c >= 'a' && c <= 'f' {
  132. ret = ret * 16 + rune(c - 'a' + 10)
  133. } else if c >= 'A' && c <= 'F' {
  134. ret = ret * 16 + rune(c - 'A' + 10)
  135. } else {
  136. iter.reportError("readU4", "expects 0~9 or a~f")
  137. return
  138. }
  139. }
  140. return ret
  141. }
  // Bit patterns and limits used when encoding runes as UTF-8 (see appendRune).
  // The names and values appear to mirror the unexported constants of the
  // standard unicode/utf8 package.
  const (
  	// Tag bits OR-ed into encoded bytes: tx marks a continuation byte, while
  	// t2, t3 and t4 mark the first byte of a 2-, 3- and 4-byte sequence.
  	t1 = 0x00 // 0000 0000
  	tx = 0x80 // 1000 0000
  	t2 = 0xC0 // 1100 0000
  	t3 = 0xE0 // 1110 0000
  	t4 = 0xF0 // 1111 0000
  	t5 = 0xF8 // 1111 1000

  	// Payload masks: maskx keeps the low six bits carried by a continuation
  	// byte.
  	maskx = 0x3F // 0011 1111
  	mask2 = 0x1F // 0001 1111
  	mask3 = 0x0F // 0000 1111
  	mask4 = 0x07 // 0000 0111

  	// Largest code point that fits in a 1-, 2- and 3-byte encoding.
  	rune1Max = 0x7F   // 1<<7 - 1
  	rune2Max = 0x7FF  // 1<<11 - 1
  	rune3Max = 0xFFFF // 1<<16 - 1

  	// Inclusive bounds of the UTF-16 surrogate range.
  	surrogateMin = 0xD800
  	surrogateMax = 0xDFFF

  	maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
  	runeError = '\uFFFD'     // The "error" rune, or "Unicode replacement character".
  )
  161. func appendRune(p []byte, r rune) []byte {
  162. // Negative values are erroneous. Making it unsigned addresses the problem.
  163. switch i := uint32(r); {
  164. case i <= rune1Max:
  165. p = append(p, byte(r))
  166. return p
  167. case i <= rune2Max:
  168. p = append(p, t2 | byte(r >> 6))
  169. p = append(p, tx | byte(r) & maskx)
  170. return p
  171. case i > maxRune, surrogateMin <= i && i <= surrogateMax:
  172. r = runeError
  173. fallthrough
  174. case i <= rune3Max:
  175. p = append(p, t3 | byte(r >> 12))
  176. p = append(p, tx | byte(r >> 6) & maskx)
  177. p = append(p, tx | byte(r) & maskx)
  178. return p
  179. default:
  180. p = append(p, t4 | byte(r >> 18))
  181. p = append(p, tx | byte(r >> 12) & maskx)
  182. p = append(p, tx | byte(r >> 6) & maskx)
  183. p = append(p, tx | byte(r) & maskx)
  184. return p
  185. }
  186. }