feature_iter_string.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. package jsoniter
  2. import (
  3. "unicode/utf16"
  4. )
  5. // ReadString read string from iterator
  6. func (iter *Iterator) ReadString() (ret string) {
  7. c := iter.nextToken()
  8. if c == '"' {
  9. for i := iter.head; i < iter.tail; i++ {
  10. c := iter.buf[i]
  11. if c == '"' {
  12. ret = string(iter.buf[iter.head:i])
  13. iter.head = i + 1
  14. return ret
  15. } else if c == '\\' {
  16. break
  17. }
  18. }
  19. return iter.readStringSlowPath()
  20. } else if c == 'n' {
  21. iter.skipFixedBytes(3)
  22. return ""
  23. }
  24. iter.ReportError("ReadString", `expects " or n`)
  25. return
  26. }
  27. func (iter *Iterator) readStringSlowPath() (ret string) {
  28. var str []byte
  29. var c byte
  30. for iter.Error == nil {
  31. c = iter.readByte()
  32. if c == '"' {
  33. return string(str)
  34. }
  35. if c == '\\' {
  36. c = iter.readByte()
  37. switch c {
  38. case 'u', 'U':
  39. r := iter.readU4()
  40. if utf16.IsSurrogate(r) {
  41. c = iter.readByte()
  42. if iter.Error != nil {
  43. return
  44. }
  45. if c != '\\' {
  46. iter.ReportError("ReadString",
  47. `expects \u after utf16 surrogate, but \ not found`)
  48. return
  49. }
  50. c = iter.readByte()
  51. if iter.Error != nil {
  52. return
  53. }
  54. if c != 'u' && c != 'U' {
  55. iter.ReportError("ReadString",
  56. `expects \u after utf16 surrogate, but \u not found`)
  57. return
  58. }
  59. r2 := iter.readU4()
  60. if iter.Error != nil {
  61. return
  62. }
  63. combined := utf16.DecodeRune(r, r2)
  64. str = appendRune(str, combined)
  65. } else {
  66. str = appendRune(str, r)
  67. }
  68. case '"':
  69. str = append(str, '"')
  70. case '\\':
  71. str = append(str, '\\')
  72. case '/':
  73. str = append(str, '/')
  74. case 'b':
  75. str = append(str, '\b')
  76. case 'f':
  77. str = append(str, '\f')
  78. case 'n':
  79. str = append(str, '\n')
  80. case 'r':
  81. str = append(str, '\r')
  82. case 't':
  83. str = append(str, '\t')
  84. default:
  85. iter.ReportError("ReadString",
  86. `invalid escape char after \`)
  87. return
  88. }
  89. } else {
  90. str = append(str, c)
  91. }
  92. }
  93. iter.ReportError("ReadString", "unexpected end of input")
  94. return
  95. }
  96. // ReadStringAsSlice read string from iterator without copying into string form.
  97. // The []byte can not be kept, as it will change after next iterator call.
  98. func (iter *Iterator) ReadStringAsSlice() (ret []byte) {
  99. c := iter.nextToken()
  100. if c == '"' {
  101. for i := iter.head; i < iter.tail; i++ {
  102. // require ascii string and no escape
  103. // for: field name, base64, number
  104. if iter.buf[i] == '"' {
  105. // fast path: reuse the underlying buffer
  106. ret = iter.buf[iter.head:i]
  107. iter.head = i + 1
  108. return ret
  109. }
  110. }
  111. readLen := iter.tail - iter.head
  112. copied := make([]byte, readLen, readLen*2)
  113. copy(copied, iter.buf[iter.head:iter.tail])
  114. iter.head = iter.tail
  115. for iter.Error == nil {
  116. c := iter.readByte()
  117. if c == '"' {
  118. return copied
  119. }
  120. copied = append(copied, c)
  121. }
  122. return copied
  123. }
  124. iter.ReportError("ReadString", `expects " or n`)
  125. return
  126. }
  127. func (iter *Iterator) readU4() (ret rune) {
  128. for i := 0; i < 4; i++ {
  129. c := iter.readByte()
  130. if iter.Error != nil {
  131. return
  132. }
  133. if c >= '0' && c <= '9' {
  134. ret = ret*16 + rune(c-'0')
  135. } else if c >= 'a' && c <= 'f' {
  136. ret = ret*16 + rune(c-'a'+10)
  137. } else if c >= 'A' && c <= 'F' {
  138. ret = ret*16 + rune(c-'A'+10)
  139. } else {
  140. iter.ReportError("readU4", "expects 0~9 or a~f")
  141. return
  142. }
  143. }
  144. return ret
  145. }
  // Bit patterns used to build UTF-8 sequences. t1, t5, mask2, mask3 and mask4
  // are not referenced by appendRune below.
  const (
  	t1 = 0x00 // 0000 0000
  	tx = 0x80 // 1000 0000, marker of a continuation byte
  	t2 = 0xC0 // 1100 0000, lead byte of a 2-byte sequence
  	t3 = 0xE0 // 1110 0000, lead byte of a 3-byte sequence
  	t4 = 0xF0 // 1111 0000, lead byte of a 4-byte sequence
  	t5 = 0xF8 // 1111 1000

  	maskx = 0x3F // 0011 1111, payload bits of a continuation byte
  	mask2 = 0x1F // 0001 1111
  	mask3 = 0x0F // 0000 1111
  	mask4 = 0x07 // 0000 0111
  )

  // Largest code points that fit in a 1-, 2- and 3-byte sequence.
  const (
  	rune1Max = 1<<7 - 1
  	rune2Max = 1<<11 - 1
  	rune3Max = 1<<16 - 1
  )

  // Code point limits and the substitute written for invalid input.
  const (
  	surrogateMin = 0xD800
  	surrogateMax = 0xDFFF

  	maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
  	runeError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
  )

  // appendRune appends the UTF-8 encoding of r to p and returns the extended
  // slice. A value that is negative, greater than maxRune, or inside the
  // surrogate range is encoded as runeError (U+FFFD) instead.
  func appendRune(p []byte, r rune) []byte {
  	// Viewed as unsigned, a negative rune becomes a huge value and is
  	// rejected by the maxRune comparison below.
  	code := uint32(r)

  	if code <= rune1Max {
  		return append(p, byte(r))
  	}
  	if code <= rune2Max {
  		return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
  	}

  	invalid := code > maxRune || (surrogateMin <= code && code <= surrogateMax)
  	if invalid {
  		// The replacement character is written as a 3-byte sequence below.
  		r = runeError
  	} else if code > rune3Max {
  		return append(p,
  			t4|byte(r>>18),
  			tx|byte(r>>12)&maskx,
  			tx|byte(r>>6)&maskx,
  			tx|byte(r)&maskx,
  		)
  	}
  	return append(p,
  		t3|byte(r>>12),
  		tx|byte(r>>6)&maskx,
  		tx|byte(r)&maskx,
  	)
  }