string.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
  4. package text
  5. import (
  6. "bytes"
  7. "io"
  8. "math"
  9. "math/bits"
  10. "strconv"
  11. "strings"
  12. "unicode"
  13. "unicode/utf16"
  14. "unicode/utf8"
  15. "google.golang.org/protobuf/internal/errors"
  16. )
  17. func (p *encoder) marshalString(v Value) error {
  18. var err error
  19. p.out, err = appendString(p.out, v, p.outputASCII)
  20. return err
  21. }
  22. func appendString(out []byte, v Value, outputASCII bool) ([]byte, error) {
  23. if v.Type() != String {
  24. return nil, errors.New("invalid type %v, expected string", v.Type())
  25. }
  26. if len(v.raw) > 0 {
  27. return append(out, v.raw...), nil
  28. }
  29. in := v.String()
  30. out = append(out, '"')
  31. i := indexNeedEscapeInString(in)
  32. in, out = in[i:], append(out, in[:i]...)
  33. for len(in) > 0 {
  34. switch r, n := utf8.DecodeRuneInString(in); {
  35. case r == utf8.RuneError && n == 1:
  36. // We do not report invalid UTF-8 because strings in the text format
  37. // are used to represent both the proto string and bytes type.
  38. r = rune(in[0])
  39. fallthrough
  40. case r < ' ' || r == '"' || r == '\\':
  41. out = append(out, '\\')
  42. switch r {
  43. case '"', '\\':
  44. out = append(out, byte(r))
  45. case '\n':
  46. out = append(out, 'n')
  47. case '\r':
  48. out = append(out, 'r')
  49. case '\t':
  50. out = append(out, 't')
  51. default:
  52. out = append(out, 'x')
  53. out = append(out, "00"[1+(bits.Len32(uint32(r))-1)/4:]...)
  54. out = strconv.AppendUint(out, uint64(r), 16)
  55. }
  56. in = in[n:]
  57. case outputASCII && r >= utf8.RuneSelf:
  58. out = append(out, '\\')
  59. if r <= math.MaxUint16 {
  60. out = append(out, 'u')
  61. out = append(out, "0000"[1+(bits.Len32(uint32(r))-1)/4:]...)
  62. out = strconv.AppendUint(out, uint64(r), 16)
  63. } else {
  64. out = append(out, 'U')
  65. out = append(out, "00000000"[1+(bits.Len32(uint32(r))-1)/4:]...)
  66. out = strconv.AppendUint(out, uint64(r), 16)
  67. }
  68. in = in[n:]
  69. default:
  70. i := indexNeedEscapeInString(in[n:])
  71. in, out = in[n+i:], append(out, in[:n+i]...)
  72. }
  73. }
  74. out = append(out, '"')
  75. return out, nil
  76. }
  77. func (p *decoder) unmarshalString() (Value, error) {
  78. v, n, err := consumeString(p.in)
  79. p.consume(n)
  80. return v, err
  81. }
  82. func consumeString(in []byte) (Value, int, error) {
  83. in0 := in
  84. if len(in) == 0 {
  85. return Value{}, 0, io.ErrUnexpectedEOF
  86. }
  87. quote := in[0]
  88. if in[0] != '"' && in[0] != '\'' {
  89. return Value{}, 0, newSyntaxError("invalid character %q at start of string", in[0])
  90. }
  91. in = in[1:]
  92. i := indexNeedEscapeInBytes(in)
  93. in, out := in[i:], in[:i:i] // set cap to prevent mutations
  94. for len(in) > 0 {
  95. switch r, n := utf8.DecodeRune(in); {
  96. case r == utf8.RuneError && n == 1:
  97. return Value{}, 0, newSyntaxError("invalid UTF-8 detected")
  98. case r == 0 || r == '\n':
  99. return Value{}, 0, newSyntaxError("invalid character %q in string", r)
  100. case r == rune(quote):
  101. in = in[1:]
  102. n := len(in0) - len(in)
  103. v := rawValueOf(string(out), in0[:n:n])
  104. return v, n, nil
  105. case r == '\\':
  106. if len(in) < 2 {
  107. return Value{}, 0, io.ErrUnexpectedEOF
  108. }
  109. switch r := in[1]; r {
  110. case '"', '\'', '\\', '?':
  111. in, out = in[2:], append(out, r)
  112. case 'a':
  113. in, out = in[2:], append(out, '\a')
  114. case 'b':
  115. in, out = in[2:], append(out, '\b')
  116. case 'n':
  117. in, out = in[2:], append(out, '\n')
  118. case 'r':
  119. in, out = in[2:], append(out, '\r')
  120. case 't':
  121. in, out = in[2:], append(out, '\t')
  122. case 'v':
  123. in, out = in[2:], append(out, '\v')
  124. case 'f':
  125. in, out = in[2:], append(out, '\f')
  126. case '0', '1', '2', '3', '4', '5', '6', '7':
  127. // One, two, or three octal characters.
  128. n := len(in[1:]) - len(bytes.TrimLeft(in[1:], "01234567"))
  129. if n > 3 {
  130. n = 3
  131. }
  132. v, err := strconv.ParseUint(string(in[1:1+n]), 8, 8)
  133. if err != nil {
  134. return Value{}, 0, newSyntaxError("invalid octal escape code %q in string", in[:1+n])
  135. }
  136. in, out = in[1+n:], append(out, byte(v))
  137. case 'x':
  138. // One or two hexadecimal characters.
  139. n := len(in[2:]) - len(bytes.TrimLeft(in[2:], "0123456789abcdefABCDEF"))
  140. if n > 2 {
  141. n = 2
  142. }
  143. v, err := strconv.ParseUint(string(in[2:2+n]), 16, 8)
  144. if err != nil {
  145. return Value{}, 0, newSyntaxError("invalid hex escape code %q in string", in[:2+n])
  146. }
  147. in, out = in[2+n:], append(out, byte(v))
  148. case 'u', 'U':
  149. // Four or eight hexadecimal characters
  150. n := 6
  151. if r == 'U' {
  152. n = 10
  153. }
  154. if len(in) < n {
  155. return Value{}, 0, io.ErrUnexpectedEOF
  156. }
  157. v, err := strconv.ParseUint(string(in[2:n]), 16, 32)
  158. if utf8.MaxRune < v || err != nil {
  159. return Value{}, 0, newSyntaxError("invalid Unicode escape code %q in string", in[:n])
  160. }
  161. in = in[n:]
  162. r := rune(v)
  163. if utf16.IsSurrogate(r) {
  164. if len(in) < 6 {
  165. return Value{}, 0, io.ErrUnexpectedEOF
  166. }
  167. v, err := strconv.ParseUint(string(in[2:6]), 16, 16)
  168. r = utf16.DecodeRune(r, rune(v))
  169. if in[0] != '\\' || in[1] != 'u' || r == unicode.ReplacementChar || err != nil {
  170. return Value{}, 0, newSyntaxError("invalid Unicode escape code %q in string", in[:6])
  171. }
  172. in = in[6:]
  173. }
  174. out = append(out, string(r)...)
  175. default:
  176. return Value{}, 0, newSyntaxError("invalid escape code %q in string", in[:2])
  177. }
  178. default:
  179. i := indexNeedEscapeInBytes(in[n:])
  180. in, out = in[n+i:], append(out, in[:n+i]...)
  181. }
  182. }
  183. return Value{}, 0, io.ErrUnexpectedEOF
  184. }
  185. // unmarshalStrings unmarshals multiple strings.
  186. // This differs from unmarshalString since the text format allows
  187. // multiple back-to-back string literals where they are semantically treated
  188. // as a single large string with all values concatenated.
  189. //
  190. // E.g., `"foo" "bar" "baz"` => ValueOf("foobarbaz")
  191. func (p *decoder) unmarshalStrings() (Value, error) {
  192. // Note that the ending quote is sufficient to unambiguously mark the end
  193. // of a string. Thus, the text grammar does not require intervening
  194. // whitespace or control characters in-between strings.
  195. // Thus, the following is valid:
  196. // `"foo"'bar'"baz"` => ValueOf("foobarbaz")
  197. b := p.in
  198. var ss []string
  199. for len(p.in) > 0 && (p.in[0] == '"' || p.in[0] == '\'') {
  200. v, err := p.unmarshalString()
  201. if err != nil {
  202. return Value{}, err
  203. }
  204. ss = append(ss, v.String())
  205. }
  206. b = b[:len(b)-len(p.in)]
  207. return rawValueOf(strings.Join(ss, ""), b[:len(b):len(b)]), nil
  208. }
  209. // indexNeedEscapeInString returns the index of the character that needs
  210. // escaping. If no characters need escaping, this returns the input length.
  211. func indexNeedEscapeInString(s string) int {
  212. for i := 0; i < len(s); i++ {
  213. if c := s[i]; c < ' ' || c == '"' || c == '\'' || c == '\\' || c >= utf8.RuneSelf {
  214. return i
  215. }
  216. }
  217. return len(s)
  218. }
  219. // indexNeedEscapeInBytes returns the index of the character that needs
  220. // escaping. If no characters need escaping, this returns the input length.
  221. // TODO: Remove this duplicate function when https://golang.org/issue/31506 gets
  222. // resolved.
  223. func indexNeedEscapeInBytes(b []byte) int {
  224. for i := 0; i < len(b); {
  225. c, size := utf8.DecodeRune(b[i:])
  226. if c < ' ' || c == '"' || c == '\'' || c == '\\' || c >= utf8.RuneSelf {
  227. return i
  228. }
  229. i += size
  230. }
  231. return len(b)
  232. }