string.go 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package text
  5. import (
  6. "bytes"
  7. "io"
  8. "math"
  9. "math/bits"
  10. "strconv"
  11. "strings"
  12. "unicode"
  13. "unicode/utf16"
  14. "unicode/utf8"
  15. "github.com/golang/protobuf/v2/internal/errors"
  16. )
  17. func (p *encoder) marshalString(v Value) error {
  18. var err error
  19. p.out, err = appendString(p.out, v, p.outputASCII)
  20. return err
  21. }
  22. func appendString(out []byte, v Value, outputASCII bool) ([]byte, error) {
  23. if v.Type() != String {
  24. return nil, errors.New("invalid type %v, expected string", v.Type())
  25. }
  26. if len(v.raw) > 0 {
  27. return append(out, v.raw...), nil
  28. }
  29. in := v.String()
  30. out = append(out, '"')
  31. i := indexNeedEscape(in)
  32. in, out = in[i:], append(out, in[:i]...)
  33. for len(in) > 0 {
  34. switch r, n := utf8.DecodeRuneInString(in); {
  35. case r == utf8.RuneError && n == 1:
  36. // We do not report invalid UTF-8 because strings in the text format
  37. // are used to represent both the proto string and bytes type.
  38. r = rune(in[0])
  39. fallthrough
  40. case r < ' ' || r == '"' || r == '\\':
  41. out = append(out, '\\')
  42. switch r {
  43. case '"', '\\':
  44. out = append(out, byte(r))
  45. case '\n':
  46. out = append(out, 'n')
  47. case '\r':
  48. out = append(out, 'r')
  49. case '\t':
  50. out = append(out, 't')
  51. default:
  52. out = append(out, 'x')
  53. out = append(out, "00"[1+(bits.Len32(uint32(r))-1)/4:]...)
  54. out = strconv.AppendUint(out, uint64(r), 16)
  55. }
  56. in = in[n:]
  57. case outputASCII && r >= utf8.RuneSelf:
  58. out = append(out, '\\')
  59. if r <= math.MaxUint16 {
  60. out = append(out, 'u')
  61. out = append(out, "0000"[1+(bits.Len32(uint32(r))-1)/4:]...)
  62. out = strconv.AppendUint(out, uint64(r), 16)
  63. } else {
  64. out = append(out, 'U')
  65. out = append(out, "00000000"[1+(bits.Len32(uint32(r))-1)/4:]...)
  66. out = strconv.AppendUint(out, uint64(r), 16)
  67. }
  68. in = in[n:]
  69. default:
  70. i := indexNeedEscape(in[n:])
  71. in, out = in[n+i:], append(out, in[:n+i]...)
  72. }
  73. }
  74. out = append(out, '"')
  75. return out, nil
  76. }
  77. func (p *decoder) unmarshalString() (Value, error) {
  78. v, n, err := consumeString(p.in)
  79. p.consume(n)
  80. return v, err
  81. }
  82. func consumeString(in []byte) (Value, int, error) {
  83. var nerr errors.NonFatal
  84. in0 := in
  85. if len(in) == 0 {
  86. return Value{}, 0, io.ErrUnexpectedEOF
  87. }
  88. quote := in[0]
  89. if in[0] != '"' && in[0] != '\'' {
  90. return Value{}, 0, newSyntaxError("invalid character %q at start of string", in[0])
  91. }
  92. in = in[1:]
  93. i := indexNeedEscape(string(in))
  94. in, out := in[i:], in[:i:i] // set cap to prevent mutations
  95. for len(in) > 0 {
  96. switch r, n := utf8.DecodeRune(in); {
  97. case r == utf8.RuneError && n == 1:
  98. nerr.AppendInvalidUTF8("")
  99. in, out = in[1:], append(out, in[0]) // preserve invalid byte
  100. case r == 0 || r == '\n':
  101. return Value{}, 0, newSyntaxError("invalid character %q in string", r)
  102. case r == rune(quote):
  103. in = in[1:]
  104. n := len(in0) - len(in)
  105. v := rawValueOf(string(out), in0[:n:n])
  106. return v, n, nerr.E
  107. case r == '\\':
  108. if len(in) < 2 {
  109. return Value{}, 0, io.ErrUnexpectedEOF
  110. }
  111. switch r := in[1]; r {
  112. case '"', '\'', '\\', '?':
  113. in, out = in[2:], append(out, r)
  114. case 'a':
  115. in, out = in[2:], append(out, '\a')
  116. case 'b':
  117. in, out = in[2:], append(out, '\b')
  118. case 'n':
  119. in, out = in[2:], append(out, '\n')
  120. case 'r':
  121. in, out = in[2:], append(out, '\r')
  122. case 't':
  123. in, out = in[2:], append(out, '\t')
  124. case 'v':
  125. in, out = in[2:], append(out, '\v')
  126. case 'f':
  127. in, out = in[2:], append(out, '\f')
  128. case '0', '1', '2', '3', '4', '5', '6', '7':
  129. // One, two, or three octal characters.
  130. n := len(in[1:]) - len(bytes.TrimLeft(in[1:], "01234567"))
  131. if n > 3 {
  132. n = 3
  133. }
  134. v, err := strconv.ParseUint(string(in[1:1+n]), 8, 8)
  135. if err != nil {
  136. return Value{}, 0, newSyntaxError("invalid octal escape code %q in string", in[:1+n])
  137. }
  138. in, out = in[1+n:], append(out, byte(v))
  139. case 'x':
  140. // One or two hexadecimal characters.
  141. n := len(in[2:]) - len(bytes.TrimLeft(in[2:], "0123456789abcdefABCDEF"))
  142. if n > 2 {
  143. n = 2
  144. }
  145. v, err := strconv.ParseUint(string(in[2:2+n]), 16, 8)
  146. if err != nil {
  147. return Value{}, 0, newSyntaxError("invalid hex escape code %q in string", in[:2+n])
  148. }
  149. in, out = in[2+n:], append(out, byte(v))
  150. case 'u', 'U':
  151. // Four or eight hexadecimal characters
  152. n := 6
  153. if r == 'U' {
  154. n = 10
  155. }
  156. if len(in) < n {
  157. return Value{}, 0, io.ErrUnexpectedEOF
  158. }
  159. v, err := strconv.ParseUint(string(in[2:n]), 16, 32)
  160. if utf8.MaxRune < v || err != nil {
  161. return Value{}, 0, newSyntaxError("invalid Unicode escape code %q in string", in[:n])
  162. }
  163. in = in[n:]
  164. r := rune(v)
  165. if utf16.IsSurrogate(r) {
  166. if len(in) < 6 {
  167. return Value{}, 0, io.ErrUnexpectedEOF
  168. }
  169. v, err := strconv.ParseUint(string(in[2:6]), 16, 16)
  170. r = utf16.DecodeRune(r, rune(v))
  171. if in[0] != '\\' || in[1] != 'u' || r == unicode.ReplacementChar || err != nil {
  172. return Value{}, 0, newSyntaxError("invalid Unicode escape code %q in string", in[:6])
  173. }
  174. in = in[6:]
  175. }
  176. out = append(out, string(r)...)
  177. default:
  178. return Value{}, 0, newSyntaxError("invalid escape code %q in string", in[:2])
  179. }
  180. default:
  181. i := indexNeedEscape(string(in[n:]))
  182. in, out = in[n+i:], append(out, in[:n+i]...)
  183. }
  184. }
  185. return Value{}, 0, io.ErrUnexpectedEOF
  186. }
  187. // unmarshalStrings unmarshals multiple strings.
  188. // This differs from unmarshalString since the text format allows
  189. // multiple back-to-back string literals where they are semantically treated
  190. // as a single large string with all values concatenated.
  191. //
  192. // E.g., `"foo" "bar" "baz"` => ValueOf("foobarbaz")
  193. func (p *decoder) unmarshalStrings() (Value, error) {
  194. // Note that the ending quote is sufficient to unambiguously mark the end
  195. // of a string. Thus, the text grammar does not require intervening
  196. // whitespace or control characters in-between strings.
  197. // Thus, the following is valid:
  198. // `"foo"'bar'"baz"` => ValueOf("foobarbaz")
  199. b := p.in
  200. var ss []string
  201. for len(p.in) > 0 && (p.in[0] == '"' || p.in[0] == '\'') {
  202. v, err := p.unmarshalString()
  203. if !p.nerr.Merge(err) {
  204. return Value{}, err
  205. }
  206. ss = append(ss, v.String())
  207. }
  208. b = b[:len(b)-len(p.in)]
  209. return rawValueOf(strings.Join(ss, ""), b[:len(b):len(b)]), nil
  210. }
  211. // indexNeedEscape returns the index of the next character that needs escaping.
  212. // If no characters need escaping, this returns the input length.
  213. func indexNeedEscape(s string) int {
  214. for i := 0; i < len(s); i++ {
  215. if c := s[i]; c < ' ' || c == '"' || c == '\'' || c == '\\' || c >= utf8.RuneSelf {
  216. return i
  217. }
  218. }
  219. return len(s)
  220. }