decode.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package text
  5. import (
  6. "bytes"
  7. "io"
  8. "regexp"
  9. "unicode/utf8"
  10. "github.com/golang/protobuf/v2/internal/errors"
  11. "github.com/golang/protobuf/v2/reflect/protoreflect"
  12. )
  13. type syntaxError struct{ error }
  14. func newSyntaxError(f string, x ...interface{}) error {
  15. return syntaxError{errors.New(f, x...)}
  16. }
  17. // Unmarshal parses b as the proto text format.
  18. // It returns a Value, which is always of the Message type.
  19. func Unmarshal(b []byte) (Value, error) {
  20. p := decoder{in: b}
  21. p.consume(0) // trim leading spaces or comments
  22. v, err := p.unmarshalMessage(false)
  23. if !p.nerr.Merge(err) {
  24. if e, ok := err.(syntaxError); ok {
  25. b = b[:len(b)-len(p.in)] // consumed input
  26. line := bytes.Count(b, []byte("\n")) + 1
  27. if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
  28. b = b[i+1:]
  29. }
  30. column := utf8.RuneCount(b) + 1 // ignore multi-rune characters
  31. err = errors.New("syntax error (line %d:%d): %v", line, column, e.error)
  32. }
  33. return Value{}, err
  34. }
  35. if len(p.in) > 0 {
  36. return Value{}, errors.New("%d bytes of unconsumed input", len(p.in))
  37. }
  38. return v, p.nerr.E
  39. }
  40. type decoder struct {
  41. nerr errors.NonFatal
  42. in []byte
  43. }
  44. func (p *decoder) unmarshalList() (Value, error) {
  45. b := p.in
  46. var elems []Value
  47. if err := p.consumeChar('[', "at start of list"); err != nil {
  48. return Value{}, err
  49. }
  50. if len(p.in) > 0 && p.in[0] != ']' {
  51. for len(p.in) > 0 {
  52. v, err := p.unmarshalValue()
  53. if !p.nerr.Merge(err) {
  54. return Value{}, err
  55. }
  56. elems = append(elems, v)
  57. if !p.tryConsumeChar(',') {
  58. break
  59. }
  60. }
  61. }
  62. if err := p.consumeChar(']', "at end of list"); err != nil {
  63. return Value{}, err
  64. }
  65. b = b[:len(b)-len(p.in)]
  66. return rawValueOf(elems, b[:len(b):len(b)]), nil
  67. }
  68. func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
  69. b := p.in
  70. var items [][2]Value
  71. delims := [2]byte{'{', '}'}
  72. if len(p.in) > 0 && p.in[0] == '<' {
  73. delims = [2]byte{'<', '>'}
  74. }
  75. if checkDelims {
  76. if err := p.consumeChar(delims[0], "at start of message"); err != nil {
  77. return Value{}, err
  78. }
  79. }
  80. for len(p.in) > 0 {
  81. if p.in[0] == '}' || p.in[0] == '>' {
  82. break
  83. }
  84. k, err := p.unmarshalKey()
  85. if !p.nerr.Merge(err) {
  86. return Value{}, err
  87. }
  88. if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' {
  89. return Value{}, newSyntaxError("expected ':' after message key")
  90. }
  91. v, err := p.unmarshalValue()
  92. if !p.nerr.Merge(err) {
  93. return Value{}, err
  94. }
  95. if p.tryConsumeChar(';') || p.tryConsumeChar(',') {
  96. // always optional
  97. }
  98. items = append(items, [2]Value{k, v})
  99. }
  100. if checkDelims {
  101. if err := p.consumeChar(delims[1], "at end of message"); err != nil {
  102. return Value{}, err
  103. }
  104. }
  105. b = b[:len(b)-len(p.in)]
  106. return rawValueOf(items, b[:len(b):len(b)]), nil
  107. }
  // urlRegexp matches the bracketed type-URL form of a message key.
  //
  // The pattern accepts more than ConsumeAnyTypeUrl in C++ does; the C++
  // parser, however, rejects many legal URL strings. Go stays liberal here
  // for backwards compatibility with the historical Go implementation,
  // which was overly liberal (and buggy).
  var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
  113. // unmarshalKey parses the key, which may be a Name, String, or Uint.
  114. func (p *decoder) unmarshalKey() (v Value, err error) {
  115. if p.tryConsumeChar('[') {
  116. if len(p.in) == 0 {
  117. return Value{}, io.ErrUnexpectedEOF
  118. }
  119. if p.in[0] == '\'' || p.in[0] == '"' {
  120. // Historically, Go's parser allowed a string for the Any type URL.
  121. // This is specific to Go and contrary to the C++ implementation,
  122. // which does not support strings for the Any type URL.
  123. v, err = p.unmarshalString()
  124. if !p.nerr.Merge(err) {
  125. return Value{}, err
  126. }
  127. } else if n := matchWithDelim(urlRegexp, p.in); n > 0 {
  128. v = rawValueOf(string(p.in[:n]), p.in[:n:n])
  129. p.consume(n)
  130. } else {
  131. return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
  132. }
  133. if err := p.consumeChar(']', "at end of extension name"); err != nil {
  134. return Value{}, err
  135. }
  136. return v, nil
  137. }
  138. if matchWithDelim(intRegexp, p.in) > 0 && p.in[0] != '-' {
  139. return p.unmarshalNumber()
  140. }
  141. return p.unmarshalName()
  142. }
  143. func (p *decoder) unmarshalValue() (Value, error) {
  144. if len(p.in) == 0 {
  145. return Value{}, io.ErrUnexpectedEOF
  146. }
  147. switch p.in[0] {
  148. case '"', '\'':
  149. return p.unmarshalStrings()
  150. case '[':
  151. return p.unmarshalList()
  152. case '{', '<':
  153. return p.unmarshalMessage(true)
  154. default:
  155. n := matchWithDelim(nameRegexp, p.in) // zero if no match
  156. if n > 0 && literals[string(p.in[:n])] == nil {
  157. return p.unmarshalName()
  158. }
  159. return p.unmarshalNumber()
  160. }
  161. }
  // nameRegexp matches every valid proto identifier at the start of the
  // input: a letter or underscore followed by letters, digits or underscores.
  var nameRegexp = regexp.MustCompile(`^[_a-zA-Z][_a-zA-Z0-9]*`)
  164. // unmarshalName unmarshals an unquoted identifier.
  165. //
  166. // E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
  167. func (p *decoder) unmarshalName() (Value, error) {
  168. if n := matchWithDelim(nameRegexp, p.in); n > 0 {
  169. v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
  170. p.consume(n)
  171. return v, nil
  172. }
  173. return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
  174. }
  175. func (p *decoder) consumeChar(c byte, msg string) error {
  176. if p.tryConsumeChar(c) {
  177. return nil
  178. }
  179. if len(p.in) == 0 {
  180. return io.ErrUnexpectedEOF
  181. }
  182. return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
  183. }
  184. func (p *decoder) tryConsumeChar(c byte) bool {
  185. if len(p.in) > 0 && p.in[0] == c {
  186. p.consume(1)
  187. return true
  188. }
  189. return false
  190. }
  191. // consume consumes n bytes of input and any subsequent whitespace or comments.
  192. func (p *decoder) consume(n int) {
  193. p.in = p.in[n:]
  194. for len(p.in) > 0 {
  195. switch p.in[0] {
  196. case ' ', '\n', '\r', '\t':
  197. p.in = p.in[1:]
  198. case '#':
  199. if i := bytes.IndexByte(p.in, '\n'); i >= 0 {
  200. p.in = p.in[i+len("\n"):]
  201. } else {
  202. p.in = nil
  203. }
  204. default:
  205. return
  206. }
  207. }
  208. }
  // errRegexp extracts the offending token for error messages: up to 32
  // leading characters that look like a non-delimiter, or else a single
  // character.
  var errRegexp = regexp.MustCompile("^([-+._a-zA-Z0-9]{1,32}|.)")
  // matchWithDelim matches r with the input b and verifies that the match
  // terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
  // As a special case, EOF is considered a delimiter.
  //
  // It returns the length of the match, or zero when r does not match or
  // when the match is directly followed by a non-delimiter byte.
  func matchWithDelim(r *regexp.Regexp, b []byte) int {
  	n := len(r.Find(b))
  	if n >= len(b) {
  		// The match runs to the end of the input.
  		return n
  	}
  	// The byte right after the match must be a delimiter.
  	switch c := b[n]; {
  	case c == '-', c == '+', c == '.', c == '_':
  		return 0
  	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
  		return 0
  	}
  	return n
  }
  228. }