decode.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package text
  5. import (
  6. "bytes"
  7. "io"
  8. "regexp"
  9. "strconv"
  10. "unicode/utf8"
  11. "google.golang.org/protobuf/internal/errors"
  12. "google.golang.org/protobuf/reflect/protoreflect"
  13. )
  // syntaxError marks an error as a text-format syntax problem by wrapping
  // the underlying error; Unmarshal type-asserts on it to decide whether
  // line and column information should be attached.
  type syntaxError struct {
  	error
  }
  15. func newSyntaxError(f string, x ...interface{}) error {
  16. return syntaxError{errors.New(f, x...)}
  17. }
  18. // Unmarshal parses b as the proto text format.
  19. // It returns a Value, which is always of the Message type.
  20. func Unmarshal(b []byte) (Value, error) {
  21. p := decoder{in: b}
  22. p.consume(0) // trim leading spaces or comments
  23. v, err := p.unmarshalMessage(false)
  24. if err != nil {
  25. if e, ok := err.(syntaxError); ok {
  26. b = b[:len(b)-len(p.in)] // consumed input
  27. line := bytes.Count(b, []byte("\n")) + 1
  28. if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
  29. b = b[i+1:]
  30. }
  31. column := utf8.RuneCount(b) + 1 // ignore multi-rune characters
  32. err = errors.New("syntax error (line %d:%d): %v", line, column, e.error)
  33. }
  34. return Value{}, err
  35. }
  36. if len(p.in) > 0 {
  37. return Value{}, errors.New("%d bytes of unconsumed input", len(p.in))
  38. }
  39. return v, nil
  40. }
  // decoder carries the parser state: in holds the input bytes that have not
  // been consumed yet.
  type decoder struct {
  	in []byte
  }
  44. func (p *decoder) unmarshalList() (Value, error) {
  45. b := p.in
  46. var elems []Value
  47. if err := p.consumeChar('[', "at start of list"); err != nil {
  48. return Value{}, err
  49. }
  50. if len(p.in) > 0 && p.in[0] != ']' {
  51. for len(p.in) > 0 {
  52. v, err := p.unmarshalValue()
  53. if err != nil {
  54. return Value{}, err
  55. }
  56. elems = append(elems, v)
  57. if !p.tryConsumeChar(',') {
  58. break
  59. }
  60. }
  61. }
  62. if err := p.consumeChar(']', "at end of list"); err != nil {
  63. return Value{}, err
  64. }
  65. b = b[:len(b)-len(p.in)]
  66. return rawValueOf(elems, b[:len(b):len(b)]), nil
  67. }
  68. func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
  69. b := p.in
  70. var items [][2]Value
  71. delims := [2]byte{'{', '}'}
  72. if len(p.in) > 0 && p.in[0] == '<' {
  73. delims = [2]byte{'<', '>'}
  74. }
  75. if checkDelims {
  76. if err := p.consumeChar(delims[0], "at start of message"); err != nil {
  77. return Value{}, err
  78. }
  79. }
  80. for len(p.in) > 0 {
  81. if p.in[0] == '}' || p.in[0] == '>' {
  82. break
  83. }
  84. k, err := p.unmarshalKey()
  85. if err != nil {
  86. return Value{}, err
  87. }
  88. if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' {
  89. return Value{}, newSyntaxError("expected ':' after message key")
  90. }
  91. v, err := p.unmarshalValue()
  92. if err != nil {
  93. return Value{}, err
  94. }
  95. if p.tryConsumeChar(';') || p.tryConsumeChar(',') {
  96. // always optional
  97. }
  98. items = append(items, [2]Value{k, v})
  99. }
  100. if checkDelims {
  101. if err := p.consumeChar(delims[1], "at end of message"); err != nil {
  102. return Value{}, err
  103. }
  104. }
  105. b = b[:len(b)-len(p.in)]
  106. return rawValueOf(items, b[:len(b):len(b)]), nil
  107. }
  108. // unmarshalKey parses the key, which may be a Name, String, or Uint.
  109. func (p *decoder) unmarshalKey() (v Value, err error) {
  110. if p.tryConsumeChar('[') {
  111. if len(p.in) == 0 {
  112. return Value{}, io.ErrUnexpectedEOF
  113. }
  114. if p.in[0] == '\'' || p.in[0] == '"' {
  115. // Historically, Go's parser allowed a string for the Any type URL.
  116. // This is specific to Go and contrary to the C++ implementation,
  117. // which does not support strings for the Any type URL.
  118. v, err = p.unmarshalString()
  119. if err != nil {
  120. return Value{}, err
  121. }
  122. } else {
  123. v, err = p.unmarshalURL()
  124. if err != nil {
  125. return Value{}, err
  126. }
  127. }
  128. if err := p.consumeChar(']', "at end of extension name"); err != nil {
  129. return Value{}, err
  130. }
  131. return v, nil
  132. }
  133. v, err = p.unmarshalName()
  134. if err == nil {
  135. return v, nil
  136. }
  137. v, err = p.unmarshalNumberKey()
  138. if err == nil {
  139. return v, nil
  140. }
  141. return Value{}, err
  142. }
  143. // unmarshalURL parses an Any type URL string. The C++ parser does not handle
  144. // many legal URL strings. This implementation is more liberal and allows for
  145. // the pattern ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`).
  146. func (p *decoder) unmarshalURL() (Value, error) {
  147. s := p.in
  148. var size int
  149. for len(s) > 0 && (s[0] == '-' || s[0] == '_' ||
  150. ('0' <= s[0] && s[0] <= '9') ||
  151. ('a' <= s[0] && s[0] <= 'z') ||
  152. ('A' <= s[0] && s[0] <= 'Z')) {
  153. s = s[1:]
  154. size++
  155. if len(s) > 0 && (s[0] == '/' || s[0] == '.') {
  156. s = s[1:]
  157. size++
  158. }
  159. }
  160. // Last character cannot be '.' or '/'.
  161. // Next byte should either be a delimiter or it is at the end.
  162. if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' ||
  163. (len(s) > 0 && !isDelim(s[0])) {
  164. return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
  165. }
  166. v := rawValueOf(string(p.in[:size]), p.in[:size:size])
  167. p.consume(size)
  168. return v, nil
  169. }
  170. // unmarshalNumberKey parses field number as key. Field numbers are non-negative
  171. // integers.
  172. func (p *decoder) unmarshalNumberKey() (Value, error) {
  173. num, ok := parseNumber(p.in)
  174. if !ok || num.neg || num.typ == numFloat {
  175. return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
  176. }
  177. v, err := strconv.ParseUint(string(num.value), 0, 64)
  178. if err != nil {
  179. return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
  180. }
  181. p.consume(num.size)
  182. return rawValueOf(v, num.value), nil
  183. }
  184. func (p *decoder) unmarshalValue() (Value, error) {
  185. if len(p.in) == 0 {
  186. return Value{}, io.ErrUnexpectedEOF
  187. }
  188. switch p.in[0] {
  189. case '"', '\'':
  190. return p.unmarshalStrings()
  191. case '[':
  192. return p.unmarshalList()
  193. case '{', '<':
  194. return p.unmarshalMessage(true)
  195. default:
  196. n, ok := consumeName(p.in)
  197. if ok && literals[string(p.in[:n])] == nil {
  198. v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
  199. p.consume(n)
  200. return v, nil
  201. }
  202. return p.unmarshalNumber()
  203. }
  204. }
  205. // unmarshalName unmarshals an unquoted proto identifier.
  206. // Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
  207. //
  208. // E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
  209. func (p *decoder) unmarshalName() (Value, error) {
  210. n, ok := consumeName(p.in)
  211. if !ok {
  212. return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
  213. }
  214. v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
  215. p.consume(n)
  216. return v, nil
  217. }
  218. func consumeName(input []byte) (int, bool) {
  219. var n int
  220. s := input
  221. if len(s) == 0 {
  222. return 0, false
  223. }
  224. switch {
  225. case s[0] == '_',
  226. 'a' <= s[0] && s[0] <= 'z',
  227. 'A' <= s[0] && s[0] <= 'Z':
  228. s = s[1:]
  229. n++
  230. default:
  231. return 0, false
  232. }
  233. for len(s) > 0 && (s[0] == '_' ||
  234. 'a' <= s[0] && s[0] <= 'z' ||
  235. 'A' <= s[0] && s[0] <= 'Z' ||
  236. '0' <= s[0] && s[0] <= '9') {
  237. s = s[1:]
  238. n++
  239. }
  240. if len(s) > 0 && !isDelim(s[0]) {
  241. return 0, false
  242. }
  243. return n, true
  244. }
  245. func (p *decoder) consumeChar(c byte, msg string) error {
  246. if p.tryConsumeChar(c) {
  247. return nil
  248. }
  249. if len(p.in) == 0 {
  250. return io.ErrUnexpectedEOF
  251. }
  252. return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
  253. }
  254. func (p *decoder) tryConsumeChar(c byte) bool {
  255. if len(p.in) > 0 && p.in[0] == c {
  256. p.consume(1)
  257. return true
  258. }
  259. return false
  260. }
  261. // consume consumes n bytes of input and any subsequent whitespace or comments.
  262. func (p *decoder) consume(n int) {
  263. p.in = p.in[n:]
  264. for len(p.in) > 0 {
  265. switch p.in[0] {
  266. case ' ', '\n', '\r', '\t':
  267. p.in = p.in[1:]
  268. case '#':
  269. if i := bytes.IndexByte(p.in, '\n'); i >= 0 {
  270. p.in = p.in[i+len("\n"):]
  271. } else {
  272. p.in = nil
  273. }
  274. default:
  275. return
  276. }
  277. }
  278. }
  // errRegexp extracts the offending token for error messages: the longest
  // leading run of non-delimiter-looking bytes (letters, digits, and any of
  // "-+._/"), or otherwise the single leading character.
  var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
  // isDelim returns true if given byte is a delimiter character, i.e. it is
  // anything other than an ASCII letter, an ASCII digit, or one of
  // '-', '+', '.', '_'.
  func isDelim(c byte) bool {
  	switch {
  	case c == '-', c == '+', c == '.', c == '_':
  		return false
  	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
  		return false
  	}
  	return true
  }