// Copyright 2018 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package text import ( "bytes" "io" "regexp" "strconv" "unicode/utf8" "google.golang.org/protobuf/internal/errors" "google.golang.org/protobuf/reflect/protoreflect" ) type syntaxError struct{ error } func newSyntaxError(f string, x ...interface{}) error { return syntaxError{errors.New(f, x...)} } // Unmarshal parses b as the proto text format. // It returns a Value, which is always of the Message type. func Unmarshal(b []byte) (Value, error) { p := decoder{in: b} p.consume(0) // trim leading spaces or comments v, err := p.unmarshalMessage(false) if err != nil { if e, ok := err.(syntaxError); ok { b = b[:len(b)-len(p.in)] // consumed input line := bytes.Count(b, []byte("\n")) + 1 if i := bytes.LastIndexByte(b, '\n'); i >= 0 { b = b[i+1:] } column := utf8.RuneCount(b) + 1 // ignore multi-rune characters err = errors.New("syntax error (line %d:%d): %v", line, column, e.error) } return Value{}, err } if len(p.in) > 0 { return Value{}, errors.New("%d bytes of unconsumed input", len(p.in)) } return v, nil } type decoder struct { in []byte } func (p *decoder) unmarshalList() (Value, error) { b := p.in var elems []Value if err := p.consumeChar('[', "at start of list"); err != nil { return Value{}, err } if len(p.in) > 0 && p.in[0] != ']' { for len(p.in) > 0 { v, err := p.unmarshalValue() if err != nil { return Value{}, err } elems = append(elems, v) if !p.tryConsumeChar(',') { break } } } if err := p.consumeChar(']', "at end of list"); err != nil { return Value{}, err } b = b[:len(b)-len(p.in)] return rawValueOf(elems, b[:len(b):len(b)]), nil } func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) { b := p.in var items [][2]Value delims := [2]byte{'{', '}'} if len(p.in) > 0 && p.in[0] == '<' { delims = [2]byte{'<', '>'} } if checkDelims { if err := p.consumeChar(delims[0], "at start of message"); err != nil { return Value{}, err } } for len(p.in) > 0 { if p.in[0] == '}' || p.in[0] == '>' { break } k, err := p.unmarshalKey() if err != nil { return Value{}, err } if !p.tryConsumeChar(':') && len(p.in) > 0 && p.in[0] != '{' && p.in[0] != '<' { return Value{}, newSyntaxError("expected ':' after message key") } v, err := p.unmarshalValue() if err != nil { return Value{}, err } if p.tryConsumeChar(';') || p.tryConsumeChar(',') { // always optional } items = append(items, [2]Value{k, v}) } if checkDelims { if err := p.consumeChar(delims[1], "at end of message"); err != nil { return Value{}, err } } b = b[:len(b)-len(p.in)] return rawValueOf(items, b[:len(b):len(b)]), nil } // unmarshalKey parses the key, which may be a Name, String, or Uint. func (p *decoder) unmarshalKey() (v Value, err error) { if p.tryConsumeChar('[') { if len(p.in) == 0 { return Value{}, io.ErrUnexpectedEOF } if p.in[0] == '\'' || p.in[0] == '"' { // Historically, Go's parser allowed a string for the Any type URL. // This is specific to Go and contrary to the C++ implementation, // which does not support strings for the Any type URL. v, err = p.unmarshalString() if err != nil { return Value{}, err } } else { v, err = p.unmarshalURL() if err != nil { return Value{}, err } } if err := p.consumeChar(']', "at end of extension name"); err != nil { return Value{}, err } return v, nil } v, err = p.unmarshalName() if err == nil { return v, nil } v, err = p.unmarshalNumberKey() if err == nil { return v, nil } return Value{}, err } // unmarshalURL parses an Any type URL string. The C++ parser does not handle // many legal URL strings. This implementation is more liberal and allows for // the pattern ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). func (p *decoder) unmarshalURL() (Value, error) { s := p.in var size int for len(s) > 0 && (s[0] == '-' || s[0] == '_' || ('0' <= s[0] && s[0] <= '9') || ('a' <= s[0] && s[0] <= 'z') || ('A' <= s[0] && s[0] <= 'Z')) { s = s[1:] size++ if len(s) > 0 && (s[0] == '/' || s[0] == '.') { s = s[1:] size++ } } // Last character cannot be '.' or '/'. // Next byte should either be a delimiter or it is at the end. if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' || (len(s) > 0 && !isDelim(s[0])) { return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in)) } v := rawValueOf(string(p.in[:size]), p.in[:size:size]) p.consume(size) return v, nil } // unmarshalNumberKey parses field number as key. Field numbers are non-negative // integers. func (p *decoder) unmarshalNumberKey() (Value, error) { num, ok := parseNumber(p.in) if !ok || num.neg || num.typ == numFloat { return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in)) } v, err := strconv.ParseUint(string(num.value), 0, 64) if err != nil { return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in)) } p.consume(num.size) return rawValueOf(v, num.value), nil } func (p *decoder) unmarshalValue() (Value, error) { if len(p.in) == 0 { return Value{}, io.ErrUnexpectedEOF } switch p.in[0] { case '"', '\'': return p.unmarshalStrings() case '[': return p.unmarshalList() case '{', '<': return p.unmarshalMessage(true) default: n, ok := consumeName(p.in) if ok && literals[string(p.in[:n])] == nil { v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n]) p.consume(n) return v, nil } return p.unmarshalNumber() } } // unmarshalName unmarshals an unquoted proto identifier. // Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*` // // E.g., `field_name` => ValueOf(protoreflect.Name("field_name")) func (p *decoder) unmarshalName() (Value, error) { n, ok := consumeName(p.in) if !ok { return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in)) } v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n]) p.consume(n) return v, nil } func consumeName(input []byte) (int, bool) { var n int s := input if len(s) == 0 { return 0, false } switch { case s[0] == '_', 'a' <= s[0] && s[0] <= 'z', 'A' <= s[0] && s[0] <= 'Z': s = s[1:] n++ default: return 0, false } for len(s) > 0 && (s[0] == '_' || 'a' <= s[0] && s[0] <= 'z' || 'A' <= s[0] && s[0] <= 'Z' || '0' <= s[0] && s[0] <= '9') { s = s[1:] n++ } if len(s) > 0 && !isDelim(s[0]) { return 0, false } return n, true } func (p *decoder) consumeChar(c byte, msg string) error { if p.tryConsumeChar(c) { return nil } if len(p.in) == 0 { return io.ErrUnexpectedEOF } return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg) } func (p *decoder) tryConsumeChar(c byte) bool { if len(p.in) > 0 && p.in[0] == c { p.consume(1) return true } return false } // consume consumes n bytes of input and any subsequent whitespace or comments. func (p *decoder) consume(n int) { p.in = p.in[n:] for len(p.in) > 0 { switch p.in[0] { case ' ', '\n', '\r', '\t': p.in = p.in[1:] case '#': if i := bytes.IndexByte(p.in, '\n'); i >= 0 { p.in = p.in[i+len("\n"):] } else { p.in = nil } default: return } } } // Any sequence that looks like a non-delimiter (for error reporting). var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`) // isDelim returns true if given byte is a delimiter character. func isDelim(c byte) bool { return !(c == '-' || c == '+' || c == '.' || c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')) }