Explorar el Código

goprotobuf: Standalone parser for text format strings.

This fixes some errors with escaping and with \u and \U.

R=r
CC=golang-dev
http://codereview.appspot.com/6555054
David Symonds hace 13 años
padre
commit
fa94a1e2bc
Se han modificado 3 ficheros con 167 adiciones y 25 borrados
  1. proto/text_parser.go (+122 -24)
  2. proto/text_parser_test.go (+27 -0)
  3. proto/text_test.go (+18 -1)

+ 122 - 24
proto/text_parser.go

@@ -35,10 +35,12 @@ package proto
 // TODO: message sets.
 
 import (
+	"errors"
 	"fmt"
 	"reflect"
 	"strconv"
 	"strings"
+	"unicode/utf8"
 )
 
 type ParseError struct {
@@ -169,7 +171,7 @@ func (p *textParser) advance() {
 			p.errorf("unmatched quote")
 			return
 		}
-		unq, err := unquoteC(p.s[0 : i+1])
+		unq, err := unquoteC(p.s[1:i], rune(p.s[0]))
 		if err != nil {
 			p.errorf("invalid quoted string %v", p.s[0:i+1])
 			return
@@ -190,35 +192,131 @@ func (p *textParser) advance() {
 	p.offset += len(p.cur.value)
 }
 
-// quoteSwap returns a single quote for a double quote, and vice versa.
-// It is intended to be used with strings.Map.
-func quoteSwap(r rune) rune {
-	switch r {
-	case '\'':
-		return '"'
-	case '"':
-		return '\''
+var (
+	errBadUTF8 = errors.New("bad UTF-8")
+	errBadHex  = errors.New("bad hexadecimal")
+)
+
+func unquoteC(s string, quote rune) (string, error) {
+	// This is based on C++'s tokenizer.cc.
+	// Despite its name, this is *not* parsing C syntax.
+	// For instance, "\0" is an invalid quoted string.
+
+	// Avoid allocation in trivial cases.
+	simple := true
+	for _, r := range s {
+		if r == '\\' || r == quote {
+			simple = false
+			break
+		}
 	}
-	return r
+	if simple {
+		return s, nil
+	}
+
+	buf := make([]byte, 0, 3*len(s)/2)
+	for len(s) > 0 {
+		r, n := utf8.DecodeRuneInString(s)
+		if r == utf8.RuneError && n == 1 {
+			return "", errBadUTF8
+		}
+		s = s[n:]
+		if r != '\\' {
+			if r < utf8.RuneSelf {
+				buf = append(buf, byte(r))
+			} else {
+				buf = append(buf, string(r)...)
+			}
+			continue
+		}
+
+		ch, tail, err := unescape(s)
+		if err != nil {
+			return "", err
+		}
+		buf = append(buf, ch...)
+		s = tail
+	}
+	return string(buf), nil
 }
 
-func unquoteC(s string) (string, error) {
-	// TODO: This is getting hacky. We should replace it work a self-contained parser.
+func unescape(s string) (ch string, tail string, err error) {
+	r, n := utf8.DecodeRuneInString(s)
+	if r == utf8.RuneError && n == 1 {
+		return "", "", errBadUTF8
+	}
+	s = s[n:]
+	switch r {
+	case 'a':
+		return "\a", s, nil
+	case 'b':
+		return "\b", s, nil
+	case 'f':
+		return "\f", s, nil
+	case 'n':
+		return "\n", s, nil
+	case 'r':
+		return "\r", s, nil
+	case 't':
+		return "\t", s, nil
+	case 'v':
+		return "\v", s, nil
+	case '?':
+		return "?", s, nil // trigraph workaround
+	case '\'', '"', '\\':
+		return string(r), s, nil
+	case '0', '1', '2', '3', '4', '5', '6', '7', 'x', 'X':
+		if len(s) < 2 {
+			return "", "", fmt.Errorf(`\%c requires 2 following digits`, r)
+		}
+		base := 8
+		ss := s[:2]
+		s = s[2:]
+		if r == 'x' || r == 'X' {
+			base = 16
+		} else {
+			ss = string(r) + ss
+		}
+		i, err := strconv.ParseUint(ss, base, 8)
+		if err != nil {
+			return "", "", err
+		}
+		return string([]byte{byte(i)}), s, nil
+	case 'u', 'U':
+		n := 4
+		if r == 'U' {
+			n = 8
+		}
+		if len(s) < n {
+			return "", "", fmt.Errorf(`\%c requires %d digits`, r, n)
+		}
 
-	// strconv.Unquote is for Go strings, but text format strings may use
-	// single *or* double quotes.
-	if s[0] == '\'' {
-		s = strings.Map(quoteSwap, s)
-		s, err := unquoteC(s)
-		s = strings.Map(quoteSwap, s)
-		return s, err
+		bs := make([]byte, n/2)
+		for i := 0; i < n; i += 2 {
+			a, ok1 := unhex(s[i])
+			b, ok2 := unhex(s[i+1])
+			if !ok1 || !ok2 {
+				return "", "", errBadHex
+			}
+			bs[i/2] = a<<4 | b
+		}
+		s = s[n:]
+		return string(bs), s, nil
 	}
+	return "", "", fmt.Errorf(`unknown escape \%c`, r)
+}
 
-	// A notable divergence between quoted string literals in Go
-	// and what is acceptable for text format protocol buffers:
-	// the former considers \' invalid, but the latter considers it valid.
-	s = strings.Replace(s, `\'`, "'", -1)
-	return strconv.Unquote(s)
+// Adapted from src/pkg/strconv/quote.go.
+func unhex(b byte) (v byte, ok bool) {
+	switch {
+	case '0' <= b && b <= '9':
+		return b - '0', true
+	case 'a' <= b && b <= 'f':
+		return b - 'a' + 10, true
+	case 'A' <= b && b <= 'F':
+		return b - 'A' + 10, true
+	}
+	return 0, false
 }
 
 // Back off the parser by one token. Can only be done between calls to next().

+ 27 - 0
proto/text_parser_test.go

@@ -120,6 +120,33 @@ var unMarshalTextTests = []UnmarshalTextTest{
 		},
 	},
 
+	// Quoted string with all the accepted special characters from the C++ test
+	{
+		in: `count:42 name: ` + "\"\\\"A string with \\' characters \\n and \\r newlines and \\t tabs and \\001 slashes \\\\ and  multiple   spaces\"",
+		out: &MyMessage{
+			Count: Int32(42),
+			Name:  String("\"A string with ' characters \n and \r newlines and \t tabs and \001 slashes \\ and  multiple   spaces"),
+		},
+	},
+
+	// Quoted string with quoted backslash
+	{
+		in: `count:42 name: "\\'xyz"`,
+		out: &MyMessage{
+			Count: Int32(42),
+			Name:  String(`\'xyz`),
+		},
+	},
+
+	// Quoted string with UTF-8 bytes.
+	{
+		in: "count:42 name: '\303\277\302\201\xAB'",
+		out: &MyMessage{
+			Count: Int32(42),
+			Name:  String("\303\277\302\201\xAB"),
+		},
+	},
+
 	// Bad quoted string
 	{
 		in:  `inner: < host: "\0" >` + "\n",

+ 18 - 1
proto/text_test.go

@@ -218,13 +218,30 @@ func TestStringEscaping(t *testing.T) {
 			&pb.Strings{StringField: proto.String("\350\260\267\346\255\214")},
 			"string_field: \"\\350\\260\\267\\346\\255\\214\"\n",
 		},
+		{
+			// Arbitrary bytes, including some that are not valid UTF-8.
+			&pb.Strings{StringField: proto.String("\x00\x01\xff\x81")},
+			`string_field: "\000\001\377\201"` + "\n",
+		},
 	}
 
 	for i, tc := range testCases {
 		var buf bytes.Buffer
 		proto.MarshalText(&buf, tc.in)
-		if s := buf.String(); s != tc.out {
+		s := buf.String()
+		if s != tc.out {
 			t.Errorf("#%d: Got:\n%s\nExpected:\n%s\n", i, s, tc.out)
+			continue
+		}
+
+		// Check round-trip.
+		pb := new(pb.Strings)
+		if err := proto.UnmarshalText(s, pb); err != nil {
+			t.Errorf("#%d: UnmarshalText: %v", i, err)
+			continue
+		}
+		if !proto.Equal(pb, tc.in) {
+			t.Errorf("#%d: Round-trip failed:\nstart: %v\n  end: %v", i, tc.in, pb)
 		}
 	}
 }