Ver Fonte

codec: json: fix unicode escape decoding

When decoding quoted strings, invalid UTF-8 or invalid UTF-16 surrogate pairs
are no longer treated as an error.
Instead, they are replaced by the Unicode replacement character U+FFFD.

Fixes #213
Ugorji Nwoke há 8 anos
pai
commit
cbd720c66c
3 ficheiros alterados com 57 adições e 24 exclusões
  1. 23 0
      codec/codec_test.go
  2. 32 24
      codec/json.go
  3. 2 0
      codec/z_all_test.go

+ 23 - 0
codec/codec_test.go

@@ -2020,6 +2020,29 @@ func TestJsonLargeInteger(t *testing.T) {
 	}
 }
 
+func TestJsonInvalidUnicode(t *testing.T) {
+	var m = map[string]string{
+		`"\udc49\u0430abc"`: "\uFFFDabc",
+		`"\udc49\u0430"`:    "\uFFFD",
+		`"\udc49abc"`:       "\uFFFDabc",
+		`"\udc49"`:          "\uFFFD",
+		`"\udZ49\u0430abc"`: "\uFFFD\u0430abc",
+		`"\udcG9\u0430"`:    "\uFFFD\u0430",
+		`"\uHc49abc"`:       "\uFFFDabc",
+		`"\uKc49"`:          "\uFFFD",
+		// ``: "",
+	}
+	for k, v := range m {
+		// println("k = ", k)
+		var s string
+		testUnmarshalErr(&s, []byte(k), testJsonH, t, "-")
+		if s != v {
+			logT(t, "not equal: %q, %q", v, s)
+			failT(t)
+		}
+	}
+}
+
 // ----------
 
 func TestBincCodecsTable(t *testing.T) {

+ 32 - 24
codec/json.go

@@ -36,6 +36,7 @@ import (
 	"encoding/base64"
 	"reflect"
 	"strconv"
+	"unicode"
 	"unicode/utf16"
 	"unicode/utf8"
 )
@@ -88,14 +89,6 @@ var (
 )
 
 const (
-	// jsonUnreadAfterDecNum controls whether we unread after decoding a number.
-	//
-	// instead of unreading, just update d.tok (iff it's not a whitespace char)
-	// However, doing this means that we may HOLD onto some data which belongs to another stream.
-	// Thus, it is safest to unread the data when done.
-	// keep behind a constant flag for now.
-	jsonUnreadAfterDecNum = true
-
 	// If !jsonValidateSymbols, decoding will be faster, by skipping some checks:
 	//   - If we see first character of null, false or true,
 	//     do not validate subsequent characters.
@@ -882,34 +875,45 @@ func (d *jsonDecDriver) appendStringAsBytes() {
 		case 'u':
 			var r rune
 			var rr uint32
-			c = cs[i+4] // may help reduce bounds-checking
+			if len(cs) < i+4 { // may help reduce bounds-checking
+				d.d.errorf(`json: need at least 4 more bytes for unicode sequence`)
+			}
+			// c = cs[i+4] // may help reduce bounds-checking
 			for j := 1; j < 5; j++ {
 				c = jsonU4Set[cs[i+j]]
 				if c == jsonU4SetErrVal {
-					d.d.errorf(`json: unquoteStr: invalid hex char in \u unicode sequence: %q`, c)
+					// d.d.errorf(`json: unquoteStr: invalid hex char in \u unicode sequence: %q`, c)
+					r = unicode.ReplacementChar
+					i += 4
+					goto encode_rune
 				}
 				rr = rr*16 + uint32(c)
 			}
 			r = rune(rr)
 			i += 4
 			if utf16.IsSurrogate(r) {
-				if !(cs[i+2] == 'u' && cs[i+i] == '\\') {
-					d.d.errorf(`json: unquoteStr: invalid unicode sequence. Expecting \u`)
-					return
-				}
-				i += 2
-				c = cs[i+4] // may help reduce bounds-checking
-				var rr1 uint32
-				for j := 1; j < 5; j++ {
-					c = jsonU4Set[cs[i+j]]
-					if c == jsonU4SetErrVal {
-						d.d.errorf(`json: unquoteStr: invalid hex char in \u unicode sequence: %q`, c)
+				if len(cs) >= i+6 && cs[i+2] == 'u' && cs[i+1] == '\\' {
+					i += 2
+					// c = cs[i+4] // may help reduce bounds-checking
+					var rr1 uint32
+					for j := 1; j < 5; j++ {
+						c = jsonU4Set[cs[i+j]]
+						if c == jsonU4SetErrVal {
+							// d.d.errorf(`json: unquoteStr: invalid hex char in \u unicode sequence: %q`, c)
+							r = unicode.ReplacementChar
+							i += 4
+							goto encode_rune
+						}
+						rr1 = rr1*16 + uint32(c)
 					}
-					rr1 = rr1*16 + uint32(c)
+					r = utf16.DecodeRune(r, rune(rr1))
+					i += 4
+				} else {
+					r = unicode.ReplacementChar
+					goto encode_rune
 				}
-				r = utf16.DecodeRune(r, rune(rr1))
-				i += 4
 			}
+		encode_rune:
 			w2 := utf8.EncodeRune(d.bstr[:], r)
 			v = append(v, d.bstr[:w2]...)
 		default:
@@ -1032,6 +1036,10 @@ func (d *jsonDecDriver) DecodeNaked() {
 // reading multiple values from a stream containing json and non-json content.
 // For example, a user can read a json value, then a cbor value, then a msgpack value,
 // all from the same stream in sequence.
+//
+// Note that, when decoding quoted strings, invalid UTF-8 or invalid UTF-16 surrogate pairs
+// are not treated as an error.
+// Instead, they are replaced by the Unicode replacement character U+FFFD.
 type JsonHandle struct {
 	textEncodingType
 	BasicHandle

+ 2 - 0
codec/z_all_test.go

@@ -201,6 +201,7 @@ func testCodecGroup(t *testing.T) {
 	t.Run("TestBincMammothMapsAndSlices", TestBincMammothMapsAndSlices)
 	t.Run("TestSimpleMammothMapsAndSlices", TestSimpleMammothMapsAndSlices)
 
+	t.Run("TestJsonInvalidUnicode", TestJsonInvalidUnicode)
 	// <tear-down code>
 }
 
@@ -224,6 +225,7 @@ func testJsonGroup(t *testing.T) {
 	t.Run("TestJsonEmbeddedFieldPrecedence", TestJsonEmbeddedFieldPrecedence)
 	t.Run("TestJsonLargeContainerLen", TestJsonLargeContainerLen)
 	t.Run("TestJsonMammothMapsAndSlices", TestJsonMammothMapsAndSlices)
+	t.Run("TestJsonInvalidUnicode", TestJsonInvalidUnicode)
 }
 
 func testBincGroup(t *testing.T) {