Просмотр исходного кода

#137 fix unicode surrogate incompatibility

Tao Wen 8 лет назад
Родитель
Коммит
156284b028
2 изменённых файла: 71 добавлено и 53 удалено
  1. 61 52
      feature_iter_string.go
  2. 10 1
      jsoniter_string_test.go

+ 61 - 52
feature_iter_string.go

@@ -42,58 +42,7 @@ func (iter *Iterator) readStringSlowPath() (ret string) {
 		}
 		if c == '\\' {
 			c = iter.readByte()
-			switch c {
-			case 'u', 'U':
-				r := iter.readU4()
-				if utf16.IsSurrogate(r) {
-					c = iter.readByte()
-					if iter.Error != nil {
-						return
-					}
-					if c != '\\' {
-						iter.ReportError("ReadString",
-							`expects \u after utf16 surrogate, but \ not found`)
-						return
-					}
-					c = iter.readByte()
-					if iter.Error != nil {
-						return
-					}
-					if c != 'u' && c != 'U' {
-						iter.ReportError("ReadString",
-							`expects \u after utf16 surrogate, but \u not found`)
-						return
-					}
-					r2 := iter.readU4()
-					if iter.Error != nil {
-						return
-					}
-					combined := utf16.DecodeRune(r, r2)
-					str = appendRune(str, combined)
-				} else {
-					str = appendRune(str, r)
-				}
-			case '"':
-				str = append(str, '"')
-			case '\\':
-				str = append(str, '\\')
-			case '/':
-				str = append(str, '/')
-			case 'b':
-				str = append(str, '\b')
-			case 'f':
-				str = append(str, '\f')
-			case 'n':
-				str = append(str, '\n')
-			case 'r':
-				str = append(str, '\r')
-			case 't':
-				str = append(str, '\t')
-			default:
-				iter.ReportError("ReadString",
-					`invalid escape char after \`)
-				return
-			}
+			str = iter.readEscapedChar(c, str)
 		} else {
 			str = append(str, c)
 		}
@@ -102,6 +51,66 @@ func (iter *Iterator) readStringSlowPath() (ret string) {
 	return
 }
 
+func (iter *Iterator) readEscapedChar(c byte, str []byte) []byte {
+	switch c {
+	case 'u':
+		r := iter.readU4()
+		if utf16.IsSurrogate(r) {
+			c = iter.readByte()
+			if iter.Error != nil {
+				return nil
+			}
+			if c != '\\' {
+				iter.unreadByte()
+				str = appendRune(str, r)
+				return str
+			}
+			c = iter.readByte()
+			if iter.Error != nil {
+				return nil
+			}
+			if c != 'u' {
+				str = appendRune(str, r)
+				return iter.readEscapedChar(c, str)
+			}
+			r2 := iter.readU4()
+			if iter.Error != nil {
+				return nil
+			}
+			combined := utf16.DecodeRune(r, r2)
+			if combined == '\uFFFD' {
+				str = appendRune(str, r)
+				str = appendRune(str, r2)
+			} else {
+				str = appendRune(str, combined)
+			}
+		} else {
+			str = appendRune(str, r)
+		}
+	case '"':
+		str = append(str, '"')
+	case '\\':
+		str = append(str, '\\')
+	case '/':
+		str = append(str, '/')
+	case 'b':
+		str = append(str, '\b')
+	case 'f':
+		str = append(str, '\f')
+	case 'n':
+		str = append(str, '\n')
+	case 'r':
+		str = append(str, '\r')
+	case 't':
+		str = append(str, '\t')
+	default:
+		iter.ReportError("ReadString",
+			`invalid escape char after \`)
+		return nil
+	}
+	return str
+}
+
 // ReadStringAsSlice read string from iterator without copying into string form.
 // The []byte can not be kept, as it will change after next iterator call.
 func (iter *Iterator) ReadStringAsSlice() (ret []byte) {

+ 10 - 1
jsoniter_string_test.go

@@ -19,6 +19,8 @@ func Test_read_string(t *testing.T) {
 		`"\"`,
 		`"\\\"`,
 		"\"\n\"",
+		`"\U0001f64f"`,
+		`"\uD83D\u00"`,
 	}
 	for i := 0; i < 32; i++ {
 		// control characters are invalid
@@ -39,6 +41,11 @@ func Test_read_string(t *testing.T) {
 		{`"a"`, "a"},
 		{`null`, ""},
 		{`"Iñtërnâtiônàlizætiøn,💝🐹🌇⛔"`, "Iñtërnâtiônàlizætiøn,💝🐹🌇⛔"},
+		{`"\uD83D"`, string([]byte{239, 191, 189})},
+		{`"\uD83D\\"`, string([]byte{239, 191, 189, '\\'})},
+		{`"\uD83D\ub000"`, string([]byte{239, 191, 189, 235, 128, 128})},
+		{`"\uD83D\ude04"`, "😄"},
+		{`"\uDEADBEEF"`, string([]byte{239, 191, 189, 66, 69, 69, 70})},
 	}
 
 	for _, tc := range goodInputs {
@@ -111,7 +118,9 @@ func Test_read_exotic_string(t *testing.T) {
 		t.Run(fmt.Sprintf("%v:%v", input, output), func(t *testing.T) {
 			should := require.New(t)
 			iter := ParseString(ConfigDefault, input)
-			should.Equal(output, iter.ReadString())
+			var v string
+			should.Nil(json.Unmarshal([]byte(input), &v))
+			should.Equal(v, iter.ReadString())
 		})
 		t.Run(fmt.Sprintf("%v:%v", input, output), func(t *testing.T) {
 			should := require.New(t)