Quellcode durchsuchen

codec: introduce EncodeOptions:StringToRaw flag to control string encoding

Previously, we based the implementation on the idea that string types are always UTF-8.
That is incorrect.

In Go, a string is just an immutable sequence of bytes (i.e. an immutable view of a []byte).

Now, we honor that, and provide an EncodeOptions flag called StringToRaw that configures
how we encode strings - either as a "raw" sequence of bytes, or a UTF-8 string.

This is supported in the reflection-based path, the fast-path, and codecgen.

While at it, remove the loop-unrolling done in fast-path.
This makes the code easier to read and we don't have to modify stuff in multiple places.
We will let the compiler and CPU branch predictor help offset the performance cost of
no longer unrolling the loops, which is easier as the condition is constant throughout the loop.

Fixes #286
Ugorji Nwoke vor 6 Jahren
Ursprung
Commit
aa2c01e021

+ 47 - 34
codec/encode.go

@@ -48,13 +48,12 @@ type encDriver interface {
 	// encodeExtPreamble(xtag byte, length int)
 	EncodeRawExt(re *RawExt, e *Encoder)
 	EncodeExt(v interface{}, xtag uint64, ext Ext, e *Encoder)
-	// Deprecated: try to use EncodeStringEnc instead
+	// Deprecated: use EncodeStringEnc instead
 	EncodeString(c charEncoding, v string)
-	// c cannot be cRAW
-	EncodeStringEnc(c charEncoding, v string)
-	// EncodeSymbol(v string)
-	// Deprecated: try to use EncodeStringBytesRaw instead
+	// Deprecated: use EncodeStringBytesRaw instead
 	EncodeStringBytes(c charEncoding, v []byte)
+	EncodeStringEnc(c charEncoding, v string) // c cannot be cRAW
+	// EncodeSymbol(v string)
 	EncodeStringBytesRaw(v []byte)
 	EncodeTime(time.Time)
 	//encBignum(f *big.Int)
@@ -169,6 +168,11 @@ type EncodeOptions struct {
 	// If unset, we error out.
 	Raw bool
 
+	// StringToRaw controls how strings are encoded -
+	// by default, they are encoded as UTF-8,
+	// but can be treated as []byte during an encode.
+	StringToRaw bool
+
 	// // AsSymbols defines what should be encoded as symbols.
 	// //
 	// // Encoding as symbols can reduce the encoded size significantly.
@@ -644,14 +648,14 @@ func (e *Encoder) kStructNoOmitempty(f *codecFnInfo, rv reflect.Value) {
 			for _, si := range tisfi {
 				ee.WriteMapElemKey()
 				// ee.EncodeStringEnc(cUTF8, si.encName)
-				e.kStructFieldKey(fti.keyType, si)
+				e.kStructFieldKey(fti.keyType, si.encNameAsciiAlphaNum, si.encName)
 				ee.WriteMapElemValue()
 				e.encodeValue(sfn.field(si), nil, true)
 			}
 		} else {
 			for _, si := range tisfi {
 				// ee.EncodeStringEnc(cUTF8, si.encName)
-				e.kStructFieldKey(fti.keyType, si)
+				e.kStructFieldKey(fti.keyType, si.encNameAsciiAlphaNum, si.encName)
 				e.encodeValue(sfn.field(si), nil, true)
 			}
 		}
@@ -672,33 +676,18 @@ func (e *Encoder) kStructNoOmitempty(f *codecFnInfo, rv reflect.Value) {
 	}
 }
 
-func (e *Encoder) kStructFieldKey(keyType valueType, s *structFieldInfo) {
+func (e *Encoder) kStructFieldKey(keyType valueType, encNameAsciiAlphaNum bool, encName string) {
 	var m must
 	// use if-else-if, not switch (which compiles to binary-search)
 	// since keyType is typically valueTypeString, branch prediction is pretty good.
 	if keyType == valueTypeString {
-		if e.js && s.encNameAsciiAlphaNum { // keyType == valueTypeString
+		if e.js && encNameAsciiAlphaNum { // keyType == valueTypeString
 			e.w.writen1('"')
-			e.w.writestr(s.encName)
+			e.w.writestr(encName)
 			e.w.writen1('"')
 		} else { // keyType == valueTypeString
-			e.e.EncodeStringEnc(cUTF8, s.encName)
+			e.e.EncodeStringEnc(cUTF8, encName)
 		}
-	} else if keyType == valueTypeInt {
-		e.e.EncodeInt(m.Int(strconv.ParseInt(s.encName, 10, 64)))
-	} else if keyType == valueTypeUint {
-		e.e.EncodeUint(m.Uint(strconv.ParseUint(s.encName, 10, 64)))
-	} else if keyType == valueTypeFloat {
-		e.e.EncodeFloat64(m.Float(strconv.ParseFloat(s.encName, 64)))
-	}
-}
-
-func (e *Encoder) kStructFieldKeyName(keyType valueType, encName string) {
-	var m must
-	// use if-else-if, not switch (which compiles to binary-search)
-	// since keyType is typically valueTypeString, branch prediction is pretty good.
-	if keyType == valueTypeString {
-		e.e.EncodeStringEnc(cUTF8, encName)
 	} else if keyType == valueTypeInt {
 		e.e.EncodeInt(m.Int(strconv.ParseInt(encName, 10, 64)))
 	} else if keyType == valueTypeUint {
@@ -803,7 +792,7 @@ func (e *Encoder) kStruct(f *codecFnInfo, rv reflect.Value) {
 				kv = fkvs[j]
 				ee.WriteMapElemKey()
 				// ee.EncodeStringEnc(cUTF8, kv.v)
-				e.kStructFieldKey(fti.keyType, kv.v)
+				e.kStructFieldKey(fti.keyType, kv.v.encNameAsciiAlphaNum, kv.v.encName)
 				ee.WriteMapElemValue()
 				e.encodeValue(kv.r, nil, true)
 			}
@@ -811,14 +800,14 @@ func (e *Encoder) kStruct(f *codecFnInfo, rv reflect.Value) {
 			for j = 0; j < len(fkvs); j++ {
 				kv = fkvs[j]
 				// ee.EncodeStringEnc(cUTF8, kv.v)
-				e.kStructFieldKey(fti.keyType, kv.v)
+				e.kStructFieldKey(fti.keyType, kv.v.encNameAsciiAlphaNum, kv.v.encName)
 				e.encodeValue(kv.r, nil, true)
 			}
 		}
 		// now, add the others
 		for k, v := range mf {
 			ee.WriteMapElemKey()
-			e.kStructFieldKeyName(fti.keyType, k)
+			e.kStructFieldKey(fti.keyType, false, k)
 			ee.WriteMapElemValue()
 			e.encode(v)
 		}
@@ -903,7 +892,11 @@ func (e *Encoder) kMap(f *codecFnInfo, rv reflect.Value) {
 			ee.WriteMapElemKey()
 		}
 		if keyTypeIsString {
-			ee.EncodeStringEnc(cUTF8, mks[j].String())
+			if e.h.StringToRaw {
+				ee.EncodeStringBytesRaw(bytesView(mks[j].String()))
+			} else {
+				ee.EncodeStringEnc(cUTF8, mks[j].String())
+			}
 		} else {
 			e.encodeValue(mks[j], keyFn, true)
 		}
@@ -953,7 +946,11 @@ func (e *Encoder) kMapCanonical(rtkey reflect.Type, rv reflect.Value, mks []refl
 			if elemsep {
 				ee.WriteMapElemKey()
 			}
-			ee.EncodeStringEnc(cUTF8, mksv[i].v)
+			if e.h.StringToRaw {
+				ee.EncodeStringBytesRaw(bytesView(mksv[i].v))
+			} else {
+				ee.EncodeStringEnc(cUTF8, mksv[i].v)
+			}
 			if elemsep {
 				ee.WriteMapElemValue()
 			}
@@ -1562,7 +1559,11 @@ func (e *Encoder) encode(iv interface{}) {
 		e.encodeValue(v, nil, true)
 
 	case string:
-		e.e.EncodeStringEnc(cUTF8, v)
+		if e.h.StringToRaw {
+			e.e.EncodeStringBytesRaw(bytesView(v))
+		} else {
+			e.e.EncodeStringEnc(cUTF8, v)
+		}
 	case bool:
 		e.e.EncodeBool(v)
 	case int:
@@ -1600,7 +1601,11 @@ func (e *Encoder) encode(iv interface{}) {
 		e.rawBytes(*v)
 
 	case *string:
-		e.e.EncodeStringEnc(cUTF8, *v)
+		if e.h.StringToRaw {
+			e.e.EncodeStringBytesRaw(bytesView(*v))
+		} else {
+			e.e.EncodeStringEnc(cUTF8, *v)
+		}
 	case *bool:
 		e.e.EncodeBool(*v)
 	case *int:
@@ -1719,7 +1724,7 @@ TOP:
 // 	} else if asis {
 // 		e.asis(bs)
 // 	} else {
-// 		e.e.EncodeStringBytes(c, bs)
+// 		e.e.EncodeStringBytesRaw(bs)
 // 	}
 // }
 
@@ -1775,3 +1780,11 @@ func (e *Encoder) rawBytes(vv Raw) {
 func (e *Encoder) wrapErr(v interface{}, err *error) {
 	*err = encodeError{codecError{name: e.hh.Name(), err: v}}
 }
+
+// func encStringAsRawBytesMaybe(ee encDriver, s string, stringToRaw bool) {
+// 	if stringToRaw {
+// 		ee.EncodeStringBytesRaw(bytesView(s))
+// 	} else {
+// 		ee.EncodeStringEnc(cUTF8, s)
+// 	}
+// }

Datei-Diff unterdrückt, da er zu groß ist
+ 198 - 362
codec/fast-path.generated.go


+ 7 - 70
codec/fast-path.go.tmpl

@@ -191,20 +191,10 @@ func (_ fastpathT) {{ .MethodNamePfx "Enc" false }}V(v []{{ .Elem }}, e *Encoder
 	if v == nil { e.e.EncodeNil(); return }
 	ee, esep := e.e, e.hh.hasElemSeparators()
 	ee.WriteArrayStart(len(v))
-	if esep {
-		for _, v2 := range v {
-			ee.WriteArrayElem()
-			{{ encmd .Elem "v2"}}
-		}
-	} else {
-		for _, v2 := range v {
-			{{ encmd .Elem "v2"}}
-		}
-	} {{/*
 	for _, v2 := range v {
 		if esep { ee.WriteArrayElem() }
 		{{ encmd .Elem "v2"}}
-	} */}}
+	} 
 	ee.WriteArrayEnd()
 }
 func (_ fastpathT) {{ .MethodNamePfx "EncAsMap" false }}V(v []{{ .Elem }}, e *Encoder) {
@@ -214,20 +204,6 @@ func (_ fastpathT) {{ .MethodNamePfx "EncAsMap" false }}V(v []{{ .Elem }}, e *En
 		return
 	}
 	ee.WriteMapStart(len(v) / 2)
-	if esep {
-		for j, v2 := range v {
-			if j%2 == 0 {
-				ee.WriteMapElemKey()
-			} else {
-				ee.WriteMapElemValue()
-			}
-			{{ encmd .Elem "v2"}}
-		}
-	} else {
-		for _, v2 := range v {
-			{{ encmd .Elem "v2"}}
-		}
-	} {{/*
 	for j, v2 := range v {
 		if esep {
 			if j%2 == 0 {
@@ -237,7 +213,7 @@ func (_ fastpathT) {{ .MethodNamePfx "EncAsMap" false }}V(v []{{ .Elem }}, e *En
 			}
 		}
 		{{ encmd .Elem "v2"}}
-	} */}}
+	}
 	ee.WriteMapEnd()
 }
 {{end}}{{end}}{{end}}
@@ -266,70 +242,31 @@ func (_ fastpathT) {{ .MethodNamePfx "Enc" false }}V(v map[{{ .MapKey }}]{{ .Ele
 			i++
 		}
 		sort.Sort(bytesISlice(v2))
-		if esep {
-			for j := range v2 {
-				ee.WriteMapElemKey()
-				e.asis(v2[j].v)
-				ee.WriteMapElemValue()
-				e.encode(v[v2[j].i])
-			}
-		} else {
-			for j := range v2 {
-				e.asis(v2[j].v)
-				e.encode(v[v2[j].i])
-			}
-		} {{/*
 		for j := range v2 {
 			if esep { ee.WriteMapElemKey() }
 			e.asis(v2[j].v)
 			if esep { ee.WriteMapElemValue() }
 			e.encode(v[v2[j].i])
-		} */}} {{else}}{{ $x := sorttype .MapKey true}}v2 := make([]{{ $x }}, len(v))
+		} {{else}}{{ $x := sorttype .MapKey true}}v2 := make([]{{ $x }}, len(v))
 		var i uint
 		for k := range v {
 			v2[i] = {{ $x }}(k)
 			i++
 		}
 		sort.Sort({{ sorttype .MapKey false}}(v2))
-		if esep {
-			for _, k2 := range v2 {
-				ee.WriteMapElemKey()
-				{{if eq .MapKey "string"}}ee.EncodeStringEnc(cUTF8, k2){{else}}{{ $y := printf "%s(k2)" .MapKey }}{{ encmd .MapKey $y }}{{end}}
-				ee.WriteMapElemValue()
-				{{ $y := printf "v[%s(k2)]" .MapKey }}{{ encmd .Elem $y }}
-			} 
-		} else {
-			for _, k2 := range v2 {
-				{{if eq .MapKey "string"}}ee.EncodeStringEnc(cUTF8, k2){{else}}{{ $y := printf "%s(k2)" .MapKey }}{{ encmd .MapKey $y }}{{end}}
-				{{ $y := printf "v[%s(k2)]" .MapKey }}{{ encmd .Elem $y }}
-			} 
-		} {{/*
 		for _, k2 := range v2 {
 			if esep { ee.WriteMapElemKey() }
-			{{if eq .MapKey "string"}}ee.EncodeStringEnc(cUTF8, k2){{else}}{{ $y := printf "%s(k2)" .MapKey }}{{ encmd .MapKey $y }}{{end}}
+			{{if eq .MapKey "string"}} if e.h.StringToRaw {ee.EncodeStringBytesRaw(bytesView(k2))} else {ee.EncodeStringEnc(cUTF8, k2)} {{else}}{{ $y := printf "%s(k2)" .MapKey }}{{ encmd .MapKey $y }}{{end}}
 			if esep { ee.WriteMapElemValue() }
 			{{ $y := printf "v[%s(k2)]" .MapKey }}{{ encmd .Elem $y }}
-		} */}} {{end}}
+		} {{end}}
 	} else {
-		if esep {
-			for k2, v2 := range v {
-				ee.WriteMapElemKey()
-				{{if eq .MapKey "string"}}ee.EncodeStringEnc(cUTF8, k2){{else}}{{ encmd .MapKey "k2"}}{{end}}
-				ee.WriteMapElemValue()
-				{{ encmd .Elem "v2"}}
-			}
-		} else {
-			for k2, v2 := range v {
-				{{if eq .MapKey "string"}}ee.EncodeStringEnc(cUTF8, k2){{else}}{{ encmd .MapKey "k2"}}{{end}}
-				{{ encmd .Elem "v2"}}
-			}
-		} {{/*
 		for k2, v2 := range v {
 			if esep { ee.WriteMapElemKey() }
-			{{if eq .MapKey "string"}}ee.EncodeStringEnc(cUTF8, k2){{else}}{{ encmd .MapKey "k2"}}{{end}}
+			{{if eq .MapKey "string"}} if e.h.StringToRaw {ee.EncodeStringBytesRaw(bytesView(k2))} else {ee.EncodeStringEnc(cUTF8, k2)} {{else}}{{ encmd .MapKey "k2"}}{{end}}
 			if esep { ee.WriteMapElemValue() }
 			{{ encmd .Elem "v2"}}
-		} */}}
+		}
 	}
 	ee.WriteMapEnd()
 }

+ 3 - 0
codec/gen-helper.generated.go

@@ -185,6 +185,9 @@ func (f genHelperEncoder) WriteStr(s string) {
 	f.e.w.writestr(s)
 }
 
+// FOR USE BY CODECGEN ONLY. IT *WILL* CHANGE WITHOUT NOTICE. *DO NOT USE*
+func (f genHelperEncoder) BytesView(v string) []byte { return bytesView(v) }
+
 // FOR USE BY CODECGEN ONLY. IT *WILL* CHANGE WITHOUT NOTICE. *DO NOT USE*
 //
 // Deprecated: No longer used,

+ 2 - 0
codec/gen-helper.go.tmpl

@@ -173,6 +173,8 @@ func (f genHelperEncoder) WriteStr(s string) {
 	f.e.w.writestr(s)
 }
 // FOR USE BY CODECGEN ONLY. IT *WILL* CHANGE WITHOUT NOTICE. *DO NOT USE*
+func (f genHelperEncoder) BytesView(v string) []byte { return bytesView(v) }
+// FOR USE BY CODECGEN ONLY. IT *WILL* CHANGE WITHOUT NOTICE. *DO NOT USE*
 //
 // Deprecated: No longer used,
 // but leave in-place so that old generated files continue to work without regeneration.

+ 4 - 4
codec/gen.go

@@ -798,7 +798,7 @@ func (x *genRunner) enc(varname string, t reflect.Type) {
 	case reflect.Bool:
 		x.line("r.EncodeBool(bool(" + varname + "))")
 	case reflect.String:
-		x.line("r.EncodeStringEnc(codecSelferCcUTF8" + x.xs + ", string(" + varname + "))")
+		x.linef("if z.EncBasicHandle().StringToRaw { r.EncodeStringBytesRaw(z.BytesView(string(%s))) } else { r.EncodeStringEnc(codecSelferCcUTF8%s, string(%s)) }", varname, x.xs, varname)
 	case reflect.Chan:
 		x.xtraSM(varname, t, true, false)
 		// x.encListFallback(varname, rtid, t)
@@ -862,7 +862,7 @@ func (x *genRunner) encZero(t reflect.Type) {
 	case reflect.Bool:
 		x.line("r.EncodeBool(false)")
 	case reflect.String:
-		x.line("r.EncodeStringEnc(codecSelferCcUTF8" + x.xs + `, "")`)
+		x.linef(`if z.EncBasicHandle().StringToRaw { r.EncodeStringBytesRaw([]byte{}) } else { r.EncodeStringEnc(codecSelferCcUTF8%s, "") }`, x.xs)
 	default:
 		x.line("r.EncodeNil()")
 	}
@@ -1051,7 +1051,6 @@ func (x *genRunner) encStruct(varname string, rtid uintptr, t reflect.Type) {
 		}
 		x.line("r.WriteMapElemKey()")
 
-		// x.line("r.EncodeStringEnc(codecSelferCcUTF8" + x.xs + ", `" + si.encName + "`)")
 		// emulate EncStructFieldKey
 		switch ti.keyType {
 		case valueTypeInt:
@@ -1937,7 +1936,8 @@ func genInternalEncCommandAsString(s string, vname string) string {
 	case "int", "int8", "int16", "int32", "int64":
 		return "ee.EncodeInt(int64(" + vname + "))"
 	case "string":
-		return "ee.EncodeStringEnc(cUTF8, " + vname + ")"
+		return "if e.h.StringToRaw { ee.EncodeStringBytesRaw(bytesView(" + vname + ")) " +
+			"} else { ee.EncodeStringEnc(cUTF8, " + vname + ") }"
 	case "float32":
 		return "ee.EncodeFloat32(" + vname + ")"
 	case "float64":

+ 20 - 4
codec/mammoth2_codecgen_generated_test.go

@@ -135,7 +135,11 @@ func (x *TestMammoth2) CodecEncodeSelf(e *Encoder) {
 				r.WriteArrayElem()
 				if false {
 				} else {
-					r.EncodeStringEnc(codecSelferCcUTF819781, string(x.FString))
+					if z.EncBasicHandle().StringToRaw {
+						r.EncodeStringBytesRaw(z.BytesView(string(x.FString)))
+					} else {
+						r.EncodeStringEnc(codecSelferCcUTF819781, string(x.FString))
+					}
 				}
 			} else {
 				r.WriteMapElemKey()
@@ -147,7 +151,11 @@ func (x *TestMammoth2) CodecEncodeSelf(e *Encoder) {
 				r.WriteMapElemValue()
 				if false {
 				} else {
-					r.EncodeStringEnc(codecSelferCcUTF819781, string(x.FString))
+					if z.EncBasicHandle().StringToRaw {
+						r.EncodeStringBytesRaw(z.BytesView(string(x.FString)))
+					} else {
+						r.EncodeStringEnc(codecSelferCcUTF819781, string(x.FString))
+					}
 				}
 			}
 			var yyn14 bool
@@ -168,7 +176,11 @@ func (x *TestMammoth2) CodecEncodeSelf(e *Encoder) {
 						yy15 := *x.FptrString
 						if false {
 						} else {
-							r.EncodeStringEnc(codecSelferCcUTF819781, string(yy15))
+							if z.EncBasicHandle().StringToRaw {
+								r.EncodeStringBytesRaw(z.BytesView(string(yy15)))
+							} else {
+								r.EncodeStringEnc(codecSelferCcUTF819781, string(yy15))
+							}
 						}
 					}
 				}
@@ -189,7 +201,11 @@ func (x *TestMammoth2) CodecEncodeSelf(e *Encoder) {
 						yy17 := *x.FptrString
 						if false {
 						} else {
-							r.EncodeStringEnc(codecSelferCcUTF819781, string(yy17))
+							if z.EncBasicHandle().StringToRaw {
+								r.EncodeStringBytesRaw(z.BytesView(string(yy17)))
+							} else {
+								r.EncodeStringEnc(codecSelferCcUTF819781, string(yy17))
+							}
 						}
 					}
 				}

+ 1 - 1
codec/simple.go

@@ -158,7 +158,7 @@ func (e *simpleEncDriver) WriteMapStart(length int) {
 }
 
 // func (e *simpleEncDriver) EncodeSymbol(v string) {
-// 	e.EncodeString(cUTF8, v)
+// 	e.EncodeStringEnc(cUTF8, v)
 // }
 
 func (e *simpleEncDriver) EncodeStringEnc(c charEncoding, v string) {

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.