Procházet zdrojové kódy

codec: json: try fast-path for decoding floats before defaulting to strconv.ParseFloat

The JSON format for floats is more constrained than what strconv supports.
We noticed some slowdown in float parsing, which we have attempted to resolve
by walking through the text and doing an exact parse when the number falls within
the narrow constraints where that is simple; otherwise we fall back to strconv.ParseFloat.

Also, have float32 decoding return float32 directly, instead of converting back and forth through float64 unnecessarily.
Ugorji Nwoke před 6 roky
rodič
revize
0a3d04cd38

+ 2 - 2
codec/decode.go

@@ -2900,11 +2900,11 @@ func (d *Decoder) NumBytesRead() int {
 // else if will call DecodeFloat64 and ensure the value doesn't overflow.
 //
 // Note that we return float32 directly to avoid unnecessary conversions through float64
-func (d *Decoder) decodeFloat32() float64 {
+func (d *Decoder) decodeFloat32() float32 {
 	if d.js {
 		return d.jdec.DecodeFloat32() // custom implementation for 32-bit
 	}
-	return chkOvf.Float32V(d.d.DecodeFloat64())
+	return float32(chkOvf.Float32V(d.d.DecodeFloat64()))
 }
 
 // ---- container tracking

+ 285 - 0
codec/float.go

@@ -0,0 +1,285 @@
+// Copyright (c) 2012-2018 Ugorji Nwoke. All rights reserved.
+// Use of this source code is governed by a MIT license found in the LICENSE file.
+
+package codec
+
+import "strconv"
+
+// func parseFloat(b []byte, bitsize int) (f float64, err error) {
+// 	if bitsize == 32 {
+// 		return parseFloat32(b)
+// 	} else {
+// 		return parseFloat64(b)
+// 	}
+// }
+
+func parseFloat32(b []byte) (f float32, err error) {
+	return parseFloat32_custom(b)
+	// return parseFloat32_strconv(b)
+}
+
+func parseFloat64(b []byte) (f float64, err error) {
+	return parseFloat64_custom(b)
+	// return parseFloat64_strconv(b)
+}
+
+func parseFloat32_strconv(b []byte) (f float32, err error) {
+	f64, err := strconv.ParseFloat(stringView(b), 32)
+	f = float32(f64)
+	return
+}
+
+func parseFloat64_strconv(b []byte) (f float64, err error) {
+	return strconv.ParseFloat(stringView(b), 64)
+}
+
+// ------ parseFloat custom below --------
+
+// We assume that a lot of floating point numbers in json files will be
+// those that are handwritten, and with defined precision (in terms of number
+// of digits after decimal point), etc.
+//
+// We further assume that these numbers can be represented exactly as floats.
+//
+// strconv.ParseFloat has some unnecessary overhead which we can do without
+// for the common case:
+//
+//    - expensive char-by-char check to see if underscores are in right place
+//    - testing for and skipping underscores
+//    - check if the string matches ignorecase +/- inf, +/- infinity, nan
+//    - support for base 16 (0xFFFF...)
+//
+// The functions below will try a fast-path for floats which can be decoded
+// without any loss of precision, meaning that:
+//
+//    - the significand fits within the significand bits of the 32-bit or 64-bit float
+//    - the exponent fits within the range covered by the exact powers of 10
+//    - there is no truncation (any extra digits are all trailing zeros)
+//
+// To figure out what the values are for maxMantDigits, use this idea below:
+//
+// 2^23 =                 838 8608 (between 10^ 6 and 10^ 7) (significand bits of float32)
+// 2^32 =             42 9496 7296 (between 10^ 9 and 10^10) (full uint32)
+// 2^52 =      4503 5996 2737 0496 (between 10^15 and 10^16) (significand bits of float64)
+// 2^64 = 1844 6744 0737 0955 1616 (between 10^19 and 10^20) (full uint64)
+//
+// Note: we only allow for up to what can comfortably fit into the significand,
+// ignoring the exponent, and we only attempt the fast path iff the significand fits
+// (otherwise we fall back to strconv.ParseFloat).
+
+// Exact powers of 10.
+var float64pow10 = [...]float64{
+	1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
+	1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+	1e20, 1e21, 1e22,
+}
+var float32pow10 = [...]float32{1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10}
+
+type floatinfo struct {
+	mantbits uint8
+	expbits  uint8
+	bias     int16
+
+	exactPow10 int8 // Exact powers of ten are <= 10^N (32: 10, 64: 22)
+	exactInts  int8 // Exact integers are <= 10^N
+
+	maxMantDigits int8 // 10^19 fits in uint64, while 10^9 fits in uint32
+}
+
+var fi32 = floatinfo{23, 8, -127, 10, 7, 9}     // maxMantDigits = 9
+var fi64 = floatinfo{52, 11, -1023, 22, 15, 19} // maxMantDigits = 19
+
+const fMax64 = 1e15
+const fMax32 = 1e7
+
+func parseFloatErr(b []byte) error {
+	return &strconv.NumError{
+		Func: "ParseFloat",
+		Err:  strconv.ErrSyntax,
+		Num:  string(b),
+	}
+}
+
+func parseFloat32_custom(b []byte) (f float32, err error) {
+	mantissa, exp, i16, neg, _, bad, ok := readFloat(b, fi32)
+	if bad {
+		return 0, parseFloatErr(b)
+	}
+	// defer parseFloatDebug(b, 32, &trunc, exp, trunc, ok)
+	if ok {
+		f = float32(mantissa)
+		if neg {
+			f = -f
+		}
+		if exp == 0 {
+		} else if exp < 0 { // int / 10^k
+			f /= float32pow10[-exp]
+		} else { // exp > 0
+			i16 = int16(fi32.exactPow10)
+			if exp > i16 {
+				f *= float32pow10[exp-i16]
+				exp = i16
+			}
+			if f < -fMax32 || f > fMax32 { // exponent may be too large - outside range
+				goto FALLBACK
+			}
+			f *= float32pow10[exp]
+		}
+		return
+	}
+FALLBACK:
+	return parseFloat32_strconv(b)
+}
+
+func parseFloat64_custom(b []byte) (f float64, err error) {
+	mantissa, exp, i16, neg, _, bad, ok := readFloat(b, fi64)
+	if bad {
+		return 0, parseFloatErr(b)
+	}
+	// defer parseFloatDebug(b, 64, &trunc, exp, trunc, ok)
+	if ok {
+		f = float64(mantissa)
+		if neg {
+			f = -f
+		}
+		if exp == 0 {
+		} else if exp < 0 { // int / 10^k
+			f /= float64pow10[-exp]
+		} else { // exp > 0
+			i16 = int16(fi64.exactPow10)
+			if exp > i16 {
+				f *= float64pow10[exp-i16]
+				exp = i16
+			}
+			if f < -fMax64 || f > fMax64 { // exponent may be too large - outside range
+				goto FALLBACK
+			}
+			f *= float64pow10[exp]
+		}
+		return
+	}
+FALLBACK:
+	return parseFloat64_strconv(b)
+}
+
+func readFloat(s []byte, y floatinfo) (mantissa uint64, exp, i16 int16, neg, trunc, bad, ok bool) {
+	var i int
+	if len(s) == 0 {
+		bad = true
+		return
+	}
+	switch s[0] {
+	case '+':
+		i++
+	case '-':
+		neg = true
+		i++
+	}
+
+	// we considered punting early if string has length > maxMantDigits, but this doesn't account
+	// for trailing 0's e.g. 700000000000000000000 can be encoded exactly as it is 7e20
+
+	const base = 10
+
+	// var sawdot, sawdigits, sawexp bool
+	var sawdot bool
+	var nd, ndMant, dp int8
+L:
+	for ; i < len(s); i++ {
+		switch s[i] {
+		case '.':
+			if sawdot {
+				bad = true
+				return
+			}
+			sawdot = true
+			dp = nd
+		case '0':
+			if nd == 0 { // ignore leading zeros
+				dp--
+				continue
+			}
+			nd++
+			if ndMant < y.maxMantDigits {
+				mantissa *= base
+				ndMant++
+			}
+		case '1', '2', '3', '4', '5', '6', '7', '8', '9':
+			// sawdigits = true
+			nd++
+			if ndMant < y.maxMantDigits {
+				mantissa *= base
+				mantissa += uint64(s[i] - '0')
+				ndMant++
+			} else {
+				trunc = true
+				return // break L
+			}
+		case 'e', 'E':
+			// sawexp = true
+			i++
+			if i == len(s) {
+				break L
+			}
+			var eneg bool
+			if s[i] == '+' {
+				i++
+			} else if s[i] == '-' {
+				i++
+				eneg = true
+			}
+			if i == len(s) {
+				break L
+			}
+			// for exact match, exponent is in single or double digits (-22 to 37 for float64).
+			// exit quick if exponent is more than 2 digits.
+			if len(s)-i > 2 {
+				return
+			}
+			var e int8
+			for ; i < len(s); i++ {
+				if s[i] < '0' || s[i] > '9' {
+					bad = true
+					return
+				}
+				e = e*base + int8(s[i]-'0')
+			}
+			if eneg {
+				dp -= e
+			} else {
+				dp += e
+			}
+			break L
+		default:
+			bad = true
+			return
+		}
+	}
+	// if !sawdigits {
+	// 	bad = true
+	// 	return
+	// }
+	if !sawdot {
+		dp = nd
+	}
+
+	if mantissa != 0 {
+		nd = dp - ndMant
+		if nd < -y.exactPow10 || nd > y.exactInts+y.exactPow10 { // cannot handle it
+			return
+		}
+		exp = int16(nd)
+	}
+	ok = true && !trunc && mantissa>>y.mantbits == 0
+	return
+}
+
+// func parseFloatDebug(b []byte, bitsize int, strconv *bool, exp int16, trunc, ok bool) {
+// 	if false && bitsize == 64 {
+// 		return
+// 	}
+// 	if *strconv {
+// 		xdebugf("parseFloat%d: delegating: %s, exp: %d, trunc: %v, ok: %v", bitsize, b, exp, trunc, ok)
+// 	} else {
+// 		xdebug2f("parseFloat%d: attempting: %s, exp: %d, trunc: %v, ok: %v", bitsize, b, exp, trunc, ok)
+// 	}
+// }

+ 1 - 1
codec/gen-helper.generated.go

@@ -270,4 +270,4 @@ func (f genHelperDecoder) DecReadMapElemKey() { f.d.mapElemKey() }
 func (f genHelperDecoder) DecReadMapElemValue() { f.d.mapElemValue() }
 
 // FOR USE BY CODECGEN ONLY. IT *WILL* CHANGE WITHOUT NOTICE. *DO NOT USE*
-func (f genHelperDecoder) DecDecodeFloat32() float64 { return f.d.decodeFloat32() }
+func (f genHelperDecoder) DecDecodeFloat32() float32 { return f.d.decodeFloat32() }

+ 1 - 1
codec/gen-helper.go.tmpl

@@ -250,5 +250,5 @@ func (f genHelperDecoder) DecReadMapElemKey() { f.d.mapElemKey() }
 func (f genHelperDecoder) DecReadMapElemValue() { f.d.mapElemValue() }
 
 // FOR USE BY CODECGEN ONLY. IT *WILL* CHANGE WITHOUT NOTICE. *DO NOT USE*
-func (f genHelperDecoder) DecDecodeFloat32() float64 { return f.d.decodeFloat32() }
+func (f genHelperDecoder) DecDecodeFloat32() float32 { return f.d.decodeFloat32() }
 

+ 1 - 1
codec/helper_not_unsafe.go

@@ -182,7 +182,7 @@ func (d *Decoder) kTime(f *codecFnInfo, rv reflect.Value) {
 }
 
 func (d *Decoder) kFloat32(f *codecFnInfo, rv reflect.Value) {
-	rv.SetFloat(d.decodeFloat32())
+	rv.SetFloat(float64(d.decodeFloat32()))
 }
 
 func (d *Decoder) kFloat64(f *codecFnInfo, rv reflect.Value) {

+ 1 - 1
codec/helper_unsafe.go

@@ -327,7 +327,7 @@ func (d *Decoder) kTime(f *codecFnInfo, rv reflect.Value) {
 
 func (d *Decoder) kFloat32(f *codecFnInfo, rv reflect.Value) {
 	urv := (*unsafeReflectValue)(unsafe.Pointer(&rv))
-	*(*float32)(urv.ptr) = float32(d.decodeFloat32())
+	*(*float32)(urv.ptr) = d.decodeFloat32()
 }
 
 func (d *Decoder) kFloat64(f *codecFnInfo, rv reflect.Value) {

+ 33 - 20
codec/json.go

@@ -851,7 +851,7 @@ func (d *jsonDecDriver) DecodeUint64() (u uint64) {
 		d.d.errorf("minus found parsing unsigned integer: %s", bs)
 	} else if badsyntax {
 		// fallback: try to decode as float, and cast
-		n = d.decUint64ViaFloat(stringView(bs))
+		n = d.decUint64ViaFloat(bs)
 	}
 	return n
 }
@@ -869,9 +869,9 @@ func (d *jsonDecDriver) DecodeInt64() (i int64) {
 		// d.d.errorf("invalid syntax for integer: %s", bs)
 		// fallback: try to decode as float, and cast
 		if neg {
-			n = d.decUint64ViaFloat(stringView(bs[1:]))
+			n = d.decUint64ViaFloat(bs[1:])
 		} else {
-			n = d.decUint64ViaFloat(stringView(bs))
+			n = d.decUint64ViaFloat(bs)
 		}
 	}
 	if neg {
@@ -888,11 +888,11 @@ func (d *jsonDecDriver) DecodeInt64() (i int64) {
 	return
 }
 
-func (d *jsonDecDriver) decUint64ViaFloat(s string) (u uint64) {
+func (d *jsonDecDriver) decUint64ViaFloat(s []byte) (u uint64) {
 	if len(s) == 0 {
 		return
 	}
-	f, err := strconv.ParseFloat(s, 64)
+	f, err := parseFloat64(s)
 	if err != nil {
 		d.d.errorf("invalid syntax for integer: %s", s)
 		// d.d.errorv(err)
@@ -906,24 +906,37 @@ func (d *jsonDecDriver) decUint64ViaFloat(s string) (u uint64) {
 	return uint64(fi)
 }
 
-func (d *jsonDecDriver) decodeFloat(bitsize int) (f float64) {
-	bs := d.decNumBytes()
-	if len(bs) == 0 {
-		return
-	}
-	f, err := strconv.ParseFloat(stringView(bs), bitsize)
-	if err != nil {
-		d.d.errorv(err)
-	}
-	return
-}
+// func (d *jsonDecDriver) decodeFloat(bitsize int) (f float64) {
+// 	bs := d.decNumBytes()
+// 	if len(bs) == 0 {
+// 		return
+// 	}
+// 	f, err := parseFloat(bs, bitsize)
+// 	if err != nil {
+// 		d.d.errorv(err)
+// 	}
+// 	return
+// }
 
 func (d *jsonDecDriver) DecodeFloat64() (f float64) {
-	return d.decodeFloat(64)
+	// return d.decodeFloat(64)
+	var err error
+	if bs := d.decNumBytes(); len(bs) > 0 {
+		if f, err = parseFloat64(bs); err != nil {
+			d.d.errorv(err)
+		}
+	}
+	return
 }
 
-func (d *jsonDecDriver) DecodeFloat32() (f float64) {
-	return d.decodeFloat(32)
+func (d *jsonDecDriver) DecodeFloat32() (f float32) {
+	var err error
+	if bs := d.decNumBytes(); len(bs) > 0 {
+		if f, err = parseFloat32(bs); err != nil {
+			d.d.errorv(err)
+		}
+	}
+	return
 }
 
 func (d *jsonDecDriver) DecodeExt(rv interface{}, xtag uint64, ext Ext) (realxtag uint64) {
@@ -1183,7 +1196,7 @@ func (d *jsonDecDriver) nakedNum(z *decNaked, bs []byte) (err error) {
 	return
 F:
 	z.v = valueTypeFloat
-	z.f, err = strconv.ParseFloat(stringView(bs), 64)
+	z.f, err = parseFloat64(bs)
 	return
 }