Browse Source

codec: misc fixes - handle error from io.Reader.Read, fix MissingFielder support, refactor for better mid-stack inlining, use custom bufio.Writer impl, etc

bufioDecReader: panic if error from Read
bytesDecReader: panic with io.EOF if all bytes read
bytesDecReader: eliminate field a (available) which is inferred from len and cursor
refactor and setup noinline annotations to get best performance from mid-stack inlining
remove entryType and just use variables bytes and bufio
remove ioEncWriter and use bufioEncWriter exclusively.
pooler can now provide [...]byte

Fix missing field decoding support where shared buffer is clobbered
streamline function entry between decode and encode, to support bytes/bufio/non-bufio
Add bufioEncWriter implementation instead of using bufio.NewWriterSize

test: set testStructToArray=false for most suite tests
Ugorji Nwoke 7 years ago
parent
commit
ce1d126566
7 changed files with 481 additions and 182 deletions
  1. 43 32
      codec/0doc.go
  2. 4 2
      codec/codec_test.go
  3. 142 107
      codec/decode.go
  4. 245 40
      codec/encode.go
  5. 43 1
      codec/helper.go
  6. 1 0
      codec/values_flex_test.go
  7. 3 0
      codec/z_all_test.go

+ 43 - 32
codec/0doc.go

@@ -225,35 +225,46 @@ with some caveats. See Encode documentation.
 */
 package codec
 
-// TODO:
-//   - When mid-stack inlining is enabled, do the following:
-//     - if 41<=inlineExtraCallCost<=56, make ioEncWriter.{writen1,writen2,writestr,writeb,atEndOfEncode} go:noinline
-//     - if <=40, do nothing
-//
-// PUNTED:
-//   - To make Handle comparable, make extHandle in BasicHandle a non-embedded pointer,
-//     and use overlay methods on *BasicHandle to call through to extHandle after initializing
-//     the "xh *extHandle" to point to a real slice.
-//
-// BEFORE EACH RELEASE:
-//   - Look through and fix padding for each type, to eliminate false sharing
-//     - critical shared objects that are read many times
-//       TypeInfos
-//     - pooled objects:
-//       decNaked, decNakedContainers, codecFner, typeInfoLoadArray, 
-//     - small objects allocated independently, that we read/use much across threads:
-//       codecFn, typeInfo
-//     - Objects allocated independently and used a lot
-//       Decoder, Encoder,
-//       xxxHandle, xxxEncDriver, xxxDecDriver (xxx = json, msgpack, cbor, binc, simple)
-//     - In all above, arrange values modified together to be close to each other.
-//
-//     For all of these, either ensure that they occupy full cache lines,
-//     or ensure that the things just past the cache line boundary are hardly read/written
-//     e.g. JsonHandle.RawBytesExt - which is copied into json(En|De)cDriver at init
-//
-//     Occupying full cache lines means they occupy 8*N words (where N is an integer).
-//     Check this out by running: ./run.sh -z
-//     - look at those tagged ****, meaning they are not occupying full cache lines
-//     - look at those tagged <<<<, meaning they are larger than 32 words (something to watch)
-//   - Run "golint -min_confidence 0.81"
+/*
+MID-STACK INLINING:
+  - The code currently works optimally with fully enabled mid-stack inlining.
+    This way, when enabled, we are already performant.
+  - To see how well inlining is working, use the following scripts below:
+
+    myblanklines 20
+    zf=7; go build -gcflags "-m=2 -l=4" > $TMPDIR/a$zf.txt 2>&1
+    for i in decReaderSwitch bytesDecReader bufioDecReader ioDecReader \
+        encWriterSwitch bytesEncAppender bufioEncWriter ioEncWriter
+    do echo ; grep -E "cannot inline \(\*${i}\)." $TMPDIR/a$zf.txt; done
+
+  - When mid-stack inlining is enabled, consider doing the following:
+    - if 41<=inlineExtraCallCost<=56, make (buf)ioEncWriter.{writen1,writen2,writestr,writeb,atEndOfEncode} go:noinline
+    - if <=40, do nothing (no go:noinline)
+
+PUNTED:
+  - To make Handle comparable, make extHandle in BasicHandle a non-embedded pointer,
+    and use overlay methods on *BasicHandle to call through to extHandle after initializing
+    the "xh *extHandle" to point to a real slice.
+
+BEFORE EACH RELEASE:
+  - Look through and fix padding for each type, to eliminate false sharing
+    - critical shared objects that are read many times
+      TypeInfos
+    - pooled objects:
+      decNaked, decNakedContainers, codecFner, typeInfoLoadArray, 
+    - small objects allocated independently, that we read/use much across threads:
+      codecFn, typeInfo
+    - Objects allocated independently and used a lot
+      Decoder, Encoder,
+      xxxHandle, xxxEncDriver, xxxDecDriver (xxx = json, msgpack, cbor, binc, simple)
+    - In all above, arrange values modified together to be close to each other.
+    For all of these, either ensure that they occupy full cache lines,
+    or ensure that the things just past the cache line boundary are hardly read/written
+    e.g. JsonHandle.RawBytesExt - which is copied into json(En|De)cDriver at init
+
+    Occupying full cache lines means they occupy 8*N words (where N is an integer).
+    Check this out by running: ./run.sh -z
+    - look at those tagged ****, meaning they are not occupying full cache lines
+    - look at those tagged <<<<, meaning they are larger than 32 words (something to watch)
+  - Run "golint -min_confidence 0.81"
+*/

+ 4 - 2
codec/codec_test.go

@@ -2330,13 +2330,15 @@ func doTestMissingFields(t *testing.T, name string, h Handle) {
 	// encode missingFielderT2, decode into missingFielderT1, encode it out again, decode into new missingFielderT2, compare
 	v1 := missingFielderT2{S: "true seven eight", B: true, F: 777.0, I: -888}
 	b1 := testMarshalErr(v1, h, t, name+"-missing-enc-2")
-	// xdebugf("b1: %s", b1)
+	// xdebugf("marshal into b1: %s", b1)
 	var v2 missingFielderT1
 	testUnmarshalErr(&v2, b1, h, t, name+"-missing-dec-1")
-	// xdebugf("unmarshal worked")
+	// xdebugf("unmarshal into v2: %v", v2)
 	b2 := testMarshalErr(&v2, h, t, name+"-missing-enc-1")
+	// xdebugf("marshal into b2: %s", b2)
 	var v3 missingFielderT2
 	testUnmarshalErr(&v3, b2, h, t, name+"-missing-dec-2")
+	// xdebugf("unmarshal into v3: %v", v3)
 	testDeepEqualErr(v1, v3, t, name+"-missing-cmp-2")
 }
 

+ 142 - 107
codec/decode.go

@@ -23,8 +23,8 @@ const (
 const (
 	decDefMaxDepth         = 1024 // maximum depth
 	decDefSliceCap         = 8
-	decDefChanCap          = 64 // should be large, as cap cannot be expanded
-	decScratchByteArrayLen = cacheLineSize - (8 * 1)
+	decDefChanCap          = 64            // should be large, as cap cannot be expanded
+	decScratchByteArrayLen = cacheLineSize // - (8 * 1)
 )
 
 var (
@@ -367,9 +367,12 @@ func (z *bufioDecReader) UnreadByte() (err error) {
 }
 
 func (z *bufioDecReader) readx(n int) (bs []byte) {
-	if n <= 0 || z.err != nil {
+	if n <= 0 {
 		return
 	}
+	if z.err != nil {
+		panic(z.err)
+	}
 	if z.c+n <= len(z.buf) {
 		bs = z.buf[z.c : z.c+n]
 		z.n += n
@@ -435,17 +438,14 @@ func (z *bufioDecReader) skip(accept *bitset256) (token byte) {
 	if z.trb {
 		z.tr = append(z.tr, z.buf[z.c:]...)
 	}
-	if z.err != nil {
-		return 0
-	}
 	var n2 int
 	for {
+		if z.err != nil {
+			panic(z.err)
+		}
 		z.c = 0
 		z.buf = z.buf[0:cap(z.buf)]
 		n2, z.err = z.r.Read(z.buf)
-		if n2 > 0 && z.err != nil {
-			z.err = nil
-		}
 		z.buf = z.buf[:n2]
 		for i := 0; i < n2; i++ {
 			if token = z.buf[i]; !accept.isset(token) {
@@ -454,9 +454,6 @@ func (z *bufioDecReader) skip(accept *bitset256) (token byte) {
 			}
 		}
 		z.n += n2
-		if z.err != nil {
-			return 0
-		}
 		if z.trb {
 			z.tr = append(z.tr, z.buf[:n2]...)
 		}
@@ -489,17 +486,17 @@ func (z *bufioDecReader) readTo(in []byte, accept *bitset256) (out []byte) {
 	if z.trb {
 		z.tr = append(z.tr, z.buf[z.c:]...)
 	}
-	if z.err != nil {
-		return
-	}
 	var n2 int
 	for {
+		if z.err != nil {
+			if z.err == io.EOF {
+				return // readTo should read until it matches or end is reached
+			}
+			panic(z.err)
+		}
 		z.c = 0
 		z.buf = z.buf[0:cap(z.buf)]
 		n2, z.err = z.r.Read(z.buf)
-		if n2 > 0 && z.err != nil {
-			z.err = nil
-		}
 		z.buf = z.buf[:n2]
 		for i := 0; i < n2; i++ {
 			if !accept.isset(z.buf[i]) {
@@ -508,9 +505,6 @@ func (z *bufioDecReader) readTo(in []byte, accept *bitset256) (out []byte) {
 		}
 		out = append(out, z.buf[:n2]...)
 		z.n += n2
-		if z.err != nil {
-			return
-		}
 		if z.trb {
 			z.tr = append(z.tr, z.buf[:n2]...)
 		}
@@ -544,17 +538,14 @@ func (z *bufioDecReader) readUntil(in []byte, stop byte) (out []byte) {
 	if z.trb {
 		z.tr = append(z.tr, z.buf[z.c:]...)
 	}
-	if z.err != nil {
-		return
-	}
 	var n2 int
 	for {
+		if z.err != nil {
+			panic(z.err)
+		}
 		z.c = 0
 		z.buf = z.buf[0:cap(z.buf)]
 		n2, z.err = z.r.Read(z.buf)
-		if n2 > 0 && z.err != nil {
-			z.err = nil
-		}
 		z.buf = z.buf[:n2]
 		for i := 0; i < n2; i++ {
 			if z.buf[i] == stop {
@@ -563,9 +554,6 @@ func (z *bufioDecReader) readUntil(in []byte, stop byte) (out []byte) {
 		}
 		out = append(out, z.buf[:n2]...)
 		z.n += n2
-		if z.err != nil {
-			return
-		}
 		if z.trb {
 			z.tr = append(z.tr, z.buf[:n2]...)
 		}
@@ -695,6 +683,7 @@ func (z *ioDecReader) UnreadByte() (err error) {
 	return
 }
 
+// //go:noinline
 func (z *ioDecReader) readx(n int) (bs []byte) {
 	if n <= 0 {
 		return
@@ -714,6 +703,7 @@ func (z *ioDecReader) readx(n int) (bs []byte) {
 	return
 }
 
+// //go:noinline
 func (z *ioDecReader) readb(bs []byte) {
 	if len(bs) == 0 {
 		return
@@ -727,6 +717,7 @@ func (z *ioDecReader) readb(bs []byte) {
 	}
 }
 
+// //go:noinline
 func (z *ioDecReader) readn1eof() (b uint8, eof bool) {
 	b, err := z.br.ReadByte()
 	if err == nil {
@@ -742,6 +733,7 @@ func (z *ioDecReader) readn1eof() (b uint8, eof bool) {
 	return
 }
 
+// //go:noinline
 func (z *ioDecReader) readn1() (b uint8) {
 	b, err := z.br.ReadByte()
 	if err == nil {
@@ -754,6 +746,7 @@ func (z *ioDecReader) readn1() (b uint8) {
 	panic(err)
 }
 
+// //go:noinline
 func (z *ioDecReader) skip(accept *bitset256) (token byte) {
 	var eof bool
 	// for {
@@ -850,13 +843,13 @@ var errBytesDecReaderCannotUnread = errors.New("cannot unread last byte read")
 type bytesDecReader struct {
 	b []byte // data
 	c int    // cursor
-	a int    // available
 	t int    // track start
+	// a int    // available
 }
 
 func (z *bytesDecReader) reset(in []byte) {
 	z.b = in
-	z.a = len(in)
+	// z.a = len(in)
 	z.c = 0
 	z.t = 0
 }
@@ -870,7 +863,7 @@ func (z *bytesDecReader) unreadn1() {
 		panic(errBytesDecReaderCannotUnread)
 	}
 	z.c--
-	z.a++
+	// z.a++
 	return
 }
 
@@ -896,14 +889,14 @@ func (z *bytesDecReader) readx(n int) (bs []byte) {
 	if n <= 0 {
 		return
 	}
-	if z.a == 0 {
+	if z.c == len(z.b) {
 		panic(io.EOF)
 	}
-	if n > z.a {
+	if n > len(z.b)-z.c {
 		panic(io.ErrUnexpectedEOF)
 	}
 
-	z.a -= n
+	// z.a -= n
 	z.c += n
 	return z.b[z.c-n : z.c]
 }
@@ -913,12 +906,12 @@ func (z *bytesDecReader) readb(bs []byte) {
 }
 
 func (z *bytesDecReader) readn1() (v uint8) {
-	if z.a == 0 {
+	if z.c == len(z.b) {
 		panic(io.EOF)
 	}
 	v = z.b[z.c]
 	z.c++
-	z.a--
+	// z.a--
 	return
 }
 
@@ -935,10 +928,12 @@ func (z *bytesDecReader) readn1() (v uint8) {
 
 // // go:noinline
 func (z *bytesDecReader) skip(accept *bitset256) (token byte) {
-	if z.a == 0 {
-		return
-	}
+	i := z.c
 	blen := len(z.b)
+	if z.c == blen {
+		goto END
+		// panic(io.EOF)
+	}
 	// Replace loop with goto construct, so that this can be inlined
 	// for i := z.c; i < blen; i++ {
 	// 	if !accept.isset(z.b[i]) {
@@ -949,29 +944,32 @@ func (z *bytesDecReader) skip(accept *bitset256) (token byte) {
 	// 		return
 	// 	}
 	// }
-	i := z.c
+
+	// i := z.c
 LOOP:
 	if i < blen {
-		if accept.isset(z.b[i]) {
-			i++
-			goto LOOP
-		}
 		token = z.b[i]
 		i++
-		z.a -= (i - z.c)
+		if accept.isset(token) {
+			goto LOOP
+		}
+		// z.a -= (i - z.c)
 		z.c = i
 		return
 	}
-	z.a, z.c = 0, blen
-	return
+END:
+	panic(io.EOF)
+	// // z.a = 0
+	// z.c = blen
+	// return
 }
 
 // // go:noinline
 func (z *bytesDecReader) readTo(_ []byte, accept *bitset256) (out []byte) {
-	if z.a == 0 {
-		return
-	}
 	blen := len(z.b)
+	if z.c == blen {
+		panic(io.EOF)
+	}
 
 	// Replace loop with goto construct, so that this can be inlined
 	// for i := z.c; i < blen; i++ {
@@ -1005,6 +1003,7 @@ func (z *bytesDecReader) readTo(_ []byte, accept *bitset256) (out []byte) {
 	// 	return
 
 	i := z.c
+	// c := i
 LOOP:
 	if i < blen {
 		if accept.isset(z.b[i]) {
@@ -1013,17 +1012,19 @@ LOOP:
 		}
 	}
 	out = z.b[z.c:i]
-	z.a -= (i - z.c)
+	// z.a -= (i - z.c)
 	z.c = i
-	return
+	return // z.b[c:i]
+	// z.c, i = i, z.c
+	// return z.b[i:z.c]
 }
 
 // // go:noinline
 func (z *bytesDecReader) readUntil(_ []byte, stop byte) (out []byte) {
-	if z.a == 0 {
+	blen := len(z.b)
+	if z.c == blen {
 		panic(io.EOF)
 	}
-	blen := len(z.b)
 	// Replace loop with goto construct, so that this can be inlined
 	// for i := z.c; i < blen; i++ {
 	// 	if z.b[i] == stop {
@@ -1040,14 +1041,15 @@ LOOP:
 		if z.b[i] == stop {
 			i++
 			out = z.b[z.c:i]
-			z.a -= (i - z.c)
+			// z.a -= (i - z.c)
 			z.c = i
 			return
 		}
 		i++
 		goto LOOP
 	}
-	z.a, z.c = 0, blen
+	// z.a = 0
+	// z.c = blen
 	panic(io.EOF)
 }
 
@@ -1362,8 +1364,15 @@ func (d *Decoder) kStruct(f *codecFnInfo, rv reflect.Value) {
 					d.decodeValue(sfn.field(si), nil, true)
 				}
 			} else if mf != nil {
+				// store rvkencname in new []byte, as it previously shares Decoder.b, which is used in decode
+				name2 := rvkencname
+				rvkencname = make([]byte, len(rvkencname))
+				copy(rvkencname, name2)
+
 				var f interface{}
+				// xdebugf("kStruct: mf != nil: before decode: rvkencname: %s", rvkencname)
 				d.decode(&f)
+				// xdebugf("kStruct: mf != nil: after decode: rvkencname: %s", rvkencname)
 				if !mf.CodecMissingField(rvkencname, f) && d.h.ErrorIfNoField {
 					d.errorf("no matching struct field found when decoding stream map with key: %s ",
 						stringView(rvkencname))
@@ -1924,15 +1933,6 @@ type rtid2rv struct {
 
 // --------------
 
-type decReaderType uint8
-
-const (
-	decReaderTypeBytes decReaderType = iota // make this 0, so a comparison is cheap
-	decReaderTypeIo
-	decReaderTypeBufio
-	decReaderTypeUnset = 255
-)
-
 type decReaderSwitch struct {
 	rb bytesDecReader
 	// ---- cpu cache line boundary?
@@ -1946,18 +1946,19 @@ type decReaderSwitch struct {
 	jsms bool // is json handle, and MapKeyAsString
 	esep bool // has elem separators
 
-	bytes bool
-	typ   decReaderType
-	// bytes bool // is bytes reader
-	// bufio bool // is this a bufioDecReader?
+	// typ   entryType
+	bytes bool // is bytes reader
+	bufio bool // is this a bufioDecReader?
 }
 
-// these first 3 functions all always inlined, as they just check int fields, etc.
+// numread, track and stopTrack are always inlined, as they just check int fields, etc.
+
+/*
 func (z *decReaderSwitch) numread() int {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		return z.rb.numread()
-	case decReaderTypeIo:
+	case entryTypeIo:
 		return z.ri.numread()
 	default:
 		return z.bi.numread()
@@ -1965,9 +1966,9 @@ func (z *decReaderSwitch) numread() int {
 }
 func (z *decReaderSwitch) track() {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		z.rb.track()
-	case decReaderTypeIo:
+	case entryTypeIo:
 		z.ri.track()
 	default:
 		z.bi.track()
@@ -1975,9 +1976,9 @@ func (z *decReaderSwitch) track() {
 }
 func (z *decReaderSwitch) stopTrack() []byte {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		return z.rb.stopTrack()
-	case decReaderTypeIo:
+	case entryTypeIo:
 		return z.ri.stopTrack()
 	default:
 		return z.bi.stopTrack()
@@ -1986,9 +1987,9 @@ func (z *decReaderSwitch) stopTrack() []byte {
 
 func (z *decReaderSwitch) unreadn1() {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		z.rb.unreadn1()
-	case decReaderTypeIo:
+	case entryTypeIo:
 		z.ri.unreadn1()
 	default:
 		z.bi.unreadn1()
@@ -1996,9 +1997,9 @@ func (z *decReaderSwitch) unreadn1() {
 }
 func (z *decReaderSwitch) readx(n int) []byte {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		return z.rb.readx(n)
-	case decReaderTypeIo:
+	case entryTypeIo:
 		return z.ri.readx(n)
 	default:
 		return z.bi.readx(n)
@@ -2006,9 +2007,9 @@ func (z *decReaderSwitch) readx(n int) []byte {
 }
 func (z *decReaderSwitch) readb(s []byte) {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		z.rb.readb(s)
-	case decReaderTypeIo:
+	case entryTypeIo:
 		z.ri.readb(s)
 	default:
 		z.bi.readb(s)
@@ -2016,9 +2017,9 @@ func (z *decReaderSwitch) readb(s []byte) {
 }
 func (z *decReaderSwitch) readn1() uint8 {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		return z.rb.readn1()
-	case decReaderTypeIo:
+	case entryTypeIo:
 		return z.ri.readn1()
 	default:
 		return z.bi.readn1()
@@ -2026,9 +2027,9 @@ func (z *decReaderSwitch) readn1() uint8 {
 }
 func (z *decReaderSwitch) skip(accept *bitset256) (token byte) {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		return z.rb.skip(accept)
-	case decReaderTypeIo:
+	case entryTypeIo:
 		return z.ri.skip(accept)
 	default:
 		return z.bi.skip(accept)
@@ -2036,9 +2037,9 @@ func (z *decReaderSwitch) skip(accept *bitset256) (token byte) {
 }
 func (z *decReaderSwitch) readTo(in []byte, accept *bitset256) (out []byte) {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		return z.rb.readTo(in, accept)
-	case decReaderTypeIo:
+	case entryTypeIo:
 		return z.ri.readTo(in, accept)
 	default:
 		return z.bi.readTo(in, accept)
@@ -2046,16 +2047,16 @@ func (z *decReaderSwitch) readTo(in []byte, accept *bitset256) (out []byte) {
 }
 func (z *decReaderSwitch) readUntil(in []byte, stop byte) (out []byte) {
 	switch z.typ {
-	case decReaderTypeBytes:
+	case entryTypeBytes:
 		return z.rb.readUntil(in, stop)
-	case decReaderTypeIo:
+	case entryTypeIo:
 		return z.ri.readUntil(in, stop)
 	default:
 		return z.bi.readUntil(in, stop)
 	}
 }
 
-/*
+*/
 
 // the if/else-if/else block is expensive to inline.
 // Each node of this construct costs a lot and dominates the budget.
@@ -2066,6 +2067,34 @@ func (z *decReaderSwitch) readUntil(in []byte, stop byte) (out []byte) {
 // This allows for the inlining of the common path when z.bytes=true.
 // Go 1.12+ supports inlining methods with up to 1 inlined function (or 2 if no other constructs).
 
+func (z *decReaderSwitch) numread() int {
+	if z.bytes {
+		return z.rb.numread()
+	} else if z.bufio {
+		return z.bi.numread()
+	} else {
+		return z.ri.numread()
+	}
+}
+func (z *decReaderSwitch) track() {
+	if z.bytes {
+		z.rb.track()
+	} else if z.bufio {
+		z.bi.track()
+	} else {
+		z.ri.track()
+	}
+}
+func (z *decReaderSwitch) stopTrack() []byte {
+	if z.bytes {
+		return z.rb.stopTrack()
+	} else if z.bufio {
+		return z.bi.stopTrack()
+	} else {
+		return z.ri.stopTrack()
+	}
+}
+
 func (z *decReaderSwitch) unreadn1() {
 	if z.bytes {
 		z.rb.unreadn1()
@@ -2074,7 +2103,7 @@ func (z *decReaderSwitch) unreadn1() {
 	}
 }
 
-//go:noinline
+// //go:noinline
 func (z *decReaderSwitch) unreadn1IO() {
 	if z.bufio {
 		z.bi.unreadn1()
@@ -2089,7 +2118,7 @@ func (z *decReaderSwitch) readx(n int) []byte {
 	return z.readxIO(n)
 }
 
-//go:noinline
+// //go:noinline
 func (z *decReaderSwitch) readxIO(n int) []byte {
 	if z.bufio {
 		return z.bi.readx(n)
@@ -2119,7 +2148,7 @@ func (z *decReaderSwitch) readn1() uint8 {
 	return z.readn1IO()
 }
 
-//go:noinline
+// //go:noinline
 func (z *decReaderSwitch) readn1IO() uint8 {
 	if z.bufio {
 		return z.bi.readn1()
@@ -2133,7 +2162,7 @@ func (z *decReaderSwitch) skip(accept *bitset256) (token byte) {
 	return z.skipIO(accept)
 }
 
-//go:noinline
+// //go:noinline
 func (z *decReaderSwitch) skipIO(accept *bitset256) (token byte) {
 	if z.bufio {
 		return z.bi.skip(accept)
@@ -2161,7 +2190,7 @@ func (z *decReaderSwitch) readUntil(in []byte, stop byte) (out []byte) {
 	return z.readUntilIO(in, stop)
 }
 
-//go:noinline
+// //go:noinline
 func (z *decReaderSwitch) readUntilIO(in []byte, stop byte) (out []byte) {
 	if z.bufio {
 		return z.bi.readUntil(in, stop)
@@ -2169,8 +2198,6 @@ func (z *decReaderSwitch) readUntilIO(in []byte, stop byte) (out []byte) {
 	return z.ri.readUntil(in, stop)
 }
 
-*/
-
 // A Decoder reads and decodes an object from an input stream in the codec format.
 type Decoder struct {
 	panicHdl
@@ -2196,6 +2223,7 @@ type Decoder struct {
 	// ---- cpu cache line boundary?
 	decReaderSwitch
 
+	// ---- cpu cache line boundary?
 	codecFnPooler
 	// cr containerStateRecv
 	err error
@@ -2204,10 +2232,11 @@ type Decoder struct {
 	maxdepth int16
 	_        [4]uint8 // padding
 
+	is map[string]string // used for interning strings
+
 	// ---- cpu cache line boundary?
 	b [decScratchByteArrayLen]byte // scratch buffer, used by Decoder and xxxEncDrivers
 
-	is map[string]string // used for interning strings
 	// padding - false sharing help // modify 232 if Decoder struct changes.
 	// _ [cacheLineSize - 232%cacheLineSize]byte
 }
@@ -2282,7 +2311,7 @@ func (d *Decoder) Reset(r io.Reader) {
 		return
 	}
 	d.bytes = false
-	d.typ = decReaderTypeUnset
+	// d.typ = entryTypeUnset
 	if d.h.ReaderBufferSize > 0 {
 		if d.bi == nil {
 			d.bi = new(bufioDecReader)
@@ -2294,7 +2323,8 @@ func (d *Decoder) Reset(r io.Reader) {
 		}
 		d.bi.reset(r)
 		// d.r = d.bi
-		d.typ = decReaderTypeBufio
+		// d.typ = entryTypeBufio
+		d.bufio = true
 	} else {
 		// d.ri.x = &d.b
 		// d.s = d.sa[:0]
@@ -2303,7 +2333,8 @@ func (d *Decoder) Reset(r io.Reader) {
 		}
 		d.ri.reset(r)
 		// d.r = d.ri
-		d.typ = decReaderTypeIo
+		// d.typ = entryTypeIo
+		d.bufio = false
 	}
 	d.resetCommon()
 }
@@ -2315,7 +2346,8 @@ func (d *Decoder) ResetBytes(in []byte) {
 		return
 	}
 	d.bytes = true
-	d.typ = decReaderTypeBytes
+	d.bufio = false
+	// d.typ = entryTypeBytes
 	d.rb.reset(in)
 	// d.r = &d.rb
 	d.resetCommon()
@@ -2402,9 +2434,12 @@ func (d *Decoder) naked() *decNaked {
 // Note: we allow nil values in the stream anywhere except for map keys.
 // A nil value in the encoded stream where a map key is expected is treated as an error.
 func (d *Decoder) Decode(v interface{}) (err error) {
+	// tried to use closure, as runtime optimizes defer with no params.
+	// This seemed to be causing weird issues (like circular reference found, unexpected panic, etc).
+	// Also, see https://github.com/golang/go/issues/14939#issuecomment-417836139
+	// defer func() { d.deferred(&err) }()
+	// { x, y := d, &err; defer func() { x.deferred(y) }() }
 	defer d.deferred(&err)
-	// defer func() { d.deferred(&err) }() // use closure, as runtime optimizes defer with no params
-	// { x := d; y := &err; defer func() { x.deferred(y) }() } // https://github.com/golang/go/issues/14939#issuecomment-417836139
 	d.MustDecode(v)
 	return
 }

+ 245 - 40
codec/encode.go

@@ -4,7 +4,6 @@
 package codec
 
 import (
-	"bufio"
 	"encoding"
 	"errors"
 	"fmt"
@@ -66,10 +65,6 @@ type encDriver interface {
 	atEndOfEncode()
 }
 
-type ioEncStringWriter interface {
-	WriteString(s string) (n int, err error)
-}
-
 type encDriverAsis interface {
 	EncodeAsis(v []byte)
 }
@@ -186,6 +181,12 @@ type EncodeOptions struct {
 
 // ---------------------------------------------
 
+/*
+
+type ioEncStringWriter interface {
+	WriteString(s string) (n int, err error)
+}
+
 // ioEncWriter implements encWriter and can write to an io.Writer implementation
 type ioEncWriter struct {
 	w  io.Writer
@@ -196,6 +197,19 @@ type ioEncWriter struct {
 	b  [8]byte
 }
 
+func (z *ioEncWriter) reset(w io.Writer) {
+	z.w = w
+	var ok bool
+	if z.bw, ok = w.(io.ByteWriter); !ok {
+		z.bw = z
+	}
+	if z.sw, ok = w.(ioEncStringWriter); !ok {
+		z.sw = z
+	}
+	z.fw, _ = w.(ioFlusher)
+	z.ww = w
+}
+
 func (z *ioEncWriter) WriteByte(b byte) (err error) {
 	z.b[0] = b
 	_, err = z.w.Write(z.b[:1])
@@ -250,6 +264,96 @@ func (z *ioEncWriter) atEndOfEncode() {
 	}
 }
 
+*/
+
+// ---------------------------------------------
+
+// bufioEncWriter
+type bufioEncWriter struct {
+	buf []byte
+	w   io.Writer
+	n   int
+	// _   [2]uint64 // padding
+	// a int
+	// b   [4]byte
+	// err
+}
+
+func (z *bufioEncWriter) reset(w io.Writer, bufsize int) {
+	z.w = w
+	z.n = 0
+	if bufsize == 0 {
+		z.buf = make([]byte, 256)
+	} else if cap(z.buf) < bufsize {
+		z.buf = make([]byte, bufsize)
+	} else {
+		z.buf = z.buf[:bufsize]
+	}
+}
+
+//go:noinline
+func (z *bufioEncWriter) flush() {
+	n, err := z.w.Write(z.buf[:z.n])
+	z.n -= n
+	if z.n > 0 && err == nil {
+		err = io.ErrShortWrite
+	}
+	if err != nil {
+		if n > 0 && z.n > 0 {
+			copy(z.buf, z.buf[n:z.n+n])
+		}
+		panic(err)
+	}
+}
+
+func (z *bufioEncWriter) writeb(s []byte) {
+LOOP:
+	a := len(z.buf) - z.n
+	if len(s) > a {
+		z.n += copy(z.buf[z.n:], s[:a])
+		s = s[a:]
+		z.flush()
+		goto LOOP
+	}
+	z.n += copy(z.buf[z.n:], s)
+}
+
+func (z *bufioEncWriter) writestr(s string) {
+	// z.writeb(bytesView(s)) // inlined below
+LOOP:
+	a := len(z.buf) - z.n
+	if len(s) > a {
+		z.n += copy(z.buf[z.n:], s[:a])
+		s = s[a:]
+		z.flush()
+		goto LOOP
+	}
+	z.n += copy(z.buf[z.n:], s)
+}
+
+func (z *bufioEncWriter) writen1(b1 byte) {
+	if 1 > len(z.buf)-z.n {
+		z.flush()
+	}
+	z.buf[z.n] = b1
+	z.n++
+}
+
+func (z *bufioEncWriter) writen2(b1, b2 byte) {
+	if 2 > len(z.buf)-z.n {
+		z.flush()
+	}
+	z.buf[z.n+1] = b2
+	z.buf[z.n] = b1
+	z.n += 2
+}
+
+func (z *bufioEncWriter) atEndOfEncode() {
+	if z.n > 0 {
+		z.flush()
+	}
+}
+
 // ---------------------------------------------
 
 // bytesEncAppender implements encWriter and can write to a byte slice.
@@ -967,17 +1071,111 @@ func (e *Encoder) kMapCanonical(rtkey reflect.Type, rv reflect.Value, mks []refl
 // // --------------------------------------------------
 
 type encWriterSwitch struct {
-	wi   *ioEncWriter
-	wb   bytesEncAppender
-	wx   bool      // if bytes, wx=true
-	esep bool      // whether it has elem separators
-	isas bool      // whether e.as != nil
-	js   bool      // here, so that no need to piggy back on *codecFner for this
-	be   bool      // here, so that no need to piggy back on *codecFner for this
-	_    [3]byte   // padding
-	_    [2]uint64 // padding
+	// wi   *ioEncWriter
+	wf bufioEncWriter
+	wb bytesEncAppender
+	// typ  entryType
+	wx   bool    // if bytes, wx=true
+	esep bool    // whether it has elem separators
+	isas bool    // whether e.as != nil
+	js   bool    // captured here, so that no need to piggy back on *codecFner for this
+	be   bool    // captured here, so that no need to piggy back on *codecFner for this
+	_    [2]byte // padding
+	// _    [2]uint64 // padding
+	// _    uint64    // padding
 }
 
+func (z *encWriterSwitch) writeb(s []byte) {
+	if z.wx {
+		z.wb.writeb(s)
+	} else {
+		z.wf.writeb(s)
+	}
+}
+func (z *encWriterSwitch) writestr(s string) {
+	if z.wx {
+		z.wb.writestr(s)
+	} else {
+		z.wf.writestr(s)
+	}
+}
+func (z *encWriterSwitch) writen1(b1 byte) {
+	if z.wx {
+		z.wb.writen1(b1)
+	} else {
+		z.wf.writen1(b1)
+	}
+}
+func (z *encWriterSwitch) writen2(b1, b2 byte) {
+	if z.wx {
+		z.wb.writen2(b1, b2)
+	} else {
+		z.wf.writen2(b1, b2)
+	}
+}
+func (z *encWriterSwitch) atEndOfEncode() {
+	if z.wx {
+		z.wb.atEndOfEncode()
+	} else {
+		z.wf.atEndOfEncode()
+	}
+}
+
+/*
+
+// ------------------------------------------
+func (z *encWriterSwitch) writeb(s []byte) {
+	switch z.typ {
+	case entryTypeBytes:
+		z.wb.writeb(s)
+	case entryTypeIo:
+		z.wi.writeb(s)
+	default:
+		z.wf.writeb(s)
+	}
+}
+func (z *encWriterSwitch) writestr(s string) {
+	switch z.typ {
+	case entryTypeBytes:
+		z.wb.writestr(s)
+	case entryTypeIo:
+		z.wi.writestr(s)
+	default:
+		z.wf.writestr(s)
+	}
+}
+func (z *encWriterSwitch) writen1(b1 byte) {
+	switch z.typ {
+	case entryTypeBytes:
+		z.wb.writen1(b1)
+	case entryTypeIo:
+		z.wi.writen1(b1)
+	default:
+		z.wf.writen1(b1)
+	}
+}
+func (z *encWriterSwitch) writen2(b1, b2 byte) {
+	switch z.typ {
+	case entryTypeBytes:
+		z.wb.writen2(b1, b2)
+	case entryTypeIo:
+		z.wi.writen2(b1, b2)
+	default:
+		z.wf.writen2(b1, b2)
+	}
+}
+func (z *encWriterSwitch) atEndOfEncode() {
+	switch z.typ {
+	case entryTypeBytes:
+		z.wb.atEndOfEncode()
+	case entryTypeIo:
+		z.wi.atEndOfEncode()
+	default:
+		z.wf.atEndOfEncode()
+	}
+}
+
+// ------------------------------------------
 func (z *encWriterSwitch) writeb(s []byte) {
 	if z.wx {
 		z.wb.writeb(s)
@@ -1014,6 +1212,8 @@ func (z *encWriterSwitch) atEndOfEncode() {
 	}
 }
 
+*/
+
 // An Encoder writes an object to an output stream in the codec format.
 type Encoder struct {
 	panicHdl
@@ -1031,10 +1231,9 @@ type Encoder struct {
 
 	h *BasicHandle
 
-	// ---- cpu cache line boundary?
+	// ---- cpu cache line boundary? + 3
 	encWriterSwitch
 
-	// ---- cpu cache line boundary?
 	codecFnPooler
 	ci set
 
@@ -1043,7 +1242,7 @@ type Encoder struct {
 	// ---- cpu cache line boundary?
 	// b [scratchByteArrayLen]byte
 	// _ [cacheLineSize - scratchByteArrayLen]byte // padding
-	b [cacheLineSize + 8]byte // used for encoding a chan or (non-addressable) array of bytes
+	b [cacheLineSize - (8 * 2)]byte // used for encoding a chan or (non-addressable) array of bytes
 }
 
 // NewEncoder returns an Encoder for encoding into an io.Writer.
@@ -1096,28 +1295,30 @@ func (e *Encoder) Reset(w io.Writer) {
 	if w == nil {
 		return
 	}
-	if e.wi == nil {
-		e.wi = new(ioEncWriter)
-	}
-	var ok bool
+	// var ok bool
 	e.wx = false
-	e.wi.w = w
-	if e.h.WriterBufferSize > 0 {
-		bw := bufio.NewWriterSize(w, e.h.WriterBufferSize)
-		e.wi.bw = bw
-		e.wi.sw = bw
-		e.wi.fw = bw
-		e.wi.ww = bw
-	} else {
-		if e.wi.bw, ok = w.(io.ByteWriter); !ok {
-			e.wi.bw = e.wi
-		}
-		if e.wi.sw, ok = w.(ioEncStringWriter); !ok {
-			e.wi.sw = e.wi
-		}
-		e.wi.fw, _ = w.(ioFlusher)
-		e.wi.ww = w
-	}
+	// e.typ = entryTypeUnset
+	// if e.h.WriterBufferSize > 0 {
+	// 	// bw := bufio.NewWriterSize(w, e.h.WriterBufferSize)
+	// 	// e.wi.bw = bw
+	// 	// e.wi.sw = bw
+	// 	// e.wi.fw = bw
+	// 	// e.wi.ww = bw
+	// 	if e.wf == nil {
+	// 		e.wf = new(bufioEncWriter)
+	// 	}
+	// 	e.wf.reset(w, e.h.WriterBufferSize)
+	// 	e.typ = entryTypeBufio
+	// } else {
+	// 	if e.wi == nil {
+	// 		e.wi = new(ioEncWriter)
+	// 	}
+	// 	e.wi.reset(w)
+	// 	e.typ = entryTypeIo
+	// }
+	e.wf.reset(w, e.h.WriterBufferSize)
+	// e.typ = entryTypeBufio
+
 	// e.w = e.wi
 	e.resetCommon()
 }
@@ -1135,6 +1336,7 @@ func (e *Encoder) ResetBytes(out *[]byte) {
 		in = make([]byte, defEncByteBufSize)
 	}
 	e.wx = true
+	// e.typ = entryTypeBytes
 	e.wb.reset(in, out)
 	// e.w = &e.wb
 	e.resetCommon()
@@ -1224,9 +1426,12 @@ func (e *Encoder) ResetBytes(out *[]byte) {
 // Some formats support symbols (e.g. binc) and will properly encode the string
 // only once in the stream, and use a tag to refer to it thereafter.
 func (e *Encoder) Encode(v interface{}) (err error) {
+	// tried to use closure, as runtime optimizes defer with no params.
+	// This seemed to be causing weird issues (like circular reference found, unexpected panic, etc).
+	// Also, see https://github.com/golang/go/issues/14939#issuecomment-417836139
+	// defer func() { e.deferred(&err) }()
+	// { x, y := e, &err; defer func() { x.deferred(y) }() }
 	defer e.deferred(&err)
-	// defer func() { e.deferred(&err) }() } // use closure, as runtime optimizes defer with no params
-	// { x := e; y := &err; defer func() { x.deferred(y) }() } // https://github.com/golang/go/issues/14939#issuecomment-417836139
 	e.MustEncode(v)
 	return
 }

+ 43 - 1
codec/helper.go

@@ -163,6 +163,15 @@ type clsErr struct {
 	errClosed error // error on closing
 }
 
+// type entryType uint8
+
+// const (
+// 	entryTypeBytes entryType = iota // make this 0, so a comparison is cheap
+// 	entryTypeIo
+// 	entryTypeBufio
+// 	entryTypeUnset = 255
+// )
+
 type charEncoding uint8
 
 const (
@@ -2342,8 +2351,9 @@ func (x *bitset32) set(pos byte) {
 type pooler struct {
 	dn                                          sync.Pool // for decNaked
 	cfn                                         sync.Pool // for codecFner
-	tiload                                      sync.Pool
+	tiload                                      sync.Pool // for type info loading
 	strRv8, strRv16, strRv32, strRv64, strRv128 sync.Pool // for stringRV
+	buf64, buf128, buf256, buf512, buf1024      sync.Pool // for [...]byte
 }
 
 func (p *pooler) init() {
@@ -2352,8 +2362,17 @@ func (p *pooler) init() {
 	p.strRv32.New = func() interface{} { return new([32]sfiRv) }
 	p.strRv64.New = func() interface{} { return new([64]sfiRv) }
 	p.strRv128.New = func() interface{} { return new([128]sfiRv) }
+
+	p.buf64.New = func() interface{} { return new([64]byte) }
+	p.buf128.New = func() interface{} { return new([128]byte) }
+	p.buf256.New = func() interface{} { return new([256]byte) }
+	p.buf512.New = func() interface{} { return new([512]byte) }
+	p.buf1024.New = func() interface{} { return new([1024]byte) }
+
 	p.dn.New = func() interface{} { x := new(decNaked); x.init(); return x }
+
 	p.tiload.New = func() interface{} { return new(typeInfoLoadArray) }
+
 	p.cfn.New = func() interface{} { return new(codecFner) }
 }
 
@@ -2372,6 +2391,23 @@ func (p *pooler) sfiRv64() (sp *sync.Pool, v interface{}) {
 func (p *pooler) sfiRv128() (sp *sync.Pool, v interface{}) {
 	return &p.strRv128, p.strRv128.Get()
 }
+
+func (p *pooler) bytes64() (sp *sync.Pool, v interface{}) {
+	return &p.buf64, p.buf64.Get()
+}
+func (p *pooler) bytes128() (sp *sync.Pool, v interface{}) {
+	return &p.buf128, p.buf128.Get()
+}
+func (p *pooler) bytes256() (sp *sync.Pool, v interface{}) {
+	return &p.buf256, p.buf256.Get()
+}
+func (p *pooler) bytes512() (sp *sync.Pool, v interface{}) {
+	return &p.buf512, p.buf512.Get()
+}
+func (p *pooler) bytes1024() (sp *sync.Pool, v interface{}) {
+	return &p.buf1024, p.buf1024.Get()
+}
+
 func (p *pooler) decNaked() (sp *sync.Pool, v interface{}) {
 	return &p.dn, p.dn.Get()
 }
@@ -2406,6 +2442,8 @@ func (p *pooler) tiLoad() (sp *sync.Pool, v interface{}) {
 // 	p.tiload.Put(v)
 // }
 
+// ----------------------------------------------------
+
 type panicHdl struct{}
 
 func (panicHdl) errorv(err error) {
@@ -2429,6 +2467,8 @@ func (panicHdl) errorf(format string, params ...interface{}) {
 	}
 }
 
+// ----------------------------------------------------
+
 type errDecorator interface {
 	wrapErr(in interface{}, out *error)
 }
@@ -2437,6 +2477,8 @@ type errDecoratorDef struct{}
 
 func (errDecoratorDef) wrapErr(v interface{}, e *error) { *e = fmt.Errorf("%v", v) }
 
+// ----------------------------------------------------
+
 type must struct{}
 
 func (must) String(s string, err error) string {

+ 1 - 0
codec/values_flex_test.go

@@ -75,6 +75,7 @@ type missingFielderT1 struct {
 }
 
 func (t *missingFielderT1) CodecMissingField(field []byte, value interface{}) bool {
+	// xdebugf(">> calling CodecMissingField with field: %s, value: %v", field, value)
 	switch string(field) {
 	case "F":
 		t.f = value.(float64)

+ 3 - 0
codec/z_all_test.go

@@ -68,6 +68,7 @@ func testSuite(t *testing.T, f func(t *testing.T)) {
 	testUseMust = true
 	testInternStr = true
 	testUseIoEncDec = 0
+	// xdebugf("setting StructToArray=true")
 	testStructToArray = true
 	testCheckCircRef = true
 	testUseReset = true
@@ -76,6 +77,8 @@ func testSuite(t *testing.T, f func(t *testing.T)) {
 	testReinit()
 	t.Run("optionsTrue", f)
 
+	// xdebugf("setting StructToArray=false")
+	testStructToArray = false
 	testDepth = 6
 	testReinit()
 	t.Run("optionsTrue-deepstruct", f)