Explorar o código

codec: only maintain pools for long-lived objects shared across goroutines

Previously, we used sync.Pool for everything
- getting buffer of bytes
- creating type info
- canonical encoding of structs
- etc

However, sync.Pool should only be used to relieve GC pressure when
multiple goroutines are using shared objects.
They should not be used for short-lived objects, but a free list should be used instead.

Encoders, Decoders and other objects that live out their lifetime within a
single goroutine should thus not use sync.Pool, but rather use free lists.

For these, we now use free lists in place of sync.Pool.

This also cleans up the code some, and makes things like Release, etc unnecessary.
Ugorji Nwoke hai 6 anos
pai
achega
1deb530697
Modificáronse 11 ficheiros con 389 adicións e 264 borrados
  1. 3 3
      codec/binc.go
  2. 5 4
      codec/codec_test.go
  3. 16 14
      codec/decode.go
  4. 32 24
      codec/encode.go
  5. 95 8
      codec/helper.go
  6. 1 1
      codec/helper_unsafe.go
  7. 49 47
      codec/json.go
  8. 3 3
      codec/msgpack.go
  9. 137 132
      codec/reader.go
  10. 2 3
      codec/simple.go
  11. 46 25
      codec/writer.go

+ 3 - 3
codec/binc.go

@@ -216,9 +216,9 @@ func (e *bincEncDriver) encUint(bd byte, pos bool, v uint64) {
 
 func (e *bincEncDriver) EncodeExt(v interface{}, xtag uint64, ext Ext) {
 	var bs []byte
-	var bufp bytesBufPooler
+	// var bufp bytesBufPooler
 	if ext == SelfExt {
-		bs = bufp.get(1024)[:0]
+		bs = e.e.blist.get(1024)[:0] // bufp.get(1024)[:0]
 		e.e.sideEncode(v, &bs)
 	} else {
 		bs = ext.WriteExt(v)
@@ -230,7 +230,7 @@ func (e *bincEncDriver) EncodeExt(v interface{}, xtag uint64, ext Ext) {
 	e.encodeExtPreamble(uint8(xtag), len(bs))
 	e.w.writeb(bs)
 	if ext == SelfExt {
-		bufp.end()
+		e.e.blist.put(bs) // bufp.end()
 	}
 }
 

+ 5 - 4
codec/codec_test.go

@@ -3028,7 +3028,8 @@ func doTestBufioDecReader(t *testing.T, bufsize int) {
 	// fmt.Printf("s: %s\n", s)
 	var r = strings.NewReader(s)
 	var br bufioDecReader
-	br.reset(r, bufsize)
+	var blist bytesFreelist
+	br.reset(r, bufsize, &blist)
 	b, err := ioutil.ReadAll(br.r)
 	if err != nil {
 		panic(err)
@@ -3044,7 +3045,7 @@ func doTestBufioDecReader(t *testing.T, bufsize int) {
 	// readUntil: see: 56789
 	var out []byte
 	var token byte
-	br.reset(strings.NewReader(s), bufsizehalf)
+	br.reset(strings.NewReader(s), bufsizehalf, &blist)
 	// println()
 	for _, v2 := range [...]string{
 		`01234'`,
@@ -3056,7 +3057,7 @@ func doTestBufioDecReader(t *testing.T, bufsize int) {
 		testDeepEqualErr(string(out), v2, t, "-")
 		// fmt.Printf("readUntil: out: `%s`\n", out)
 	}
-	br.reset(strings.NewReader(s), bufsizehalf)
+	br.reset(strings.NewReader(s), bufsizehalf, &blist)
 	// println()
 	for range [4]struct{}{} {
 		out = br.readTo(&jsonNumSet)
@@ -3073,7 +3074,7 @@ func doTestBufioDecReader(t *testing.T, bufsize int) {
 		// fmt.Printf("readUntil: out: `%s`\n", out)
 		br.unreadn1()
 	}
-	br.reset(strings.NewReader(s), bufsizehalf)
+	br.reset(strings.NewReader(s), bufsizehalf, &blist)
 	// println()
 	for range [4]struct{}{} {
 		out = br.readUntil(' ')

+ 16 - 14
codec/decode.go

@@ -1375,6 +1375,8 @@ type Decoder struct {
 	_ [1]byte                      // padding
 	b [decScratchByteArrayLen]byte // scratch buffer, used by Decoder and xxxDecDrivers
 
+	blist bytesFreelist
+
 	// padding - false sharing help // modify 232 if Decoder struct changes.
 	// _ [cacheLineSize - 232%cacheLineSize]byte
 }
@@ -1466,7 +1468,7 @@ func (d *Decoder) Reset(r io.Reader) {
 		if d.bi == nil {
 			d.bi = new(bufioDecReader)
 		}
-		d.bi.reset(r, d.h.ReaderBufferSize)
+		d.bi.reset(r, d.h.ReaderBufferSize, &d.blist)
 		// d.r = d.bi
 		// d.typ = entryTypeBufio
 		d.bufio = true
@@ -1476,7 +1478,7 @@ func (d *Decoder) Reset(r io.Reader) {
 		if d.ri == nil {
 			d.ri = new(ioDecReader)
 		}
-		d.ri.reset(r)
+		d.ri.reset(r, &d.blist)
 		// d.r = d.ri
 		// d.typ = entryTypeIo
 		d.bufio = false
@@ -1635,12 +1637,12 @@ func (d *Decoder) mustDecode(v interface{}) {
 	if d.calls == 0 {
 		d.d.atEndOfDecode()
 		// release
-		if !d.h.ExplicitRelease {
-			d.decReaderSwitch.release()
-			if d.jdec != nil {
-				d.jdec.release()
-			}
-		}
+		// if !d.h.ExplicitRelease {
+		// 	d.decReaderSwitch.release()
+		// 	// if d.jdec != nil {
+		// 	// 	d.jdec.release()
+		// 	// }
+		// }
 	}
 }
 
@@ -1665,12 +1667,12 @@ func (d *Decoder) finalize() {
 //
 // By default, Release() is automatically called unless the option ExplicitRelease is set.
 func (d *Decoder) Release() {
-	if d.bi != nil {
-		d.bi.release()
-	}
-	if d.jdec != nil {
-		d.jdec.release()
-	}
+	// if d.bi != nil {
+	// 	d.bi.release()
+	// }
+	// if d.jdec != nil {
+	// 	d.jdec.release()
+	// }
 	// d.decNakedPooler.end()
 }
 

+ 32 - 24
codec/encode.go

@@ -525,8 +525,8 @@ func (e *Encoder) kStruct(f *codecFnInfo, rv reflect.Value) {
 	// enough to reduce thread contention.
 
 	// fmt.Printf(">>>>>>>>>>>>>> encode.kStruct: newlen: %d\n", newlen)
-	var spool sfiRvPooler
-	var fkvs = spool.get(newlen)
+	// var spool sfiRvPooler
+	var fkvs = e.slist.get(newlen)
 
 	recur := e.h.RecursiveEmptyCheck
 	sfn := structFieldNode{v: rv, update: false}
@@ -602,7 +602,8 @@ func (e *Encoder) kStruct(f *codecFnInfo, rv reflect.Value) {
 	// do not use defer. Instead, use explicit pool return at end of function.
 	// defer has a cost we are trying to avoid.
 	// If there is a panic and these slices are not returned, it is ok.
-	spool.end()
+	// spool.end()
+	e.slist.put(fkvs)
 }
 
 func (e *Encoder) kMap(f *codecFnInfo, rv reflect.Value) {
@@ -800,8 +801,10 @@ func (e *Encoder) kMapCanonical(rtkey, rtval reflect.Type, rv, rvv reflect.Value
 	default:
 		// out-of-band
 		// first encode each key to a []byte first, then sort them, then record
-		var bufp bytesBufPooler
-		var mksv []byte = bufp.get(len(mks) * 16)[:0]
+		// var bufp bytesBufPooler
+		// var mksv []byte = bufp.get(len(mks) * 16)[:0]
+		// var mksv []byte = make([]byte, 0, len(mks)*16)
+		var mksv []byte = e.blist.get(len(mks) * 16)[:0]
 		e2 := NewEncoderBytes(&mksv, e.hh)
 		mksbv := make([]bytesRv, len(mks))
 		for i, k := range mks {
@@ -818,7 +821,8 @@ func (e *Encoder) kMapCanonical(rtkey, rtval reflect.Type, rv, rvv reflect.Value
 			e.mapElemValue()
 			e.encodeValue(mapGet(rv, mksbv[j].r, rvv), valFn)
 		}
-		bufp.end()
+		// bufp.end()
+		e.blist.put(mksv)
 	}
 }
 
@@ -858,7 +862,11 @@ type Encoder struct {
 
 	b [(5 * 8)]byte // for encoding chan byte, (non-addressable) [N]byte, etc
 
+	slist sfiRvFreelist
+	blist bytesFreelist
+
 	// ---- cpu cache line boundary?
+
 	// b [scratchByteArrayLen]byte
 	// _ [cacheLineSize - scratchByteArrayLen]byte // padding
 	// b [cacheLineSize - (8 * 0)]byte // used for encoding a chan or (non-addressable) array of bytes
@@ -959,7 +967,7 @@ func (e *Encoder) Reset(w io.Writer) {
 	// 	e.wi.reset(w)
 	// 	e.typ = entryTypeIo
 	// }
-	e.wf.reset(w, e.h.WriterBufferSize)
+	e.wf.reset(w, e.h.WriterBufferSize, &e.blist)
 	// e.typ = entryTypeBufio
 
 	// e.w = e.wi
@@ -1108,25 +1116,25 @@ func (e *Encoder) MustEncode(v interface{}) {
 }
 
 func (e *Encoder) mustEncode(v interface{}) {
-	if e.wf == nil {
-		e.encode(v)
-		e.e.atEndOfEncode()
-		e.w().end()
-		return
-	}
+	// if e.wf == nil {
+	// 	e.encode(v)
+	// 	e.e.atEndOfEncode()
+	// 	e.w().end()
+	// 	return
+	// }
 
-	if e.wf.buf == nil {
-		e.wf.buf = e.wf.bytesBufPooler.get(e.wf.sz)
-		e.wf.buf = e.wf.buf[:cap(e.wf.buf)]
-	}
-	e.wf.calls++
+	// if e.wf.buf == nil {
+	// 	e.wf.buf = e.wf.bytesBufPooler.get(e.wf.sz)
+	// 	e.wf.buf = e.wf.buf[:cap(e.wf.buf)]
+	// }
+	e.calls++
 	e.encode(v)
-	e.wf.calls--
-	if e.wf.calls == 0 {
+	e.calls--
+	if e.calls == 0 {
 		e.e.atEndOfEncode()
 		e.w().end()
 		if !e.h.ExplicitRelease {
-			e.wf.release()
+			e.Release()
 		}
 	}
 }
@@ -1151,9 +1159,9 @@ func (e *Encoder) finalize() {
 // It is important to call Release() when done with an Encoder, so those resources
 // are released instantly for use by subsequently created Encoders.
 func (e *Encoder) Release() {
-	if e.wf != nil {
-		e.wf.release()
-	}
+	// if e.wf != nil {
+	// 	e.wf.release()
+	// }
 }
 
 func (e *Encoder) encode(iv interface{}) {

+ 95 - 8
codec/helper.go

@@ -2772,6 +2772,8 @@ func (must) Float(s float64, err error) float64 {
 
 // -------------------
 
+/*
+
 type pooler struct {
 	pool  *sync.Pool
 	poolv interface{}
@@ -2961,18 +2963,18 @@ NEW:
 
 // ----------------
 
-type bytesBufPoolerPlus struct {
+type bytesBufSlicePooler struct {
 	bytesBufPooler
 	buf []byte
 }
 
-func (z *bytesBufPoolerPlus) ensureExtraCap(num int) {
+func (z *bytesBufSlicePooler) ensureExtraCap(num int) {
 	if cap(z.buf) < len(z.buf)+num {
 		z.ensureCap(len(z.buf) + num)
 	}
 }
 
-func (z *bytesBufPoolerPlus) ensureCap(newcap int) {
+func (z *bytesBufSlicePooler) ensureCap(newcap int) {
 	if cap(z.buf) >= newcap {
 		return
 	}
@@ -3000,26 +3002,26 @@ func (z *bytesBufPoolerPlus) ensureCap(newcap int) {
 	z.bytesBufPooler = bp2
 }
 
-func (z *bytesBufPoolerPlus) get(length int) {
+func (z *bytesBufSlicePooler) get(length int) {
 	z.buf = z.bytesBufPooler.get(length)
 }
 
-func (z *bytesBufPoolerPlus) append(b byte) {
+func (z *bytesBufSlicePooler) append(b byte) {
 	z.ensureExtraCap(1)
 	z.buf = append(z.buf, b)
 }
 
-func (z *bytesBufPoolerPlus) appends(b []byte) {
+func (z *bytesBufSlicePooler) appends(b []byte) {
 	z.ensureExtraCap(len(b))
 	z.buf = append(z.buf, b...)
 }
 
-func (z *bytesBufPoolerPlus) end() {
+func (z *bytesBufSlicePooler) end() {
 	z.bytesBufPooler.end()
 	z.buf = nil
 }
 
-func (z *bytesBufPoolerPlus) resetBuf() {
+func (z *bytesBufSlicePooler) resetBuf() {
 	if z.buf != nil {
 		z.buf = z.buf[:0]
 	}
@@ -3055,6 +3057,91 @@ func (z *sfiRvPooler) get(newlen int) (fkvs []sfiRv) {
 	return
 }
 
+*/
+
+// ----------------
+
+func freelistCapacity(length int) (capacity int) {
+	for capacity = 8; capacity < length; capacity *= 2 {
+	}
+	return
+}
+
+type bytesFreelist [][]byte
+
+func (x *bytesFreelist) get(length int) (out []byte) {
+	var j int = -1
+	for i := 0; i < len(*x); i++ {
+		if cap((*x)[i]) >= length && (j == -1 || cap((*x)[j]) > cap((*x)[i])) {
+			j = i
+		}
+	}
+	if j == -1 {
+		return make([]byte, length, freelistCapacity(length))
+	}
+	out = (*x)[j][:length]
+	(*x)[j] = nil
+	for i := 0; i < len(out); i++ {
+		out[i] = 0
+	}
+	return
+}
+
+func (x *bytesFreelist) put(v []byte) {
+	if len(v) == 0 {
+		return
+	}
+	for i := 0; i < len(*x); i++ {
+		if cap((*x)[i]) == 0 {
+			(*x)[i] = v
+			return
+		}
+	}
+	*x = append(*x, v)
+}
+
+func (x *bytesFreelist) check(v []byte, length int) (out []byte) {
+	if cap(v) < length {
+		x.put(v)
+		return x.get(length)
+	}
+	return v[:length]
+}
+
+// -------------------------
+
+type sfiRvFreelist [][]sfiRv
+
+func (x *sfiRvFreelist) get(length int) (out []sfiRv) {
+	var j int = -1
+	for i := 0; i < len(*x); i++ {
+		if cap((*x)[i]) >= length && (j == -1 || cap((*x)[j]) > cap((*x)[i])) {
+			j = i
+		}
+	}
+	if j == -1 {
+		return make([]sfiRv, length, freelistCapacity(length))
+	}
+	out = (*x)[j][:length]
+	(*x)[j] = nil
+	for i := 0; i < len(out); i++ {
+		out[i] = sfiRv{}
+	}
+	return
+}
+
+func (x *sfiRvFreelist) put(v []sfiRv) {
+	for i := 0; i < len(*x); i++ {
+		if cap((*x)[i]) == 0 {
+			(*x)[i] = v
+			return
+		}
+	}
+	*x = append(*x, v)
+}
+
+// -----------
+
 // xdebugf printf. the message in red on the terminal.
 // Use it in place of fmt.Printf (which it calls internally)
 func xdebugf(pattern string, args ...interface{}) {

+ 1 - 1
codec/helper_unsafe.go

@@ -699,7 +699,7 @@ type unsafeMapIter struct {
 // allocation of unsafeMapIter.
 // Options are to try to alloc on stack, or pool it.
 // Easiest to pool it.
-const unsafeMapIterUsePool = true
+const unsafeMapIterUsePool = false
 
 var unsafeMapIterPool = sync.Pool{
 	New: func() interface{} { return new(unsafeMapIter) },

+ 49 - 47
codec/json.go

@@ -607,8 +607,8 @@ type jsonDecDriver struct {
 	// ---- writable fields during execution --- *try* to keep in sep cache line
 	// bs []byte // scratch - for parsing strings, bytes
 
-	bp bytesBufPoolerPlus
-	se interfaceExtWrapper
+	buf []byte
+	se  interfaceExtWrapper
 
 	// _ [4]uint64 // padding
 
@@ -1015,7 +1015,7 @@ func (d *jsonDecDriver) DecodeBytes(bs []byte, zerocopy bool) (bsOut []byte) {
 
 	// base64 encodes []byte{} as "", and we encode nil []byte as null.
 	// Consequently, base64 should decode null as a nil []byte, and "" as an empty []byte{}.
-	// appendStringAsBytes returns a zero-len slice for both, so as not to reset d.bp.buf.
+	// appendStringAsBytes returns a zero-len slice for both, so as not to reset d.buf.
 	// However, it sets a fnil field to true, so we can check if a null was found.
 
 	// d.appendStringAsBytes()
@@ -1035,16 +1035,18 @@ func (d *jsonDecDriver) DecodeBytes(bs []byte, zerocopy bool) (bsOut []byte) {
 	} else if slen <= cap(bs) {
 		bsOut = bs[:slen]
 	} else if zerocopy {
-		// if d.bp.buf == nil {
-		// 	d.bp.buf = d.bp.get(slen)
+		// if d.buf == nil {
+		// 	d.buf = d.bp.get(slen)
+		// }
+		d.buf = d.d.blist.check(d.buf, slen)
+		bsOut = d.buf
+		// if slen <= cap(d.buf) {
+		// 	bsOut = d.buf[:slen]
+		// } else {
+		// 	d.bp.get(slen)
+		// 	bsOut = d.buf
+		// 	// bsOut = make([]byte, slen) // TODO: should i check pool? how to return it back?
 		// }
-		if slen <= cap(d.bp.buf) {
-			bsOut = d.bp.buf[:slen]
-		} else {
-			d.bp.get(slen)
-			bsOut = d.bp.buf
-			// bsOut = make([]byte, slen) // TODO: should i check pool? how to return it back?
-		}
 	} else {
 		bsOut = make([]byte, slen)
 	}
@@ -1106,8 +1108,8 @@ func (d *jsonDecDriver) readString() (bs []byte) {
 func (d *jsonDecDriver) appendStringAsBytes() (bs []byte) {
 	// xdebug2f("appendStringAsBytes: found: '%c'", d.tok)
 
-	if d.bp.buf != nil {
-		d.bp.buf = d.bp.buf[:0]
+	if d.buf != nil {
+		d.buf = d.buf[:0]
 	}
 	d.tok = 0
 
@@ -1126,9 +1128,9 @@ func (d *jsonDecDriver) appendStringAsBytes() (bs []byte) {
 	var i, cursor uint
 	for {
 		if i == cslen {
-			d.bp.appends(cs[cursor:])
+			// d.bp.appends(cs[cursor:])
 			// d.bp.ensureExtraCap(int(cslen - cursor))
-			// d.bp.buf = append(d.bp.buf, cs[cursor:]...)
+			d.buf = append(d.buf, cs[cursor:]...)
 			cs = d.r.readUntil('"')
 			// xdebugf("appendStringAsBytes: len: %d, cs: %s", len(cs), cs)
 			cslen = uint(len(cs))
@@ -1136,10 +1138,10 @@ func (d *jsonDecDriver) appendStringAsBytes() (bs []byte) {
 		}
 		c = cs[i]
 		if c == '"' {
-			if len(d.bp.buf) > 0 {
-				d.bp.appends(cs[cursor:i])
+			if len(d.buf) > 0 {
+				// d.bp.appends(cs[cursor:i])
 				// d.bp.ensureExtraCap(int(i - cursor))
-				// d.bp.buf = append(d.bp.buf, cs[cursor:i]...)
+				d.buf = append(d.buf, cs[cursor:i]...)
 			}
 			break
 		}
@@ -1147,25 +1149,25 @@ func (d *jsonDecDriver) appendStringAsBytes() (bs []byte) {
 			i++
 			continue
 		}
-		d.bp.appends(cs[cursor:i])
+		// d.bp.appends(cs[cursor:i])
 		// d.bp.ensureExtraCap(int(i - cursor))
-		// d.bp.buf = append(d.bp.buf, cs[cursor:i]...)
-		d.bp.ensureExtraCap(4) // NOTE: 1 is sufficient, but say 4 for now
+		d.buf = append(d.buf, cs[cursor:i]...)
+		// d.bp.ensureExtraCap(4) // NOTE: 1 is sufficient, but say 4 for now
 		i++
 		c = cs[i]
 		switch c {
 		case '"', '\\', '/', '\'':
-			d.bp.buf = append(d.bp.buf, c)
+			d.buf = append(d.buf, c)
 		case 'b':
-			d.bp.buf = append(d.bp.buf, '\b')
+			d.buf = append(d.buf, '\b')
 		case 'f':
-			d.bp.buf = append(d.bp.buf, '\f')
+			d.buf = append(d.buf, '\f')
 		case 'n':
-			d.bp.buf = append(d.bp.buf, '\n')
+			d.buf = append(d.buf, '\n')
 		case 'r':
-			d.bp.buf = append(d.bp.buf, '\r')
+			d.buf = append(d.buf, '\r')
 		case 't':
-			d.bp.buf = append(d.bp.buf, '\t')
+			d.buf = append(d.buf, '\t')
 		case 'u':
 			var r rune
 			var rr uint32
@@ -1219,15 +1221,14 @@ func (d *jsonDecDriver) appendStringAsBytes() (bs []byte) {
 			}
 		encode_rune:
 			w2 := utf8.EncodeRune(d.bstr[:], r)
-			d.bp.appends(d.bstr[:w2])
-			// d.bp.buf = append(d.bp.buf, d.bstr[:w2]...)
+			d.buf = append(d.buf, d.bstr[:w2]...)
 		default:
 			d.d.errorf("unsupported escaped value: %c", c)
 		}
 		i++
 		cursor = i
 	}
-	if len(d.bp.buf) == 0 {
+	if len(d.buf) == 0 {
 		// return cs[:len(cs)-1]
 		// returning cs was failing for bufio, as it seems bufio needs the buffer for other things.
 		// only return cs if bytesDecReader
@@ -1235,14 +1236,14 @@ func (d *jsonDecDriver) appendStringAsBytes() (bs []byte) {
 		if d.d.bytes {
 			return cs
 		}
-		d.bp.ensureExtraCap(len(cs))
-		d.bp.buf = d.bp.buf[:len(cs)]
-		copy(d.bp.buf, cs)
-		// xdebugf("cs: '%s', d.bp.buf: '%s'", cs, d.bp.buf)
-		return d.bp.buf
+		// d.bp.ensureExtraCap(len(cs))
+		d.buf = d.d.blist.check(d.buf, len(cs))
+		copy(d.buf, cs)
+		// xdebugf("cs: '%s', d.buf: '%s'", cs, d.buf)
+		return d.buf
 	}
-	// xdebug2f("returning d.bp.buf: %s", d.bp.buf)
-	return d.bp.buf
+	// xdebug2f("returning d.buf: %s", d.buf)
+	return d.buf
 }
 
 func (d *jsonDecDriver) nakedNum(z *decNaked, bs []byte) (err error) {
@@ -1498,21 +1499,22 @@ func (e *jsonEncDriver) reset() {
 func (d *jsonDecDriver) reset() {
 	d.r = d.d.r()
 	d.se.InterfaceExt = d.h.RawBytesExt
-	if d.bp.buf != nil {
-		d.bp.buf = d.bp.buf[:0]
-	}
+	d.buf = d.d.blist.check(d.buf, 256)[:0]
+	// if d.buf != nil {
+	// 	d.buf = d.buf[:0]
+	// }
 	d.tok = 0
 	d.fnil = false
 }
 
 func (d *jsonDecDriver) atEndOfDecode() {}
 
-func (d *jsonDecDriver) release() {
-	l := d.bp.capacity()
-	if l > 0 {
-		d.bp.end()
-	}
-}
+// func (d *jsonDecDriver) release() {
+// 	l := d.bp.capacity()
+// 	if l > 0 {
+// 		d.bp.end()
+// 	}
+// }
 
 // jsonFloatStrconvFmtPrec ...
 //

+ 3 - 3
codec/msgpack.go

@@ -319,9 +319,9 @@ func (e *msgpackEncDriver) EncodeTime(t time.Time) {
 
 func (e *msgpackEncDriver) EncodeExt(v interface{}, xtag uint64, ext Ext) {
 	var bs []byte
-	var bufp bytesBufPooler
+	// var bufp bytesBufPooler
 	if ext == SelfExt {
-		bs = bufp.get(1024)[:0]
+		bs = e.e.blist.get(1024)[:0] // bufp.get(1024)[:0]
 		e.e.sideEncode(v, &bs)
 	} else {
 		bs = ext.WriteExt(v)
@@ -337,7 +337,7 @@ func (e *msgpackEncDriver) EncodeExt(v interface{}, xtag uint64, ext Ext) {
 		e.EncodeStringBytesRaw(bs)
 	}
 	if ext == SelfExt {
-		bufp.end()
+		e.e.blist.put(bs) // bufp.end()
 	}
 }
 

+ 137 - 132
codec/reader.go

@@ -74,16 +74,18 @@ type ioDecReaderCommon struct {
 	_   bool
 	b   [4]byte // tiny buffer for reading single bytes
 
-	tr   bytesBufPoolerPlus // buffer for tracking bytes
-	bufr bytesBufPoolerPlus // buffer for readTo/readUntil
+	blist *bytesFreelist
+
+	tr   []byte // buffer for tracking bytes
+	bufr []byte // buffer for readTo/readUntil
 }
 
-func (z *ioDecReaderCommon) reset(r io.Reader) {
+func (z *ioDecReaderCommon) reset(r io.Reader, blist *bytesFreelist) {
+	z.blist = blist
 	z.r = r
 	z.ls = unreadByteUndefined
 	z.l, z.n = 0, 0
 	z.trb = false
-	z.tr.end()
 }
 
 func (z *ioDecReaderCommon) numread() uint {
@@ -91,25 +93,27 @@ func (z *ioDecReaderCommon) numread() uint {
 }
 
 func (z *ioDecReaderCommon) track() {
-	z.tr.ensureCap(1)
-	z.tr.buf = z.tr.buf[:0]
+	z.tr = z.blist.check(z.tr, 256)[:0]
 	z.trb = true
 }
 
 func (z *ioDecReaderCommon) stopTrack() (bs []byte) {
 	z.trb = false
-	return z.tr.buf
+	return z.tr
 }
 
-func (z *ioDecReaderCommon) resetBufr() {
-	z.bufr.ensureCap(1)
-	z.bufr.buf = z.bufr.buf[:0]
-}
+// func (z *ioDecReaderCommon) resetBufr() {
+// 	if cap(z.bufr) < 128 {
+// 		blist.put(z.bufr)
+// 		z.bufr = blist.get(128)
+// 	}
+// 	z.bufr = z.bufr[:0]
+// }
 
-func (z *ioDecReaderCommon) release() {
-	z.tr.end()
-	z.bufr.end()
-}
+// func (z *ioDecReaderCommon) release() {
+// 	z.tr.end()
+// 	z.bufr.end()
+// }
 
 // ------------------------------------------
 
@@ -126,8 +130,8 @@ type ioDecReader struct {
 	// _ [1]uint64                 // padding
 }
 
-func (z *ioDecReader) reset(r io.Reader) {
-	z.ioDecReaderCommon.reset(r)
+func (z *ioDecReader) reset(r io.Reader, blist *bytesFreelist) {
+	z.ioDecReaderCommon.reset(r, blist)
 
 	var ok bool
 	z.rr = r
@@ -206,7 +210,7 @@ func (z *ioDecReader) readx(n uint) (bs []byte) {
 	}
 	z.n += uint(len(bs))
 	if z.trb {
-		z.tr.appends(bs)
+		z.tr = append(z.tr, bs...)
 	}
 	return
 }
@@ -220,7 +224,7 @@ func (z *ioDecReader) readb(bs []byte) {
 	}
 	z.n += uint(len(bs))
 	if z.trb {
-		z.tr.appends(bs)
+		z.tr = append(z.tr, bs...)
 	}
 }
 
@@ -229,7 +233,7 @@ func (z *ioDecReader) readn1eof() (b uint8, eof bool) {
 	if err == nil {
 		z.n++
 		if z.trb {
-			z.tr.append(b)
+			z.tr = append(z.tr, b)
 		}
 	} else if err == io.EOF {
 		eof = true
@@ -244,7 +248,7 @@ func (z *ioDecReader) readn1() (b uint8) {
 	if err == nil {
 		z.n++
 		if z.trb {
-			z.tr.append(b)
+			z.tr = append(z.tr, b)
 		}
 		return
 	}
@@ -289,19 +293,19 @@ func (z *ioDecReader) readTo(accept *bitset256) []byte {
 	// 		return
 	// 	}
 	// }
-	z.resetBufr()
+	z.bufr = z.blist.check(z.bufr, 256)[:0]
 LOOP:
 	token, eof := z.readn1eof()
 	if eof {
-		return z.bufr.buf
+		return z.bufr
 	}
 	if accept.isset(token) {
 		// out = append(out, token)
-		z.bufr.append(token)
+		z.bufr = append(z.bufr, token)
 		goto LOOP
 	}
 	z.unreadn1()
-	return z.bufr.buf
+	return z.bufr
 }
 
 func (z *ioDecReader) readUntil(stop byte) []byte {
@@ -315,15 +319,15 @@ func (z *ioDecReader) readUntil(stop byte) []byte {
 	// 		return
 	// 	}
 	// }
-	z.resetBufr()
+	z.bufr = z.blist.check(z.bufr, 256)[:0]
 LOOP:
 	token, eof := z.readn1eof()
 	if eof {
 		panic(io.EOF)
 	}
-	z.bufr.append(token)
+	z.bufr = append(z.bufr, token)
 	if token == stop {
-		return z.bufr.buf
+		return z.bufr
 	}
 	goto LOOP
 }
@@ -336,8 +340,8 @@ func (z *ioDecReader) unreadn1() {
 	}
 	z.n--
 	if z.trb {
-		if l := len(z.tr.buf) - 1; l >= 0 {
-			z.tr.buf = z.tr.buf[:l]
+		if l := len(z.tr) - 1; l >= 0 {
+			z.tr = z.tr[:l]
 		}
 	}
 }
@@ -348,77 +352,78 @@ type bufioDecReader struct {
 	ioDecReaderCommon
 	// _ uint64 // padding (cache-aligned)
 
-	c uint // cursor
-	// buf []byte
+	c   uint // cursor
+	buf []byte
 
-	bp bytesBufPoolerPlus
+	// bp bytesBufSlicePooler
 
 	// err error
 }
 
-func (z *bufioDecReader) reset(r io.Reader, bufsize int) {
-	z.ioDecReaderCommon.reset(r)
+func (z *bufioDecReader) reset(r io.Reader, bufsize int, blist *bytesFreelist) {
+	z.ioDecReaderCommon.reset(r, blist)
 	z.c = 0
-	if cap(z.bp.buf) < bufsize {
-		z.bp.get(bufsize)
+	if cap(z.buf) < bufsize {
+		// z.bp.get(bufsize)
 		// z.buf = make([]byte, 0, bufsize)
+		z.buf = blist.get(bufsize)
 	}
-	z.bp.buf = z.bp.buf[:0]
+	z.buf = z.buf[:0]
 }
 
-func (z *bufioDecReader) release() {
-	z.ioDecReaderCommon.release()
-	z.bp.end()
-}
+// func (z *bufioDecReader) release() {
+// 	z.ioDecReaderCommon.release()
+// 	// z.bp.end()
+// }
 
 func (z *bufioDecReader) readb(p []byte) {
-	var n = uint(copy(p, z.bp.buf[z.c:]))
+	var n = uint(copy(p, z.buf[z.c:]))
 	z.n += n
 	z.c += n
 	if len(p) == int(n) {
 		if z.trb {
-			z.tr.appends(p)
+			z.tr = append(z.tr, p...)
 		}
 	} else {
 		z.readbFill(p, n)
 	}
 }
 
-//go:noinline - fallback when z.bp.buf is consumed
+//go:noinline - fallback when z.buf is consumed
 func (z *bufioDecReader) readbFill(p0 []byte, n uint) {
-	// at this point, there's nothing in z.bp.buf to read (z.bp.buf is fully consumed)
+	// at this point, there's nothing in z.buf to read (z.buf is fully consumed)
 	p := p0[n:]
 	var n2 uint
 	var err error
-	if len(p) > cap(z.bp.buf) {
+	if len(p) > cap(z.buf) {
 		n2, err = decReadFull(z.r, p)
 		if err != nil {
 			panic(err)
 		}
 		n += n2
 		z.n += n2
-		// always keep last byte in z.bp.buf
-		z.bp.buf = z.bp.buf[:1]
-		z.bp.buf[0] = p[len(p)-1]
+		// always keep last byte in z.buf
+		z.buf = z.buf[:1]
+		z.buf[0] = p[len(p)-1]
 		z.c = 1
 		if z.trb {
-			z.tr.appends(p0[:n])
+			z.tr = append(z.tr, p0[:n]...)
 		}
 		return
 	}
-	// z.c is now 0, and len(p) <= cap(z.bp.buf)
+	// z.c is now 0, and len(p) <= cap(z.buf)
 LOOP:
 	// for len(p) > 0 && z.err == nil {
 	if len(p) > 0 {
-		z.bp.buf = z.bp.buf[0:cap(z.bp.buf)]
+		z.buf = z.buf[0:cap(z.buf)]
 		var n1 int
-		n1, err = z.r.Read(z.bp.buf)
+		n1, err = z.r.Read(z.buf)
 		n2 = uint(n1)
 		if n2 == 0 && err != nil {
 			panic(err)
 		}
-		z.bp.buf = z.bp.buf[:n2]
-		n2 = uint(copy(p, z.bp.buf))
+		z.buf = z.buf[:n2]
+		n2 = uint(copy(p, z.buf))
 		z.c = n2
 		n += n2
 		z.n += n2
@@ -426,25 +431,25 @@ LOOP:
 		goto LOOP
 	}
 	if z.c == 0 {
-		z.bp.buf = z.bp.buf[:1]
-		z.bp.buf[0] = p[len(p)-1]
+		z.buf = z.buf[:1]
+		z.buf[0] = p[len(p)-1]
 		z.c = 1
 	}
 	if z.trb {
-		z.tr.appends(p0[:n])
+		z.tr = append(z.tr, p0[:n]...)
 	}
 }
 
 func (z *bufioDecReader) readn1() (b byte) {
 	// fast-path, so we elide calling into Read() most of the time
-	if z.c < uint(len(z.bp.buf)) {
-		b = z.bp.buf[z.c]
+	if z.c < uint(len(z.buf)) {
+		b = z.buf[z.c]
 		z.c++
 		z.n++
 		if z.trb {
-			z.tr.append(b)
+			z.tr = append(z.tr, b)
 		}
-	} else { // meaning z.c == len(z.bp.buf) or greater ... so need to fill
+	} else { // meaning z.c == len(z.buf) or greater ... so need to fill
 		z.readbFill(z.b[:1], 0)
 		b = z.b[0]
 	}
@@ -458,24 +463,24 @@ func (z *bufioDecReader) unreadn1() {
 	z.c--
 	z.n--
 	if z.trb {
-		z.tr.buf = z.tr.buf[:len(z.tr.buf)-1]
+		z.tr = z.tr[:len(z.tr)-1]
 	}
 }
 
 func (z *bufioDecReader) readx(n uint) (bs []byte) {
 	if n == 0 {
 		// return
-	} else if z.c+n <= uint(len(z.bp.buf)) {
-		bs = z.bp.buf[z.c : z.c+n]
+	} else if z.c+n <= uint(len(z.buf)) {
+		bs = z.buf[z.c : z.c+n]
 		z.n += n
 		z.c += n
 		if z.trb {
-			z.tr.appends(bs)
+			z.tr = append(z.tr, bs...)
 		}
 	} else {
 		bs = make([]byte, n)
 		// n no longer used - can reuse
-		n = uint(copy(bs, z.bp.buf[z.c:]))
+		n = uint(copy(bs, z.buf[z.c:]))
 		z.n += n
 		z.c += n
 		z.readbFill(bs, n)
@@ -484,14 +489,14 @@ func (z *bufioDecReader) readx(n uint) (bs []byte) {
 }
 
 // func (z *bufioDecReader) doTrack(y uint) {
-// 	z.tr = append(z.tr, z.bp.buf[z.c:y]...) // cost=14???
+// 	z.tr = append(z.tr, z.buf[z.c:y]...) // cost=14???
 // }
 
 // func (z *bufioDecReader) skipLoopFn(i uint) {
 // 	z.n += (i - z.c) - 1
 // 	i++
 // 	if z.trb {
-// 		// z.tr = append(z.tr, z.bp.buf[z.c:i]...)
+// 		// z.tr = append(z.tr, z.buf[z.c:i]...)
 // 		z.doTrack(i)
 // 	}
 // 	z.c = i
@@ -500,8 +505,8 @@ func (z *bufioDecReader) readx(n uint) (bs []byte) {
 func (z *bufioDecReader) skip(accept *bitset256) (token byte) {
 	// token, _ = z.search(nil, accept, 0, 1); return
 
-	// for i := z.c; i < len(z.bp.buf); i++ {
-	// 	if token = z.bp.buf[i]; !accept.isset(token) {
+	// for i := z.c; i < len(z.buf); i++ {
+	// 	if token = z.buf[i]; !accept.isset(token) {
 	// 		z.skipLoopFn(i)
 	// 		return
 	// 	}
@@ -509,16 +514,16 @@ func (z *bufioDecReader) skip(accept *bitset256) (token byte) {
 
 	i := z.c
 LOOP:
-	if i < uint(len(z.bp.buf)) {
+	if i < uint(len(z.buf)) {
 		// inline z.skipLoopFn(i) and refactor, so cost is within inline budget
-		token = z.bp.buf[i]
+		token = z.buf[i]
 		i++
 		if accept.isset(token) {
 			goto LOOP
 		}
 		z.n += i - 2 - z.c
 		if z.trb {
-			z.tr.appends(z.bp.buf[z.c:i]) // z.doTrack(i)
+			z.tr = append(z.tr, z.buf[z.c:i]...) // z.doTrack(i)
 		}
 		z.c = i
 		return
@@ -528,21 +533,21 @@ LOOP:
 
 func (z *bufioDecReader) skipFill(accept *bitset256) (token byte) {
 	// defer func() { xdebugf("skipFill '%c'", token) }()
-	z.n += uint(len(z.bp.buf)) - z.c
+	z.n += uint(len(z.buf)) - z.c
 	if z.trb {
-		z.tr.appends(z.bp.buf[z.c:])
+		z.tr = append(z.tr, z.buf[z.c:]...)
 	}
 	var i, n2 int
 	var err error
 	for {
 		z.c = 0
-		z.bp.buf = z.bp.buf[0:cap(z.bp.buf)]
-		n2, err = z.r.Read(z.bp.buf)
+		z.buf = z.buf[0:cap(z.buf)]
+		n2, err = z.r.Read(z.buf)
 		if n2 == 0 && err != nil {
 			panic(err)
 		}
-		z.bp.buf = z.bp.buf[:n2]
-		for i, token = range z.bp.buf {
+		z.buf = z.buf[:n2]
+		for i, token = range z.buf {
 			if !accept.isset(token) {
 				z.n += (uint(i) - z.c) - 1
 				z.loopFn(uint(i + 1))
@@ -550,27 +555,27 @@ func (z *bufioDecReader) skipFill(accept *bitset256) (token byte) {
 			}
 		}
 		// for i := 0; i < n2; i++ {
-		// 	if token = z.bp.buf[i]; !accept.isset(token) {
+		// 	if token = z.buf[i]; !accept.isset(token) {
 		// 		z.skipLoopFn(i)
 		// 		return
 		// 	}
 		// }
 		z.n += uint(n2)
 		if z.trb {
-			z.tr.appends(z.bp.buf)
+			z.tr = append(z.tr, z.buf...)
 		}
 	}
 }
 
 // func (z *bufioDecReader) readLoopFn(i uint, out0 []byte) (out []byte) {
-// 	out = appendPool(out0, z.bp.buf[z.c:i]...)
+// 	out = appendPool(out0, z.buf[z.c:i]...)
 // 	z.loopFn(i)
 // 	return
 // }
 
 func (z *bufioDecReader) loopFn(i uint) {
 	if z.trb {
-		z.tr.appends(z.bp.buf[z.c:i]) // z.doTrack(i)
+		z.tr = append(z.tr, z.buf[z.c:i]...) // z.doTrack(i)
 	}
 	z.c = i
 }
@@ -585,22 +590,22 @@ func (z *bufioDecReader) readTo(accept *bitset256) (out []byte) {
 	// defer func() { xdebug2f("bufio: readTo: %s", out) }()
 	// _, out = z.search(in, accept, 0, 2); return
 
-	// for i := z.c; i < len(z.bp.buf); i++ {
-	// 	if !accept.isset(z.bp.buf[i]) {
+	// for i := z.c; i < len(z.buf); i++ {
+	// 	if !accept.isset(z.buf[i]) {
 	// 		return z.readToLoopFn(i, nil)
 	// 	}
 	// }
 
 	i := z.c
 LOOP:
-	if i < uint(len(z.bp.buf)) {
-		if !accept.isset(z.bp.buf[i]) {
+	if i < uint(len(z.buf)) {
+		if !accept.isset(z.buf[i]) {
 			// return z.readToLoopFn(i, nil)
 			// inline readToLoopFn here (for performance)
 			z.n += (i - z.c) - 1
-			out = z.bp.buf[z.c:i]
+			out = z.buf[z.c:i]
 			if z.trb {
-				z.tr.appends(z.bp.buf[z.c:i]) // z.doTrack(i)
+				z.tr = append(z.tr, z.buf[z.c:i]...) // z.doTrack(i)
 			}
 			z.c = i
 			return
@@ -612,42 +617,42 @@ LOOP:
 }
 
 func (z *bufioDecReader) readToFill(accept *bitset256) []byte {
-	z.resetBufr()
-	z.n += uint(len(z.bp.buf)) - z.c
-	z.bufr.appends(z.bp.buf[z.c:])
+	z.bufr = z.blist.check(z.bufr, 256)[:0]
+	z.n += uint(len(z.buf)) - z.c
+	z.bufr = append(z.bufr, z.buf[z.c:]...)
 	if z.trb {
-		z.tr.appends(z.bp.buf[z.c:])
+		z.tr = append(z.tr, z.buf[z.c:]...)
 	}
 	var n2 int
 	var err error
 	for {
 		z.c = 0
-		z.bp.buf = z.bp.buf[:cap(z.bp.buf)]
-		n2, err = z.r.Read(z.bp.buf)
+		z.buf = z.buf[:cap(z.buf)]
+		n2, err = z.r.Read(z.buf)
 		if n2 == 0 && err != nil {
 			if err == io.EOF {
-				return z.bufr.buf // readTo should read until it matches or end is reached
+				return z.bufr // readTo should read until it matches or end is reached
 			}
 			panic(err)
 		}
-		z.bp.buf = z.bp.buf[:n2]
-		for i, token := range z.bp.buf {
+		z.buf = z.buf[:n2]
+		for i, token := range z.buf {
 			if !accept.isset(token) {
 				z.n += (uint(i) - z.c) - 1
-				z.bufr.appends(z.bp.buf[z.c:i])
+				z.bufr = append(z.bufr, z.buf[z.c:i]...)
 				z.loopFn(uint(i))
-				return z.bufr.buf
+				return z.bufr
 			}
 		}
 		// for i := 0; i < n2; i++ {
-		// 	if !accept.isset(z.bp.buf[i]) {
+		// 	if !accept.isset(z.buf[i]) {
 		// 		return z.readToLoopFn(i, out)
 		// 	}
 		// }
-		z.bufr.appends(z.bp.buf)
+		z.bufr = append(z.bufr, z.buf...)
 		z.n += uint(n2)
 		if z.trb {
-			z.tr.appends(z.bp.buf)
+			z.tr = append(z.tr, z.buf...)
 		}
 	}
 }
@@ -661,23 +666,23 @@ func (z *bufioDecReader) readUntil(stop byte) (out []byte) {
 	// defer func() { xdebug2f("bufio: readUntil: %s", out) }()
 	// _, out = z.search(in, nil, stop, 4); return
 
-	// for i := z.c; i < len(z.bp.buf); i++ {
-	// 	if z.bp.buf[i] == stop {
+	// for i := z.c; i < len(z.buf); i++ {
+	// 	if z.buf[i] == stop {
 	// 		return z.readUntilLoopFn(i, nil)
 	// 	}
 	// }
 
 	i := z.c
 LOOP:
-	if i < uint(len(z.bp.buf)) {
-		if z.bp.buf[i] == stop {
+	if i < uint(len(z.buf)) {
+		if z.buf[i] == stop {
 			// inline readUntilLoopFn
 			// return z.readUntilLoopFn(i, nil)
 			z.n += (i - z.c) - 1
 			i++
-			out = z.bp.buf[z.c:i]
+			out = z.buf[z.c:i]
 			if z.trb {
-				z.tr.appends(z.bp.buf[z.c:i]) // z.doTrack(i)
+				z.tr = append(z.tr, z.buf[z.c:i]...) // z.doTrack(i)
 			}
 			z.c = i
 			return
@@ -689,38 +694,38 @@ LOOP:
 }
 
 func (z *bufioDecReader) readUntilFill(stop byte) []byte {
-	z.resetBufr()
-	z.n += uint(len(z.bp.buf)) - z.c
-	z.bufr.appends(z.bp.buf[z.c:])
+	z.bufr = z.blist.check(z.bufr, 256)[:0]
+	z.n += uint(len(z.buf)) - z.c
+	z.bufr = append(z.bufr, z.buf[z.c:]...)
 	if z.trb {
-		z.tr.appends(z.bp.buf[z.c:])
+		z.tr = append(z.tr, z.buf[z.c:]...)
 	}
 	for {
 		z.c = 0
-		z.bp.buf = z.bp.buf[0:cap(z.bp.buf)]
-		n1, err := z.r.Read(z.bp.buf)
+		z.buf = z.buf[0:cap(z.buf)]
+		n1, err := z.r.Read(z.buf)
 		if n1 == 0 && err != nil {
 			panic(err)
 		}
 		n2 := uint(n1)
-		z.bp.buf = z.bp.buf[:n2]
-		for i, token := range z.bp.buf {
+		z.buf = z.buf[:n2]
+		for i, token := range z.buf {
 			if token == stop {
 				z.n += (uint(i) - z.c) - 1
-				z.bufr.appends(z.bp.buf[z.c : i+1])
+				z.bufr = append(z.bufr, z.buf[z.c:i+1]...)
 				z.loopFn(uint(i + 1))
-				return z.bufr.buf
+				return z.bufr
 			}
 		}
 		// for i := 0; i < n2; i++ {
-		// 	if z.bp.buf[i] == stop {
+		// 	if z.buf[i] == stop {
 		// 		return z.readUntilLoopFn(i, out)
 		// 	}
 		// }
-		z.bufr.appends(z.bp.buf)
+		z.bufr = append(z.bufr, z.buf...)
 		z.n += n2
 		if z.trb {
-			z.tr.appends(z.bp.buf)
+			z.tr = append(z.tr, z.buf...)
 		}
 	}
 }
@@ -1098,14 +1103,14 @@ func (z *decReaderSwitch) readUntil(stop byte) (out []byte) {
 // This allows for the inlining of the common path when z.bytes=true.
 // Go 1.12+ supports inlining methods with up to 1 inlined function (or 2 if no other constructs).
 
-func (z *decReaderSwitch) release() {
-	if z.bytes {
-	} else if z.bufio {
-		z.bi.release()
-	} else {
-		z.ri.release()
-	}
-}
+// func (z *decReaderSwitch) release() {
+// 	if z.bytes {
+// 	} else if z.bufio {
+// 		z.bi.release()
+// 	} else {
+// 		z.ri.release()
+// 	}
+// }
 func (z *decReaderSwitch) numread() uint {
 	if z.bytes {
 		return z.rb.numread()

+ 2 - 3
codec/simple.go

@@ -128,9 +128,8 @@ func (e *simpleEncDriver) encLen(bd byte, length int) {
 
 func (e *simpleEncDriver) EncodeExt(v interface{}, xtag uint64, ext Ext) {
 	var bs []byte
-	var bufp bytesBufPooler
 	if ext == SelfExt {
-		bs = bufp.get(1024)[:0]
+		bs = e.e.blist.get(1024)[:0]
 		e.e.sideEncode(v, &bs)
 	} else {
 		bs = ext.WriteExt(v)
@@ -142,7 +141,7 @@ func (e *simpleEncDriver) EncodeExt(v interface{}, xtag uint64, ext Ext) {
 	e.encodeExtPreamble(uint8(xtag), len(bs))
 	e.w.writeb(bs)
 	if ext == SelfExt {
-		bufp.end()
+		e.e.blist.put(bs)
 	}
 }
 

+ 46 - 25
codec/writer.go

@@ -129,50 +129,68 @@ type bufioEncWriter struct {
 
 	n int
 
-	// Extensions can call Encode() within a current Encode() call.
-	// We need to know when the top level Encode() call returns,
-	// so we can decide whether to Release() or not.
-	calls uint16 // what depth in mustDecode are we in now.
+	// // Extensions can call Encode() within a current Encode() call.
+	// // We need to know when the top level Encode() call returns,
+	// // so we can decide whether to Release() or not.
+	// calls uint16 // what depth in mustDecode are we in now.
 
-	sz int // buf size
+	// sz int // buf size
 	// _ uint64 // padding (cache-aligned)
 
 	// ---- cache line
 
 	// write-most fields below
 
-	// less used fields
-	bytesBufPooler
+	// // less used fields
+	// bytesBufPooler
 
-	b [40]byte // scratch buffer and padding (cache-aligned)
+	b [16]byte // scratch buffer and padding (cache-aligned)
 	// a int
 	// b   [4]byte
 	// err
 }
 
-func (z *bufioEncWriter) reset(w io.Writer, bufsize int) {
+func (z *bufioEncWriter) reset(w io.Writer, bufsize int, blist *bytesFreelist) {
 	z.w = w
 	z.n = 0
-	z.calls = 0
+	// z.calls = 0
 	if bufsize <= 0 {
 		bufsize = defEncByteBufSize
 	}
-	z.sz = bufsize
-	if cap(z.buf) >= bufsize {
-		z.buf = z.buf[:cap(z.buf)]
-	} else if bufsize <= len(z.b) {
-		z.buf = z.b[:]
-	} else {
-		z.buf = z.bytesBufPooler.get(bufsize)
-		z.buf = z.buf[:cap(z.buf)]
-		// z.buf = make([]byte, bufsize)
+	// z.sz = bufsize
+	if cap(z.buf) < bufsize {
+		if len(z.buf) > 0 && &z.buf[0] != &z.b[0] {
+			blist.put(z.buf)
+		}
+		if len(z.b) > bufsize {
+			z.buf = z.b[:]
+		} else {
+			z.buf = blist.get(bufsize)
+		}
 	}
-}
-
-func (z *bufioEncWriter) release() {
-	z.buf = nil
-	z.bytesBufPooler.end()
-}
+	z.buf = z.buf[:cap(z.buf)]
+	// if bufsize <= cap(z.buf) {
+	// 	z.buf = z.buf[:cap(z.buf)]
+	// } else {
+	// } else if bufsize <= len(z.b) {
+	// 	if len(z.buf) > 0 && &z.buf[0] != &z.b[0] {
+	// 		blist.put(z.buf)
+	// 	}
+	// 	z.buf = z.b[:]
+	// } else {
+	// 	// z.buf = z.bytesBufPooler.get(bufsize)
+	// 	// z.buf = z.buf[:cap(z.buf)]
+	// 	if len(z.buf) > 0 && &z.buf[0] != &z.b[0] {
+	// 		blist.put(z.buf)
+	// 	}
+	// 	z.buf = blist.get(bufsize)
+	// }
+}
+
+// func (z *bufioEncWriter) release() {
+// 	z.buf = nil
+// 	z.bytesBufPooler.end()
+// }
 
 //go:noinline - flush only called intermittently
 func (z *bufioEncWriter) flushErr() (err error) {
@@ -315,6 +333,9 @@ type encWriterSwitch struct {
 	be    bool // is binary encoder?
 
 	c containerState
+
+	calls uint16
+
 	// _    [3]byte // padding
 	// _    [2]uint64 // padding
 	// _    uint64    // padding