Browse Source

codec: faster bitset and skip whitespace

- move bitset for numbers and whitespace to helper.go
- add bitset64 for cheaply checking flags
- add skipWhitespace and comment out skip
- make bitset256 a [256]bool, so that a membership check is a plain array
  index instead of relatively expensive shift-and-mask bitset operations
Ugorji Nwoke 6 years ago
parent
commit
f819262d36
7 changed files with 198 additions and 65 deletions
  1. 3 3
      codec/codec_test.go
  2. 4 1
      codec/decode.go
  3. 3 0
      codec/encode.go
  4. 66 14
      codec/helper.go
  5. 12 27
      codec/json.go
  6. 109 17
      codec/reader.go
  7. 1 3
      go.mod

+ 3 - 3
codec/codec_test.go

@@ -3062,13 +3062,13 @@ func doTestBufioDecReader(t *testing.T, bufsize int) {
 	br.reset(strings.NewReader(s), bufsizehalf, &blist)
 	// println()
 	for range [4]struct{}{} {
-		out = br.readTo(&jsonNumSet)
+		out = br.readTo(&numCharBitset)
 		testDeepEqualErr(string(out), `01234`, t, "-")
 		// fmt.Printf("readTo: out: `%s`\n", out)
 		out = br.readUntil('\'', true)
 		testDeepEqualErr(string(out), "'", t, "-")
 		// fmt.Printf("readUntil: out: `%s`\n", out)
-		out = br.readTo(&jsonNumSet)
+		out = br.readTo(&numCharBitset)
 		testDeepEqualErr(string(out), `56789`, t, "-")
 		// fmt.Printf("readTo: out: `%s`\n", out)
 		out = br.readUntil('0', true)
@@ -3082,7 +3082,7 @@ func doTestBufioDecReader(t *testing.T, bufsize int) {
 		out = br.readUntil(' ', true)
 		testDeepEqualErr(string(out), `01234'56789 `, t, "-")
 		// fmt.Printf("readUntil: out: |%s|\n", out)
-		token = br.skip(&jsonCharWhitespaceSet)
+		token = br.skipWhitespace() // br.skip(&whitespaceCharBitset)
 		testDeepEqualErr(token, byte('0'), t, "-")
 		// fmt.Printf("skip: token: '%c'\n", token)
 		br.unreadn1()

+ 4 - 1
codec/decode.go

@@ -1269,8 +1269,11 @@ func (d *Decoder) ResetBytes(in []byte) {
 	if in == nil {
 		return
 	}
-	d.bytes = true
 	d.bufio = false
+	d.bytes = true
+	// if d.rb == nil {
+	// 	d.rb = new(bytesDecReader)
+	// }
 	d.rb.reset(in)
 	d.resetCommon()
 }

+ 3 - 0
codec/encode.go

@@ -915,6 +915,9 @@ func (e *Encoder) ResetBytes(out *[]byte) {
 		in = make([]byte, defEncByteBufSize)
 	}
 	e.bytes = true
+	// if e.wb == nil {
+	// 	e.wb = new(bytesEncAppender)
+	// }
 	e.wb.reset(in, out)
 	e.resetCommon()
 }

+ 66 - 14
codec/helper.go

@@ -144,6 +144,7 @@ import (
 	"sync"
 	"sync/atomic"
 	"time"
+	"unicode/utf8"
 )
 
 const (
@@ -190,6 +191,11 @@ var (
 	refBitset    bitset32
 	isnilBitset  bitset32
 	scalarBitset bitset32
+
+	numCharBitset        bitset256
+	whitespaceCharBitset bitset256
+
+	whitespaceCharBitset64 bitset64
 )
 
 var (
@@ -235,6 +241,17 @@ func init() {
 		set(byte(reflect.Complex128)).
 		set(byte(reflect.String))
 
+	var i byte
+	for i = 0; i <= utf8.RuneSelf; i++ {
+		switch i {
+		case ' ', '\t', '\r', '\n':
+			whitespaceCharBitset.set(i)
+			whitespaceCharBitset64 = whitespaceCharBitset64.set(i)
+		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'e', 'E', '.', '+', '-':
+			numCharBitset.set(i)
+		}
+	}
+
 }
 
 type handleFlag uint8
@@ -2343,6 +2360,16 @@ func noFrac32(f float32) (v bool) {
 	return
 }
 
+func isWhitespace(v byte) bool {
+	// these are in order of speed below ...
+
+	// return v < 33
+	return v < 33 && whitespaceCharBitset64.isset(v)
+	// return v < 33 && (v == ' ' || v == '\n' || v == '\t' || v == '\r')
+	// return v == ' ' || v == '\n' || v == '\t' || v == '\r'
+	// return whitespaceCharBitset.isset(v)
+}
+
 // func noFrac(f float64) bool {
 // 	_, frac := math.Modf(float64(f))
 // 	return frac == 0
@@ -2451,23 +2478,32 @@ func (s *set) remove(v interface{}) (exists bool) {
 // given x > 0 and n > 0 and x is exactly 2^n, then pos/x === pos>>n AND pos%x === pos&(x-1).
 // consequently, pos/32 === pos>>5, pos/16 === pos>>4, pos/8 === pos>>3, pos%8 == pos&7
 
-type bitset256 [32]byte
+// type bitset256 [32]byte
 
-func (x *bitset256) check(pos byte) uint8 {
-	return x[pos>>3] & (1 << (pos & 7))
-}
-
-func (x *bitset256) isset(pos byte) bool {
-	return x.check(pos) != 0
-	// return x[pos>>3]&(1<<(pos&7)) != 0
-}
-
-// func (x *bitset256) issetv(pos byte) byte {
+// func (x *bitset256) set(pos byte) {
+// 	x[pos>>3] |= (1 << (pos & 7))
+// }
+// func (x *bitset256) check(pos byte) uint8 {
 // 	return x[pos>>3] & (1 << (pos & 7))
 // }
+// func (x *bitset256) isset(pos byte) bool {
+// 	return x.check(pos) != 0
+// 	// return x[pos>>3]&(1<<(pos&7)) != 0
+// }
+// func (x *bitset256) isnotset(pos byte) bool {
+// 	return x.check(pos) == 0
+// }
+
+type bitset256 [256]bool
 
 func (x *bitset256) set(pos byte) {
-	x[pos>>3] |= (1 << (pos & 7))
+	x[pos] = true
+}
+func (x *bitset256) isset(pos byte) bool {
+	return x[pos]
+}
+func (x *bitset256) isnotset(pos byte) bool {
+	return !x[pos]
 }
 
 type bitset32 uint32
@@ -2475,13 +2511,29 @@ type bitset32 uint32
 func (x bitset32) set(pos byte) bitset32 {
 	return x | (1 << pos)
 }
-
 func (x bitset32) check(pos byte) uint32 {
 	return uint32(x) & (1 << pos)
 }
 func (x bitset32) isset(pos byte) bool {
 	return x.check(pos) != 0
-	// return x&(1<<pos) != 0
+}
+func (x bitset32) isnotset(pos byte) bool {
+	return x.check(pos) == 0
+}
+
+type bitset64 uint64
+
+func (x bitset64) set(pos byte) bitset64 {
+	return x | (1 << pos)
+}
+func (x bitset64) check(pos byte) uint64 {
+	return uint64(x) & (1 << pos)
+}
+func (x bitset64) isset(pos byte) bool {
+	return x.check(pos) != 0
+}
+func (x bitset64) isnotset(pos byte) bool {
+	return x.check(pos) == 0
 }
 
 // func (x *bitset256) unset(pos byte) {

+ 12 - 27
codec/json.go

@@ -87,10 +87,8 @@ var (
 	// jsonTabs and jsonSpaces are used as caches for indents
 	jsonTabs, jsonSpaces [jsonSpacesOrTabsLen]byte
 
-	jsonCharHtmlSafeSet   bitset256
-	jsonCharSafeSet       bitset256
-	jsonCharWhitespaceSet bitset256
-	jsonNumSet            bitset256
+	jsonCharHtmlSafeSet bitset256
+	jsonCharSafeSet     bitset256
 )
 
 func init() {
@@ -113,14 +111,6 @@ func init() {
 			jsonCharHtmlSafeSet.set(i)
 		}
 	}
-	for i = 0; i <= utf8.RuneSelf; i++ {
-		switch i {
-		case ' ', '\t', '\r', '\n':
-			jsonCharWhitespaceSet.set(i)
-		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'e', 'E', '.', '+', '-':
-			jsonNumSet.set(i)
-		}
-	}
 }
 
 // ----------------
@@ -393,18 +383,18 @@ func (e *jsonEncDriver) quoteStr(s string) {
 
 		// if 0x20 <= b && b != '\\' && b != '"' && b != '<' && b != '>' && b != '&' {
 		// if (htmlasis && jsonCharSafeSet.isset(b)) || jsonCharHtmlSafeSet.isset(b) {
-		b := s[i]
-		if e.s.isset(b) {
+		if e.s.isset(s[i]) {
 			i++
 			continue
 		}
-		if b < utf8.RuneSelf {
+		// b := s[i]
+		if s[i] < utf8.RuneSelf {
 			if start < i {
 				w.writestr(s[start:i])
 			}
-			switch b {
+			switch s[i] {
 			case '\\', '"':
-				w.writen2('\\', b)
+				w.writen2('\\', s[i])
 			case '\n':
 				w.writen2('\\', 'n')
 			case '\r':
@@ -417,7 +407,7 @@ func (e *jsonEncDriver) quoteStr(s string) {
 				w.writen2('\\', 't')
 			default:
 				w.writestr(`\u00`)
-				w.writen2(hex[b>>4], hex[b&0xF])
+				w.writen2(hex[s[i]>>4], hex[s[i]&0xF])
 			}
 			i++
 			start = i
@@ -486,11 +476,6 @@ type jsonDecDriver struct {
 	d Decoder
 }
 
-// func jsonIsWS(b byte) bool {
-// 	// return b == ' ' || b == '\t' || b == '\r' || b == '\n'
-// 	return jsonCharWhitespaceSet.isset(b)
-// }
-
 func (d *jsonDecDriver) decoder() *Decoder {
 	return &d.d
 }
@@ -622,7 +607,7 @@ func (d *jsonDecDriver) readLit4Null() {
 func (d *jsonDecDriver) advance() {
 	if d.tok == 0 {
 		d.fnil = false
-		d.tok = d.d.decRd.skip(&jsonCharWhitespaceSet)
+		d.tok = d.d.decRd.skipWhitespace() // skip(&whitespaceCharBitset)
 	}
 }
 
@@ -713,7 +698,7 @@ func (d *jsonDecDriver) decNumBytes() (bs []byte) {
 		d.readLit4Null()
 	} else {
 		d.d.decRd.unreadn1()
-		bs = d.d.decRd.readTo(&jsonNumSet)
+		bs = d.d.decRd.readTo(&numCharBitset)
 	}
 	d.tok = 0
 	return
@@ -831,14 +816,14 @@ func (d *jsonDecDriver) decBytesFromArray(bs []byte) []byte {
 	}
 	d.tok = 0
 	bs = append(bs, uint8(d.DecodeUint64()))
-	d.tok = d.d.decRd.skip(&jsonCharWhitespaceSet)
+	d.tok = d.d.decRd.skipWhitespace() // skip(&whitespaceCharBitset)
 	for d.tok != ']' {
 		if d.tok != ',' {
 			d.d.errorf("read array element - expect char '%c' but got char '%c'", ',', d.tok)
 		}
 		d.tok = 0
 		bs = append(bs, uint8(chkOvf.UintV(d.DecodeUint64(), 8)))
-		d.tok = d.d.decRd.skip(&jsonCharWhitespaceSet)
+		d.tok = d.d.decRd.skipWhitespace() // skip(&whitespaceCharBitset)
 	}
 	d.tok = 0
 	return bs

+ 109 - 17
codec/reader.go

@@ -9,22 +9,29 @@ import "io"
 // read from an io.Reader or directly off a byte slice with zero-copying.
 type decReader interface {
 	unreadn1()
+
 	// readx will use the implementation scratch buffer if possible i.e. n < len(scratchbuf), OR
 	// just return a view of the []byte being decoded from.
 	// Ensure you call detachZeroCopyBytes later if this needs to be sent outside codec control.
 	readx(n uint) []byte
 	readb([]byte)
 	readn1() uint8
+
 	// read up to 7 bytes at a time
 	readn(num uint8) (v [rwNLen]byte)
 	numread() uint // number of bytes read
 	track()
 	stopTrack() []byte
 
+	// skip any whitespace characters, and return the first non-matching byte
+	skipWhitespace() (token byte)
+
 	// skip will skip any byte that matches, and return the first non-matching byte
-	skip(accept *bitset256) (token byte)
+	// skip(accept *bitset256) (token byte)
+
 	// readTo will read any byte that matches, stopping once no-longer matching.
 	readTo(accept *bitset256) (out []byte)
+
 	// readUntil will read, only stopping once it matches the 'stop' byte.
 	readUntil(stop byte, includeLast bool) (out []byte)
 }
@@ -243,14 +250,27 @@ func (z *ioDecReader) readn1() (b uint8) {
 	panic(err)
 }
 
-func (z *ioDecReader) skip(accept *bitset256) (token byte) {
+// func (z *ioDecReader) skip(accept *bitset256) (token byte) {
+// 	var eof bool
+// LOOP:
+// 	token, eof = z.readn1eof()
+// 	if eof {
+// 		return
+// 	}
+// 	if accept.isset(token) {
+// 		goto LOOP
+// 	}
+// 	return
+// }
+
+func (z *ioDecReader) skipWhitespace() (token byte) {
 	var eof bool
 LOOP:
 	token, eof = z.readn1eof()
 	if eof {
 		return
 	}
-	if accept.isset(token) {
+	if isWhitespace(token) {
 		goto LOOP
 	}
 	return
@@ -441,14 +461,64 @@ func (z *bufioDecReader) readx(n uint) (bs []byte) {
 	return
 }
 
-func (z *bufioDecReader) skip(accept *bitset256) (token byte) {
+// func (z *bufioDecReader) skip(accept *bitset256) (token byte) {
+// 	i := z.c
+// LOOP:
+// 	if i < uint(len(z.buf)) {
+// 		// inline z.skipLoopFn(i) and refactor, so cost is within inline budget
+// 		token = z.buf[i]
+// 		i++
+// 		if accept.isset(token) {
+// 			goto LOOP
+// 		}
+// 		z.n += i - 2 - z.c
+// 		if z.trb {
+// 			z.tr = append(z.tr, z.buf[z.c:i]...) // z.doTrack(i)
+// 		}
+// 		z.c = i
+// 		return
+// 	}
+// 	return z.skipFill(accept)
+// }
+
+// func (z *bufioDecReader) skipFill(accept *bitset256) (token byte) {
+// 	z.n += uint(len(z.buf)) - z.c
+// 	if z.trb {
+// 		z.tr = append(z.tr, z.buf[z.c:]...)
+// 	}
+// 	var i, n2 int
+// 	var err error
+// 	for {
+// 		z.c = 0
+// 		z.buf = z.buf[0:cap(z.buf)]
+// 		n2, err = z.r.Read(z.buf)
+// 		if n2 == 0 && err != nil {
+// 			panic(err)
+// 		}
+// 		z.buf = z.buf[:n2]
+// 		for i, token = range z.buf {
+// 			// if !accept.isset(token) {
+// 			if accept.isnotset(token) {
+// 				z.n += (uint(i) - z.c) - 1
+// 				z.loopFn(uint(i + 1))
+// 				return
+// 			}
+// 		}
+// 		z.n += uint(n2)
+// 		if z.trb {
+// 			z.tr = append(z.tr, z.buf...)
+// 		}
+// 	}
+// }
+
+func (z *bufioDecReader) skipWhitespace() (token byte) {
 	i := z.c
 LOOP:
 	if i < uint(len(z.buf)) {
 		// inline z.skipLoopFn(i) and refactor, so cost is within inline budget
 		token = z.buf[i]
 		i++
-		if accept.isset(token) {
+		if isWhitespace(token) {
 			goto LOOP
 		}
 		z.n += i - 2 - z.c
@@ -458,10 +528,10 @@ LOOP:
 		z.c = i
 		return
 	}
-	return z.skipFill(accept)
+	return z.skipFillWhitespace()
 }
 
-func (z *bufioDecReader) skipFill(accept *bitset256) (token byte) {
+func (z *bufioDecReader) skipFillWhitespace() (token byte) {
 	z.n += uint(len(z.buf)) - z.c
 	if z.trb {
 		z.tr = append(z.tr, z.buf[z.c:]...)
@@ -477,8 +547,7 @@ func (z *bufioDecReader) skipFill(accept *bitset256) (token byte) {
 		}
 		z.buf = z.buf[:n2]
 		for i, token = range z.buf {
-			// if !accept.isset(token) {
-			if accept.check(token) == 0 {
+			if !isWhitespace(token) {
 				z.n += (uint(i) - z.c) - 1
 				z.loopFn(uint(i + 1))
 				return
@@ -503,7 +572,7 @@ func (z *bufioDecReader) readTo(accept *bitset256) (out []byte) {
 LOOP:
 	if i < uint(len(z.buf)) {
 		// if !accept.isset(z.buf[i]) {
-		if accept.check(z.buf[i]) == 0 {
+		if accept.isnotset(z.buf[i]) {
 			// inline readToLoopFn here (for performance)
 			z.n += (i - z.c) - 1
 			out = z.buf[z.c:i]
@@ -541,7 +610,7 @@ func (z *bufioDecReader) readToFill(accept *bitset256) []byte {
 		z.buf = z.buf[:n2]
 		for i, token := range z.buf {
 			// if !accept.isset(token) {
-			if accept.check(token) == 0 {
+			if accept.isnotset(token) {
 				z.n += (uint(i) - z.c) - 1
 				z.bufr = append(z.bufr, z.buf[z.c:i]...)
 				z.loopFn(uint(i))
@@ -684,13 +753,26 @@ LOOP:
 	return
 }
 
-func (z *bytesDecReader) skip(accept *bitset256) (token byte) {
+// func (z *bytesDecReader) skip(accept *bitset256) (token byte) {
+// 	i := z.c
+// LOOP:
+// 	// if i < uint(len(z.b)) {
+// 	token = z.b[i]
+// 	i++
+// 	if accept.isset(token) {
+// 		goto LOOP
+// 	}
+// 	z.c = i
+// 	return
+// }
+
+func (z *bytesDecReader) skipWhitespace() (token byte) {
 	i := z.c
 LOOP:
 	// if i < uint(len(z.b)) {
 	token = z.b[i]
 	i++
-	if accept.isset(token) {
+	if isWhitespace(token) {
 		goto LOOP
 	}
 	z.c = i
@@ -859,16 +941,26 @@ func (z *decRd) readn1() uint8 {
 	}
 }
 
-func (z *decRd) skip(accept *bitset256) (token byte) {
+func (z *decRd) skipWhitespace() (token byte) {
 	if z.bytes {
-		return z.rb.skip(accept)
+		return z.rb.skipWhitespace()
 	} else if z.bufio {
-		return z.bi.skip(accept)
+		return z.bi.skipWhitespace()
 	} else {
-		return z.ri.skip(accept)
+		return z.ri.skipWhitespace()
 	}
 }
 
+// func (z *decRd) skip(accept *bitset256) (token byte) {
+// 	if z.bytes {
+// 		return z.rb.skip(accept)
+// 	} else if z.bufio {
+// 		return z.bi.skip(accept)
+// 	} else {
+// 		return z.ri.skip(accept)
+// 	}
+// }
+
 func (z *decRd) readTo(accept *bitset256) (out []byte) {
 	if z.bytes {
 		return z.rb.readTo(accept)

+ 1 - 3
go.mod

@@ -1,5 +1,3 @@
 module github.com/ugorji/go
 
-require (
-github.com/ugorji/go/codec v1.1.7
-)
+require github.com/ugorji/go/codec v1.1.7