Bladeren bron

Raise the "always encode as literal" size threshold from 4 to 14.

This isn't an optimization per se, although it does trade off the
"encode 10 bytes" benchmark to favor speed over output size. The point
of this commit is to move closer to what the C++ snappy code does.

benchmark                     old MB/s     new MB/s     speedup
BenchmarkWordsEncode1e1-8     5.77         674.93       116.97x
BenchmarkWordsEncode1e2-8     47.96        47.92        1.00x
BenchmarkWordsEncode1e3-8     190.33       189.48       1.00x
BenchmarkWordsEncode1e4-8     190.25       193.17       1.02x
BenchmarkWordsEncode1e5-8     150.65       151.44       1.01x
BenchmarkWordsEncode1e6-8     180.11       180.63       1.00x
BenchmarkRandomEncode-8       4782.70      4700.25      0.98x
Benchmark_ZFlat0-8            372.49       372.12       1.00x
Benchmark_ZFlat1-8            186.49       187.62       1.01x
Benchmark_ZFlat2-8            4979.47      4891.26      0.98x
Benchmark_ZFlat3-8            85.76        86.16        1.00x
Benchmark_ZFlat4-8            566.31       570.31       1.01x
Benchmark_ZFlat5-8            366.01       366.84       1.00x
Benchmark_ZFlat6-8            162.13       164.18       1.01x
Benchmark_ZFlat7-8            153.69       155.23       1.01x
Benchmark_ZFlat8-8            167.91       169.62       1.01x
Benchmark_ZFlat9-8            147.71       149.43       1.01x
Benchmark_ZFlat10-8           414.06       412.63       1.00x
Benchmark_ZFlat11-8           248.87       247.98       1.00x
Nigel Tao 9 jaar geleden
bovenliggende
commit
ebebc71721
1 gewijzigd bestand met 27 toevoegingen en 7 verwijderingen
  1. 27 7
      encode.go

+ 27 - 7
encode.go

@@ -119,24 +119,44 @@ func Encode(dst, src []byte) []byte {
 		if len(p) > maxBlockSize {
 			p, src = p[:maxBlockSize], p[maxBlockSize:]
 		}
-		d += encodeBlock(dst[d:], p)
+		if len(p) < minBlockSize {
+			d += emitLiteral(dst[d:], p)
+		} else {
+			d += encodeBlock(dst[d:], p)
+		}
 	}
 	return dst[:d]
 }
 
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop. On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be implemented as a single load to and store from a 16-byte
+// register. That literal's actual length can be as short as 1 byte, so this
+// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
+// the encoding loop will fix up the copy overrun, and this inputMargin ensures
+// that we don't overrun the dst and src buffers.
+//
+// TODO: implement this fast path.
+//
+// TODO: actually use inputMargin inside encodeBlock.
+const inputMargin = 16 - 1
+
+// minBlockSize is the minimum size of the input to encodeBlock. As above, we
+// want any emitLiteral calls inside encodeBlock's inner loop to use the fast
+// path if possible, which requires being able to overrun by inputMargin bytes.
+//
+// TODO: can we make this bound a little tighter, raising it by 1 or 2?
+const minBlockSize = inputMargin
+
 // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
 // assumes that the varint-encoded length of the decompressed bytes has already
 // been written.
 //
 // It also assumes that:
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	0 < len(src) && len(src) <= maxBlockSize
+// 	minBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlock(dst, src []byte) (d int) {
-	// Return early if src is short.
-	if len(src) <= 4 {
-		return emitLiteral(dst, src)
-	}
-
 	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
 	const maxTableSize = 1 << 14
 	shift, tableSize := uint(32-8), 1<<8