Browse Source

Skip incompressible data

Make the compressor skip incompressible data.

This is a great feature for a compressor, since it makes it much better at handling mixed data. When the compressor is fed pre-compressed or random data, it will skip over it very fast.

I have made the skipping adaptive: the number of bytes skipped grows with the distance since the last match (controlled by `adaptSkipLog`), so compressible data is mostly unaffected while incompressible data is skipped quickly.

The benchmark functions appear broken, so I added a simple one for this.

```
benchmark                     old ns/op     new ns/op     delta
BenchmarkCompressRandom-8     83659         2641          -96.84%

benchmark                     old MB/s     new MB/s     speedup
BenchmarkCompressRandom-8     195.84       6201.83      31.67x

benchmark                     old allocs     new allocs     delta
BenchmarkCompressRandom-8     0              0              +0.00%

benchmark                     old bytes     new bytes     delta
BenchmarkCompressRandom-8     0             0             +0.00%
```

There is a small penalty in terms of compression, but for a compressor like LZ4 that is mainly made for speed, this seems reasonable. Some examples:

```
Before: (mostly compressible)
file	out	level	insize	outsize	millis	mb/s
enwik9	lz4	0	1000000000	489160485	6968	136.86
enwik9	lz4	1	1000000000	472338434	10406	91.65
enwik9	lz4	2	1000000000	405391862	20422	46.70

After:
file	out	level	insize	outsize	millis	mb/s
enwik9	lz4	0	1000000000	489160479	6811	140.00
enwik9	lz4	1	1000000000	472338501	10583	90.11
enwik9	lz4	2	1000000000	405391924	20386	46.78

Before: (mixed contents)
file	out	level	insize	outsize	millis	mb/s
10gb.tar	lz4	0	10065157632	6441120237	71134	134.94
10gb.tar	lz4	1	10065157632	6369810447	117489	81.70
10gb.tar	lz4	2	10065157632	5697686196	266037	36.08

After:
file	out	level	insize	outsize	millis	mb/s
10gb.tar	lz4	0	10065157632	6481808667	49918	192.29
10gb.tar	lz4	1	10065157632	6410897705	72483	132.43
10gb.tar	lz4	2	10065157632	5741454818	219220	43.79

Before: (Random data)
file	out	level	insize	outsize	millis	mb/s
sharnd.out	lz4	0	500000000	500000495	5894	80.89
sharnd.out	lz4	1	500000000	500000495	9849	48.41
sharnd.out	lz4	2	500000000	500000495	9692	49.20

After:
file	out	level	insize	outsize	millis	mb/s
sharnd.out	lz4	0	500000000	500000495	442	1078.81
sharnd.out	lz4	1	500000000	500000495	410	1160.19
sharnd.out	lz4	2	500000000	500000495	414	1151.78

Before (very compressible)
file	out	level	insize	outsize	millis	mb/s
adresser.json	lz4	0	7983034785	481827641	11168	681.70
adresser.json	lz4	1	7983034785	522398797	31439	242.16
adresser.json	lz4	2	7983034785	380853678	36492	208.62

After:
file	out	level	insize	outsize	millis	mb/s
adresser.json	lz4	0	7983034785	481827641	11195	679.99
adresser.json	lz4	1	7983034785	522398797	31846	239.06
adresser.json	lz4	2	7983034785	380853678	37828	201.26
```
Klaus Post 6 years ago
parent
commit
e1405867e2
2 changed files with 26 additions and 3 deletions
  1. 13 0
      bench_test.go
  2. 13 3
      block.go

+ 13 - 0
bench_test.go

@@ -21,6 +21,19 @@ func BenchmarkCompress(b *testing.B) {
 	}
 }
 
+func BenchmarkCompressRandom(b *testing.B) {
+	var hashTable [1 << 16]int
+	buf := make([]byte, len(randomLZ4))
+
+	b.ReportAllocs()
+	b.SetBytes(int64(len(random)))
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		lz4.CompressBlock(random, buf, hashTable[:])
+	}
+}
+
 func BenchmarkCompressHC(b *testing.B) {
 	buf := make([]byte, len(pg1661))
 

+ 13 - 3
block.go

@@ -41,6 +41,11 @@ func UncompressBlock(src, dst []byte) (int, error) {
 func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
 	defer recoverBlock(&err)
 
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip =  1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
+
 	sn, dn := len(src)-mfLimit, len(dst)
 	if sn <= 0 || dn == 0 {
 		return 0, nil
@@ -59,13 +64,13 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
 		ref := hashTable[h]
 		hashTable[h] = si
 		if ref >= sn { // Invalid reference (dirty hashtable).
-			si++
+			si += 1 + (si-anchor)>>adaptSkipLog
 			continue
 		}
 		offset := si - ref
 		if offset <= 0 || offset >= winSize || // Out of window.
 			match != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
-			si++
+			si += 1 + (si-anchor)>>adaptSkipLog
 			continue
 		}
 
@@ -167,6 +172,11 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
 func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
 	defer recoverBlock(&err)
 
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip =  1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
+
 	sn, dn := len(src)-mfLimit, len(dst)
 	if sn <= 0 || dn == 0 {
 		return 0, nil
@@ -219,7 +229,7 @@ func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
 
 		// No match found.
 		if mLen == 0 {
-			si++
+			si += 1 + (si-anchor)>>adaptSkipLog
 			continue
 		}