Browse Source

Rework fast matching

This makes several changes that in combination gives close to the same compression, but with a big speedup in most cases.

We change the hash table to contain hashes of 6 bytes. The speed is about the same, but this usually gives better compression since hashes are of better quality. This typically also makes the content faster to decode since longer matches are preferred.

Hash table size is now defined separately from window size. I found that 16 bits was a good value, especially since the better hash table opens up other optimizations.

We check 3 bytes, then skip one (plus more if data is hard to compress). This gives most of the speedup, but also loses us some compression.

We index 2 bytes before the end of each match. This doesn't impact speed much and gives a nice compression boost.

This combines well with #49 (not included in this benchmark)

Now for the numbers. They are all before/after, best of 2 runs.
```
file	out	level	insize	outsize	millis	mb/s
consensus.db.10gb	lz4	0	10737418240	5057961420	35446	288.88
consensus.db.10gb	lz4	0	10737418240	5077608378	23226	440.87

file	out	level	insize	outsize	millis	mb/s
rawstudio-mint14.tar	lz4	0	8558382592	4568741520	25369	321.73
rawstudio-mint14.tar	lz4	0	8558382592	4592776475	17168	475.41

file	out	level	insize	outsize	millis	mb/s
github-ranks-backup.bin	lz4	0	1862623243	579273817	4074	436.02
github-ranks-backup.bin	lz4	0	1862623243	627056167	3522	504.35

file	out	level	insize	outsize	millis	mb/s
github-june-2days-2019.json	lz4	0	6273951764	1355117284	10763	555.86
github-june-2days-2019.json	lz4	0	6273951764	1293582359	9136	654.91

file	out	level	insize	outsize	millis	mb/s
gob-stream	lz4	0	1911399616	384235547	3481	523.66
gob-stream	lz4	0	1911399616	384292384	2827	644.80

file	out	level	insize	outsize	millis	mb/s
10gb.tar	lz4	0	10065157632	6481808453	23629	406.23
10gb.tar	lz4	0	10065157632	5902162074	22592	424.88

file	out	level	insize	outsize	millis	mb/s
enwik9	lz4	0	1000000000	489160425	3733	255.47
enwik9	lz4	0	1000000000	482276927	3520	270.93

file	out	level	insize	outsize	millis	mb/s
silesia.tar	lz4	0	211947520	99218419	691	292.51
silesia.tar	lz4	0	211947520	96766005	590	342.01

file	out	level	insize	outsize	millis	mb/s
sharnd.out	lz4	0	500000000	500000495	169	2821.52
sharnd.out	lz4	0	500000000	500000495	166	2872.51
```

Only [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) has a significant size increase. The others are very close or better than before.

All show a minor to significant speedup.
Klaus Post 6 years ago
parent
commit
a207029655
5 changed files with 81 additions and 33 deletions
  1. 2 2
      bench_test.go
  2. 63 20
      block.go
  3. 13 8
      block_test.go
  4. 3 3
      lz4.go
  5. BIN
      testdata/upperbound.data

+ 2 - 2
bench_test.go

@@ -10,7 +10,7 @@ import (
 )
 
 func BenchmarkCompress(b *testing.B) {
-	var hashTable [1 << 16]int
+	var hashTable [htSize]int
 	buf := make([]byte, len(pg1661))
 
 	b.ReportAllocs()
@@ -22,7 +22,7 @@ func BenchmarkCompress(b *testing.B) {
 }
 
 func BenchmarkCompressRandom(b *testing.B) {
-	var hashTable [1 << 16]int
+	var hashTable [htSize]int
 	buf := make([]byte, len(randomLZ4))
 
 	b.ReportAllocs()

+ 63 - 20
block.go

@@ -2,13 +2,14 @@ package lz4
 
 import (
 	"encoding/binary"
+	"fmt"
 	"math/bits"
 )
 
-// blockHash hashes 4 bytes into a value < winSize.
-func blockHash(x uint32) uint32 {
-	const hasher uint32 = 2654435761 // Knuth multiplicative hash.
-	return x * hasher >> hashShift
+// blockHash hashes the lower 6 bytes into a value < htSize.
+func blockHash(x uint64) uint32 {
+	const prime6bytes = 227718039650203
+	return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
 }
 
 // CompressBlockBound returns the maximum size of a given buffer of size n, when not compressible.
@@ -46,33 +47,62 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
 	// This significantly speeds up incompressible data and usually has very small impact on compresssion.
 	// bytes to skip =  1 + (bytes since last match >> adaptSkipLog)
 	const adaptSkipLog = 7
-
 	sn, dn := len(src)-mfLimit, len(dst)
 	if sn <= 0 || dn == 0 {
 		return 0, nil
 	}
-	var si int
-
-	// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
+	if len(hashTable) < htSize {
+		return 0, fmt.Errorf("hash table too small, should be at least %d in size", htSize)
+	}
+	// Prove to the compiler the table has at least htSize elements.
+	// The compiler can see that "uint32() >> hashShift" cannot be out of bounds.
+	hashTable = hashTable[:htSize]
 
-	anchor := si // Position of the current literals.
+	// si: Current position of the search.
+	// anchor: Position of the current literals.
+	var si, anchor int
 
+	// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
 	for si < sn {
-		// Hash the next 4 bytes (sequence)...
-		match := binary.LittleEndian.Uint32(src[si:])
+		// Hash the next 6 bytes (sequence)...
+		match := binary.LittleEndian.Uint64(src[si:])
 		h := blockHash(match)
+		h2 := blockHash(match >> 8)
 
+		// We check a match at s, s+1 and s+2 and pick the first one we get.
+		// Checking 3 only requires us to load the source one.
 		ref := hashTable[h]
+		ref2 := hashTable[h2]
 		hashTable[h] = si
-		if ref >= sn { // Invalid reference (dirty hashtable).
-			si += 1 + (si-anchor)>>adaptSkipLog
-			continue
-		}
+		hashTable[h2] = si + 1
 		offset := si - ref
+
+		// If offset <= 0 we got an old entry in the hash table.
 		if offset <= 0 || offset >= winSize || // Out of window.
-			match != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
-			si += 1 + (si-anchor)>>adaptSkipLog
-			continue
+			uint32(match) != binary.LittleEndian.Uint32(src[ref:]) { // Hash collision on different matches.
+			// No match. Start calculating another hash.
+			// The processor can usually do this out-of-order.
+			h = blockHash(match >> 16)
+			ref = hashTable[h]
+
+			// Check the second match at si+1
+			si += 1
+			offset = si - ref2
+
+			if offset <= 0 || offset >= winSize ||
+				uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
+				// No match. Check the third match at si+2
+				si += 1
+				offset = si - ref
+				hashTable[h] = si
+
+				if offset <= 0 || offset >= winSize ||
+					uint32(match>>16) != binary.LittleEndian.Uint32(src[ref:]) {
+					// Skip one extra byte (at si+3) before we check 3 matches again.
+					si += 2 + (si-anchor)>>adaptSkipLog
+					continue
+				}
+			}
 		}
 
 		// Match found.
@@ -134,6 +164,13 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
 			dst[di] = byte(mLen)
 			di++
 		}
+		// Check if we can load next values.
+		if si >= sn {
+			break
+		}
+		// Hash match end-2
+		h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
+		hashTable[h] = si - 2
 	}
 
 	if anchor == 0 {
@@ -165,6 +202,12 @@ func CompressBlock(src, dst []byte, hashTable []int) (di int, err error) {
 	return di, nil
 }
 
+// blockHash hashes 4 bytes into a value < winSize.
+func blockHashHC(x uint32) uint32 {
+	const hasher uint32 = 2654435761 // Knuth multiplicative hash.
+	return x * hasher >> (32 - winSizeLog)
+}
+
 // CompressBlockHC compresses the source buffer src into the destination dst
 // with max search depth (use 0 or negative value for no max).
 //
@@ -199,7 +242,7 @@ func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
 	for si < sn {
 		// Hash the next 4 bytes (sequence).
 		match := binary.LittleEndian.Uint32(src[si:])
-		h := blockHash(match)
+		h := blockHashHC(match)
 
 		// Follow the chain until out of window and give the longest match.
 		mLen := 0
@@ -251,7 +294,7 @@ func CompressBlockHC(src, dst []byte, depth int) (di int, err error) {
 		for si, ml := winStart, si+mLen; si < ml; {
 			match >>= 8
 			match |= uint32(src[si+3]) << 24
-			h := blockHash(match)
+			h := blockHashHC(match)
 			chainTable[si&winMask] = hashTable[h]
 			hashTable[h] = si
 			si++

+ 13 - 8
block_test.go

@@ -11,8 +11,11 @@ import (
 	"github.com/pierrec/lz4"
 )
 
-// Hash table size.
-const htSize = 1 << 16 // 64kb
+const (
+	// Should match values in lz4.go
+	hashLog = 16
+	htSize  = 1 << hashLog
+)
 
 type testcase struct {
 	file         string
@@ -22,11 +25,11 @@ type testcase struct {
 
 var rawFiles = []testcase{
 	// {"testdata/207326ba-36f8-11e7-954a-aca46ba8ca73.png", true, nil},
-	{"testdata/e.txt", true, nil},
+	{"testdata/e.txt", false, nil},
 	{"testdata/gettysburg.txt", true, nil},
 	{"testdata/Mark.Twain-Tom.Sawyer.txt", true, nil},
 	{"testdata/pg1661.txt", true, nil},
-	{"testdata/pi.txt", true, nil},
+	{"testdata/pi.txt", false, nil},
 	{"testdata/random.data", false, nil},
 	{"testdata/repeat.txt", true, nil},
 	{"testdata/pg1661.txt", true, nil},
@@ -125,10 +128,12 @@ func TestCompressCornerCase_CopyDstUpperBound(t *testing.T) {
 		t.Helper()
 
 		// Compress the data.
-		zbuf := make([]byte, int(float64(len(src))*0.85))
+		// We provide a destination that is too small to trigger an out-of-bounds,
+		// which makes it return the error we want.
+		zbuf := make([]byte, int(float64(len(src))*0.40))
 		_, err := compress(src, zbuf)
 		if err != lz4.ErrInvalidSourceShortBuffer {
-			t.Fatal("err should be ErrInvalidSourceShortBuffer")
+			t.Fatal("err should be ErrInvalidSourceShortBuffer, was", err)
 		}
 	}
 
@@ -154,9 +159,9 @@ func TestCompressCornerCase_CopyDstUpperBound(t *testing.T) {
 }
 
 func TestIssue23(t *testing.T) {
-	compressBuf := make([]byte, lz4.CompressBlockBound(htSize))
+	compressBuf := make([]byte, lz4.CompressBlockBound(1<<16))
 	for j := 1; j < 16; j++ {
-		var buf [htSize]byte
+		var buf [1 << 16]byte
 		var ht [htSize]int
 
 		for i := 0; i < len(buf); i += j {

+ 3 - 3
lz4.go

@@ -30,9 +30,9 @@ const (
 	// hashLog determines the size of the hash table used to quickly find a previous match position.
 	// Its value influences the compression speed and memory usage, the lower the faster,
 	// but at the expense of the compression ratio.
-	// 16 seems to be the best compromise.
-	hashLog   = 16
-	hashShift = uint((minMatch * 8) - hashLog)
+	// 16 seems to be the best compromise for fast compression.
+	hashLog = 16
+	htSize  = 1 << hashLog
 
 	mfLimit = 8 + minMatch // The last match cannot start within the last 12 bytes.
 )

BIN
testdata/upperbound.data