
implement block decode in assembly

~2x faster

name                old time/op    new time/op     delta
UncompressPg1661-4    1.62ms ± 2%     0.61ms ± 1%    -62.14%  (p=0.000 n=10+9)
UncompressDigits-4    98.3µs ± 0%     57.2µs ± 0%    -41.76%  (p=0.000 n=10+10)
UncompressTwain-4     1.05ms ± 1%     0.39ms ± 0%    -62.61%  (p=0.000 n=10+9)
UncompressRand-4      4.16µs ± 1%     4.18µs ± 0%     +0.45%  (p=0.004 n=9+9)

name                old speed      new speed       delta
UncompressPg1661-4   367MB/s ± 2%    968MB/s ± 1%   +164.12%  (p=0.000 n=10+9)
UncompressDigits-4  1.02GB/s ± 0%   1.75GB/s ± 0%    +71.72%  (p=0.000 n=10+10)
UncompressTwain-4    371MB/s ± 1%    992MB/s ± 0%   +167.47%  (p=0.000 n=10+9)
UncompressRand-4    3.94GB/s ± 1%   3.92GB/s ± 0%     -0.45%  (p=0.006 n=9+9)

name                old alloc/op   new alloc/op    delta
UncompressPg1661-4     8.00B ± 0%      4.00B ± 0%    -50.00%  (p=0.000 n=9+8)
UncompressDigits-4     0.00B           0.00B            ~     (all equal)
UncompressTwain-4     2.40B ±100%     0.40B ±150%       ~     (p=0.077 n=10+10)
UncompressRand-4       0.00B           0.00B            ~     (all equal)

name                old allocs/op  new allocs/op   delta
UncompressPg1661-4      0.00            0.00            ~     (all equal)
UncompressDigits-4      0.00            0.00            ~     (all equal)
UncompressTwain-4       0.00            0.00            ~     (all equal)
UncompressRand-4        0.00            0.00            ~     (all equal)
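
The tables above are benchstat output comparing go test -bench runs before and after the change. For orientation only, a benchmark of this shape could produce such numbers; this is a sketch, not the repository's benchmark code, and the file path and setup are assumptions:

package lz4_test

import (
	"io/ioutil"
	"testing"

	"github.com/pierrec/lz4"
)

func BenchmarkUncompressPg1661(b *testing.B) {
	src, err := ioutil.ReadFile("testdata/pg1661.txt")
	if err != nil {
		b.Fatal(err)
	}

	// Compress once up front so only UncompressBlock is measured.
	var ht [1 << 16]int
	zbuf := make([]byte, lz4.CompressBlockBound(len(src)))
	zn, err := lz4.CompressBlock(src, zbuf, ht[:])
	if err != nil {
		b.Fatal(err)
	}
	zbuf = zbuf[:zn]

	buf := make([]byte, len(src))
	b.SetBytes(int64(len(src)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if _, err := lz4.UncompressBlock(zbuf, buf); err != nil {
			b.Fatal(err)
		}
	}
}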
Chris Bannister · 7 years ago · commit 9a39efadad
6 changed files with 547 additions and 72 deletions

  1. block.go (+12 -63)
  2. block_test.go (+27 -9)
  3. decode_amd64.go (+8 -0)
  4. decode_amd64.s (+291 -0)
  5. decode_other.go (+72 -0)
  6. decode_test.go (+137 -0)

+ 12 - 63
block.go

@@ -30,75 +30,24 @@ func CompressBlockBound(n int) int {
 // The destination buffer must be sized appropriately.
 //
 // An error is returned if the source data is invalid or the destination buffer is too small.
-func UncompressBlock(src, dst []byte) (si int, err error) {
-	defer func() {
-		// It is now faster to let the runtime panic and recover on out of bound slice access
-		// than checking indices as we go along.
-		if recover() != nil {
-			err = ErrInvalidSourceShortBuffer
-		}
-	}()
+func UncompressBlock(src, dst []byte) (di int, err error) {
 	sn := len(src)
 	if sn == 0 {
 		return 0, nil
 	}
-	var di int
 
-	for {
-		// Literals and match lengths (token).
-		b := int(src[si])
-		si++
-
-		// Literals.
-		if lLen := b >> 4; lLen > 0 {
-			if lLen == 0xF {
-				for src[si] == 0xFF {
-					lLen += 0xFF
-					si++
-				}
-				lLen += int(src[si])
-				si++
-			}
-			i := si
-			si += lLen
-			di += copy(dst[di:di+si-i], src[i:si])
-
-			if si >= sn {
-				return di, nil
-			}
-		}
-
-		si++
-		_ = src[si] // Bound check elimination.
-		offset := int(src[si-1]) | int(src[si])<<8
-		si++
-
-		// Match.
-		mLen := b & 0xF
-		if mLen == 0xF {
-			for src[si] == 0xFF {
-				mLen += 0xFF
-				si++
-			}
-			mLen += int(src[si])
-			si++
-		}
-		mLen += minMatch
-
-		// Copy the match.
-		i := di - offset
-		if offset > 0 && mLen >= offset {
-			// Efficiently copy the match dst[di-offset:di] into the dst slice.
-			bytesToCopy := offset * (mLen / offset)
-			expanded := dst[i:]
-			for n := offset; n <= bytesToCopy+offset; n *= 2 {
-				copy(expanded[n:], expanded[:n])
-			}
-			di += bytesToCopy
-			mLen -= bytesToCopy
-		}
-		di += copy(dst[di:di+mLen], dst[i:i+mLen])
+	di = decodeBlock(dst, src)
+	switch di {
+	case -1:
+		return 0, errors.New("TODO: bad data")
+	case -2:
+		// this is short dst really
+		return 0, ErrInvalidSourceShortBuffer
 	}
+	if di < 0 {
+		return 0, ErrInvalidSourceShortBuffer
+	}
+	return di, nil
 }
 
 // CompressBlock compresses the source buffer into the destination one.

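For reference, a minimal caller of the rewritten UncompressBlock (a sketch, not part of the commit): dst must already be sized for the decompressed data, and the returned count is the number of bytes written into it.

package main

import (
	"fmt"

	"github.com/pierrec/lz4"
)

func main() {
	// A tiny hand-built LZ4 block: token 0x50 = 5 literals, no match.
	block := []byte{0x50, 'h', 'e', 'l', 'l', 'o'}
	buf := make([]byte, 5) // caller must know (or bound) the decompressed size
	n, err := lz4.UncompressBlock(block, buf)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(buf[:n])) // hello
}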
+ 27 - 9
block_test.go

@@ -3,9 +3,9 @@
 package lz4_test
 
 import (
+	"bytes"
 	"fmt"
 	"io/ioutil"
-	"reflect"
 	"testing"
 
 	"github.com/pierrec/lz4"
@@ -26,12 +26,13 @@ var rawFiles = []testcase{
 	{"testdata/pi.txt", true, nil},
 	{"testdata/random.data", false, nil},
 	{"testdata/repeat.txt", true, nil},
+	{"testdata/pg1661.txt", true, nil},
 }
 
 func TestCompressUncompressBlock(t *testing.T) {
 	type compressor func(s, d []byte) (int, error)
 
-	run := func(tc testcase, compress compressor) int {
+	run := func(t *testing.T, tc testcase, compress compressor) int {
 		t.Helper()
 		src := tc.src
 
@@ -59,10 +60,25 @@ func TestCompressUncompressBlock(t *testing.T) {
 		n, err = lz4.UncompressBlock(zbuf, buf)
 		if err != nil {
 			t.Fatal(err)
+		} else if n < 0 || n > len(buf) {
+			t.Fatalf("returned written bytes > len(buf): n=%d available=%d", n, len(buf))
+		} else if n != len(src) {
+			t.Errorf("expected to decompress into %d bytes got %d", len(src), n)
 		}
+
 		buf = buf[:n]
-		if !reflect.DeepEqual(src, buf) {
-			t.Error("uncompressed compressed data not matching initial input")
+		if !bytes.Equal(src, buf) {
+			var c int
+			for i, b := range buf {
+				if c > 10 {
+					break
+				}
+				if src[i] != b {
+					t.Errorf("%d: exp(%x) != got(%x)", i, src[i], buf[i])
+					c++
+				}
+			}
+			t.Fatal("uncompressed compressed data not matching initial input")
 			return 0
 		}
 
@@ -80,20 +96,22 @@ func TestCompressUncompressBlock(t *testing.T) {
 		t.Run("", func(t *testing.T) {
 			tc := tc
 			t.Run(tc.file, func(t *testing.T) {
-				t.Parallel()
-				n = run(tc, func(src, dst []byte) (int, error) {
+				// t.Parallel()
+				n = run(t, tc, func(src, dst []byte) (int, error) {
 					var ht [1 << 16]int
 					return lz4.CompressBlock(src, dst, ht[:])
 				})
 			})
 			t.Run(fmt.Sprintf("%s HC", tc.file), func(t *testing.T) {
-				t.Parallel()
-				nhc = run(tc, func(src, dst []byte) (int, error) {
+				// t.Parallel()
+				nhc = run(t, tc, func(src, dst []byte) (int, error) {
 					return lz4.CompressBlockHC(src, dst, -1)
 				})
 			})
 		})
-		fmt.Printf("%-40s: %8d / %8d / %8d\n", tc.file, n, nhc, len(src))
+		if !t.Failed() {
+			t.Logf("%-40s: %8d / %8d / %8d\n", tc.file, n, nhc, len(src))
+		}
 	}
 }
 

+ 8 - 0
decode_amd64.go

@@ -0,0 +1,8 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package lz4
+
+//go:noescape
+func decodeBlock(dst, src []byte) int

+ 291 - 0
decode_amd64.s

@@ -0,0 +1,291 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// AX scratch
+// BX scratch
+// CX scratch
+// DX token
+//
+// DI &dst
+// SI &src
+// R8 &dst + len(dst)
+// R9 &src + len(src)
+// func decodeBlock(dst, src []byte) int
+// using 50 bytes of stack currently
+TEXT ·decodeBlock(SB), NOSPLIT, $64-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, R11
+	MOVQ dst_len+8(FP), R8
+	ADDQ DI, R8
+
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R9
+	ADDQ SI, R9
+
+loop:
+	// for si < len(src)
+	CMPQ SI, R9
+	JGE end
+
+	// token := uint32(src[si])
+	MOVBQZX (SI), DX
+	INCQ SI
+
+	// lit_len = token >> 4
+	// if lit_len > 0
+	// CX = lit_len
+	MOVQ DX, CX
+	SHRQ $4, CX
+
+	// if lit_len > 0
+	CMPQ CX, $0
+	JEQ offset
+
+	// if lit_len != 0xF
+	CMPQ CX, $0xF
+	JNE copy_literal
+
+lit_len_loop:
+	// for src[si] == 0xFF
+	CMPB (SI), $0xFF
+	JNE lit_len_finalise
+
+	// bounds check src[si+1]
+	MOVQ SI, AX
+	ADDQ $1, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	// lit_len += 0xFF
+	ADDQ $0xFF, CX
+	INCQ SI
+	JMP lit_len_loop
+
+lit_len_finalise:
+	// lit_len += int(src[si])
+	// si++
+	MOVBQZX (SI), AX
+	ADDQ AX, CX
+	INCQ SI
+
+copy_literal:
+	// bounds check src and dst
+	MOVQ SI, AX
+	ADDQ CX, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	MOVQ DI, AX
+	ADDQ CX, AX
+	CMPQ AX, R8
+	JGT err_short_buf
+
+	// whats a good cut off to call memmove?
+	CMPQ CX, $16
+	JGT memmove_lit
+
+	// if len(dst[di:]) < 16
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $16
+	JLT memmove_lit
+
+	// if len(src[si:]) < 16
+	MOVQ R9, AX
+	SUBQ SI, AX
+	CMPQ AX, $16
+	JLT memmove_lit
+
+	MOVOU (SI), X0
+	MOVOU X0, (DI)
+
+	JMP finish_lit_copy
+
+memmove_lit:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP) // need len to inc SI, DI after
+	MOVB DX, 48(SP)
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVB 48(SP), DX
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+
+finish_lit_copy:
+	ADDQ CX, SI
+	ADDQ CX, DI
+
+	CMPQ SI, R9
+	JGE end
+
+offset:
+	// CX := mLen
+	// free up DX to use for offset
+	MOVQ DX, CX
+
+	MOVQ SI, AX
+	ADDQ $2, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	// offset
+	// DX := int(src[si]) | int(src[si+1])<<8
+	MOVWQZX (SI), DX
+	ADDQ $2, SI
+
+	// 0 offset is invalid
+	CMPQ DX, $0
+	JEQ err_corrupt
+
+	// if mlen != 0xF
+	ANDB $0xF, CX
+	CMPB CX, $0xF
+	JNE copy_match
+
+match_len_loop:
+	// for src[si] == 0xFF
+	// lit_len += 0xFF
+	CMPB (SI), $0xFF
+	JNE match_len_finalise
+
+	// bounds check src[si+1]
+	MOVQ SI, AX
+	ADDQ $1, AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	ADDQ $0xFF, CX
+	INCQ SI
+	JMP match_len_loop
+
+match_len_finalise:
+	// lit_len += int(src[si])
+	// si++
+	MOVBQZX (SI), AX
+	ADDQ AX, CX
+	INCQ SI
+
+copy_match:
+	// mLen += minMatch
+	ADDQ $4, CX
+
+	// check we have match_len bytes left in dst
+	// di+match_len < len(dst)
+	MOVQ DI, AX
+	ADDQ CX, AX
+	CMPQ AX, R8
+	JGT err_short_buf
+
+	// DX = offset
+	// CX = match_len
+	// BX = &dst + (di - offset)
+	MOVQ DI, BX
+	SUBQ DX, BX
+
+	// check BX is within dst
+	// if BX < &dst
+	CMPQ BX, R11
+	JLT err_short_buf
+
+	// if offset + match_len < di
+	MOVQ BX, AX
+	ADDQ CX, AX
+	CMPQ DI, AX
+	JGT copy_interior_match
+
+	// AX := len(dst[:di])
+	// MOVQ DI, AX
+	// SUBQ R11, AX
+
+	// copy 16 bytes at a time
+	// if di-offset < 16 copy 16-(di-offset) bytes to di
+	// then do the remaining
+
+copy_match_loop:
+	// for match_len >= 0
+	// dst[di] = dst[i]
+	// di++
+	// i++
+	MOVB (BX), AX
+	MOVB AX, (DI)
+	INCQ DI
+	INCQ BX
+	DECQ CX
+
+	CMPQ CX, $0
+	JGT copy_match_loop
+
+	JMP loop
+
+copy_interior_match:
+	CMPQ CX, $16
+	JGT memmove_match
+
+	// if len(dst[di:]) < 16
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $16
+	JLT memmove_match
+
+	MOVOU (BX), X0
+	MOVOU X0, (DI)
+
+	ADDQ CX, DI
+	JMP loop
+
+memmove_match:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ BX, 8(SP)
+	MOVQ CX, 16(SP)
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP) // need len to inc SI, DI after
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11 // TODO: make these sensible numbers
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+
+	ADDQ CX, DI
+	JMP loop
+
+err_corrupt:
+	MOVQ $-1, ret+48(FP)
+	RET
+
+err_short_buf:
+	MOVQ $-2, ret+48(FP)
+	RET
+
+end:
+	SUBQ R11, DI
+	MOVQ DI, ret+48(FP)
+	RET
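
A pure-Go rendering of copy_match_loop above, for readers who do not speak assembly (illustrative only; the function name is made up): an overlapping match is copied forward one byte at a time, so bytes just written become the source of later ones and the pattern repeats.

package main

import "fmt"

// overlapCopy mirrors copy_match_loop: a simple forward byte-by-byte copy.
func overlapCopy(dst []byte, di, offset, mLen int) int {
	i := di - offset
	for ; mLen > 0; mLen-- {
		dst[di] = dst[i]
		di++
		i++
	}
	return di
}

func main() {
	dst := make([]byte, 6)
	dst[0] = 'a'
	n := overlapCopy(dst, 1, 1, 5) // offset 1, match length 5
	fmt.Println(string(dst[:n]))   // aaaaaa
}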

+ 72 - 0
decode_other.go

@@ -0,0 +1,72 @@
+// +build !amd64 appengine !gc noasm
+
+package lz4
+
+func decodeBlock(dst, src []byte) (ret int) {
+	defer func() {
+		// It is now faster to let the runtime panic and recover on out of bound slice access
+		// than checking indices as we go along.
+		if recover() != nil {
+			ret = -2
+		}
+	}()
+
+	var si, di int
+	for {
+		// Literals and match lengths (token).
+		b := int(src[si])
+		si++
+
+		// Literals.
+		if lLen := b >> 4; lLen > 0 {
+			if lLen == 0xF {
+				for src[si] == 0xFF {
+					lLen += 0xFF
+					si++
+				}
+				lLen += int(src[si])
+				si++
+			}
+			i := si
+			si += lLen
+			di += copy(dst[di:di+si-i], src[i:si])
+
+			if si >= len(src) {
+				return di
+			}
+		}
+
+		si++
+		_ = src[si] // Bound check elimination.
+		offset := int(src[si-1]) | int(src[si])<<8
+		si++
+
+		// Match.
+		mLen := b & 0xF
+		if mLen == 0xF {
+			for src[si] == 0xFF {
+				mLen += 0xFF
+				si++
+			}
+			mLen += int(src[si])
+			si++
+		}
+		mLen += minMatch
+
+		// Copy the match.
+		i := di - offset
+		if offset > 0 && mLen >= offset {
+			// Efficiently copy the match dst[di-offset:di] into the dst slice.
+			bytesToCopy := offset * (mLen / offset)
+			expanded := dst[i:]
+			for n := offset; n <= bytesToCopy+offset; n *= 2 {
+				copy(expanded[n:], expanded[:n])
+			}
+			di += bytesToCopy
+			mLen -= bytesToCopy
+		}
+		di += copy(dst[di:di+mLen], dst[i:i+mLen])
+	}
+
+	return di
+}
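
The pure-Go fallback above expands an overlapping match by repeatedly doubling the already-written window before one final short copy. A standalone sketch of that doubling trick with a worked case (the names are made up; the arithmetic mirrors the code above):

package main

import "fmt"

// expandMatch copies mLen bytes from dst[di-offset:] to dst[di:].
func expandMatch(dst []byte, di, offset, mLen int) int {
	i := di - offset
	if offset > 0 && mLen >= offset {
		// Copy whole multiples of the offset by doubling the window; copy()
		// clamps at len(dst), and anything written past the match end gets
		// overwritten by later sequences during a real decode.
		bytesToCopy := offset * (mLen / offset)
		expanded := dst[i:]
		for n := offset; n <= bytesToCopy+offset; n *= 2 {
			copy(expanded[n:], expanded[:n])
		}
		di += bytesToCopy
		mLen -= bytesToCopy
	}
	// The remainder (and the non-overlapping case) is a single short copy.
	di += copy(dst[di:di+mLen], dst[i:i+mLen])
	return di
}

func main() {
	dst := make([]byte, 16)
	copy(dst, "ab")                // two bytes already decoded
	n := expandMatch(dst, 2, 2, 9) // match: offset 2, length 9
	fmt.Println(string(dst[:n]))   // abababababa
}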

+ 137 - 0
decode_test.go

@@ -0,0 +1,137 @@
+package lz4
+
+import (
+	"bytes"
+	"encoding/base64"
+	"strings"
+	"testing"
+)
+
+func unbase64(in string) []byte {
+	p, err := base64.StdEncoding.DecodeString(in)
+	if err != nil {
+		panic(err)
+	}
+	return p
+}
+
+func TestBlockDecode(t *testing.T) {
+	appendLen := func(p []byte, size int) []byte {
+		for size > 0xFF {
+			p = append(p, 0xFF)
+			size -= 0xFF
+		}
+
+		p = append(p, byte(size))
+		return p
+	}
+
+	emitSeq := func(lit string, offset uint16, matchLen int) []byte {
+		var b byte
+		litLen := len(lit)
+		if litLen < 15 {
+			b = byte(litLen << 4)
+			litLen = -1
+		} else {
+			b = 0xF0
+			litLen -= 15
+		}
+
+		if matchLen < 4 || offset == 0 {
+			out := []byte{b}
+			if litLen >= 0 {
+				out = appendLen(out, litLen)
+			}
+			return append(out, lit...)
+		}
+
+		matchLen -= 4
+		if matchLen < 15 {
+			b |= byte(matchLen)
+			matchLen = -1
+		} else {
+			b |= 0x0F
+			matchLen -= 15
+		}
+
+		out := []byte{b}
+		if litLen >= 0 {
+			out = appendLen(out, litLen)
+		}
+
+		if len(lit) > 0 {
+			out = append(out, lit...)
+		}
+
+		out = append(out, byte(offset), byte(offset>>8))
+
+		if matchLen >= 0 {
+			out = appendLen(out, matchLen)
+		}
+
+		return out
+	}
+	concat := func(in ...[]byte) []byte {
+		var p []byte
+		for _, b := range in {
+			p = append(p, b...)
+		}
+		return p
+	}
+
+	tests := []struct {
+		name string
+		src  []byte
+		exp  []byte
+	}{
+		{
+			"literal_only_short",
+			emitSeq("hello", 0, 0),
+			[]byte("hello"),
+		},
+		{
+			"literal_only_long",
+			emitSeq(strings.Repeat("A", 15+255+255+1), 0, 0),
+			bytes.Repeat([]byte("A"), 15+255+255+1),
+		},
+		{
+			"literal_only_long_1",
+			emitSeq(strings.Repeat("A", 15), 0, 0),
+			bytes.Repeat([]byte("A"), 15),
+		},
+		{
+			"repeat_match_len",
+			emitSeq("a", 1, 4),
+			[]byte("aaaaa"),
+		},
+		{
+			"repeat_match_len_2_seq",
+			concat(emitSeq("a", 1, 4), emitSeq("B", 1, 4)),
+			[]byte("aaaaaBBBBB"),
+		},
+		{
+			"long_match",
+			emitSeq("A", 1, 16),
+			bytes.Repeat([]byte("A"), 17),
+		},
+		{
+			"repeat_match_log_len_2_seq",
+			concat(emitSeq("a", 1, 15), emitSeq("B", 1, 15), emitSeq("end", 0, 0)),
+			[]byte(strings.Repeat("a", 16) + strings.Repeat("B", 16) + "end"),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			buf := make([]byte, len(test.exp))
+			n := decodeBlock(buf, test.src)
+			if n <= 0 {
+				t.Log(-n)
+			}
+
+			if !bytes.Equal(buf, test.exp) {
+				t.Fatalf("expected %q got %q", test.exp, buf)
+			}
+		})
+	}
+}
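
For orientation, emitSeq("a", 1, 4) spells out as the bytes 0x10 'a' 0x01 0x00: token 0x10 (one literal, match length minMatch+0), the literal itself, then the little-endian offset 1, which decodes to "aaaaa". A hand-built variant that also ends with a literal-only sequence, as the block format expects (a sketch, not part of the commit; the test name is invented):

package lz4

import (
	"bytes"
	"testing"
)

func TestDecodeHandBuiltSequence(t *testing.T) {
	src := []byte{
		0x10, 'a', 0x01, 0x00, // 1 literal, 4-byte match at offset 1 -> "aaaaa"
		0x10, '!', // final literal-only sequence -> "!"
	}
	buf := make([]byte, 6)
	if n := decodeBlock(buf, src); n != 6 {
		t.Fatalf("expected 6 bytes, got %d", n)
	}
	if !bytes.Equal(buf, []byte("aaaaa!")) {
		t.Fatalf("got %q", buf)
	}
}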