7 năm trước cách đây · 7f42fed963
--- a/block.go
+++ b/block.go
@@ -30,75 +30,17 @@ func CompressBlockBound(n int) int {
 
				 // The destination buffer must be sized appropriately.
			
 
				 //
			
 
				 // An error is returned if the source data is invalid or the destination buffer is too small.
			
 
				-func UncompressBlock(src, dst []byte) (si int, err error) {
			
 
				-	defer func() {
			
 
				-		// It is now faster to let the runtime panic and recover on out of bound slice access
			
 
				-		// than checking indices as we go along.
			
 
				-		if recover() != nil {
			
 
				-			err = ErrInvalidSourceShortBuffer
			
 
				-		}
			
 
				-	}()
			
 
				+func UncompressBlock(src, dst []byte) (di int, err error) {
			
 
				 	sn := len(src)
			
 
				 	if sn == 0 {
			
 
				 		return 0, nil
			
 
				 	}
			
 
				-	var di int
			
 
				-
			
 
				-	for {
			
 
				-		// Literals and match lengths (token).
			
 
				-		b := int(src[si])
			
 
				-		si++
			
 
				-
			
 
				-		// Literals.
			
 
				-		if lLen := b >> 4; lLen > 0 {
			
 
				-			if lLen == 0xF {
			
 
				-				for src[si] == 0xFF {
			
 
				-					lLen += 0xFF
			
 
				-					si++
			
 
				-				}
			
 
				-				lLen += int(src[si])
			
 
				-				si++
			
 
				-			}
			
 
				-			i := si
			
 
				-			si += lLen
			
 
				-			di += copy(dst[di:di+si-i], src[i:si])
			
 
				 
			
 
				-			if si >= sn {
			
 
				-				return di, nil
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		si++
			
 
				-		_ = src[si] // Bound check elimination.
			
 
				-		offset := int(src[si-1]) | int(src[si])<<8
			
 
				-		si++
			
 
				-
			
 
				-		// Match.
			
 
				-		mLen := b & 0xF
			
 
				-		if mLen == 0xF {
			
 
				-			for src[si] == 0xFF {
			
 
				-				mLen += 0xFF
			
 
				-				si++
			
 
				-			}
			
 
				-			mLen += int(src[si])
			
 
				-			si++
			
 
				-		}
			
 
				-		mLen += minMatch
			
 
				-
			
 
				-		// Copy the match.
			
 
				-		i := di - offset
			
 
				-		if offset > 0 && mLen >= offset {
			
 
				-			// Efficiently copy the match dst[di-offset:di] into the dst slice.
			
 
				-			bytesToCopy := offset * (mLen / offset)
			
 
				-			expanded := dst[i:]
			
 
				-			for n := offset; n <= bytesToCopy+offset; n *= 2 {
			
 
				-				copy(expanded[n:], expanded[:n])
			
 
				-			}
			
 
				-			di += bytesToCopy
			
 
				-			mLen -= bytesToCopy
			
 
				-		}
			
 
				-		di += copy(dst[di:di+mLen], dst[i:i+mLen])
			
 
				+	di = decodeBlock(dst, src)
			
 
				+	if di < 0 {
			
 
				+		return 0, ErrInvalidSourceShortBuffer
			
 
				 	}
			
 
				+	return di, nil
			
 
				 }
			
 
				 
			
 
				 // CompressBlock compresses the source buffer into the destination one.
			
--- a/block_test.go
+++ b/block_test.go
@@ -3,9 +3,9 @@
 
				 package lz4_test
			
 
				 
			
 
				 import (
			
 
				+	"bytes"
			
 
				 	"fmt"
			
 
				 	"io/ioutil"
			
 
				-	"reflect"
			
 
				 	"testing"
			
 
				 
			
 
				 	"github.com/pierrec/lz4"
			
@@ -26,12 +26,13 @@ var rawFiles = []testcase{
 
				 	{"testdata/pi.txt", true, nil},
			
 
				 	{"testdata/random.data", false, nil},
			
 
				 	{"testdata/repeat.txt", true, nil},
			
 
				+	{"testdata/pg1661.txt", true, nil},
			
 
				 }
			
 
				 
			
 
				 func TestCompressUncompressBlock(t *testing.T) {
			
 
				 	type compressor func(s, d []byte) (int, error)
			
 
				 
			
 
				-	run := func(tc testcase, compress compressor) int {
			
 
				+	run := func(t *testing.T, tc testcase, compress compressor) int {
			
 
				 		t.Helper()
			
 
				 		src := tc.src
			
 
				 
			
@@ -59,10 +60,25 @@ func TestCompressUncompressBlock(t *testing.T) {
 
				 		n, err = lz4.UncompressBlock(zbuf, buf)
			
 
				 		if err != nil {
			
 
				 			t.Fatal(err)
			
 
				+		} else if n < 0 || n > len(buf) {
			
 
				+			t.Fatalf("returned written bytes > len(buf): n=%d available=%d", n, len(buf))
			
 
				+		} else if n != len(src) {
			
 
				+			t.Errorf("expected to decompress into %d bytes got %d", len(src), n)
			
 
				 		}
			
 
				+
			
 
				 		buf = buf[:n]
			
 
				-		if !reflect.DeepEqual(src, buf) {
			
 
				-			t.Error("uncompressed compressed data not matching initial input")
			
 
				+		if !bytes.Equal(src, buf) {
			
 
				+			var c int
			
 
				+			for i, b := range buf {
			
 
				+				if c > 10 {
			
 
				+					break
			
 
				+				}
			
 
				+				if src[i] != b {
			
 
				+					t.Errorf("%d: exp(%x) != got(%x)", i, src[i], buf[i])
			
 
				+					c++
			
 
				+				}
			
 
				+			}
			
 
				+			t.Fatal("uncompressed compressed data not matching initial input")
			
 
				 			return 0
			
 
				 		}
			
 
				 
			
@@ -80,20 +96,22 @@ func TestCompressUncompressBlock(t *testing.T) {
 
				 		t.Run("", func(t *testing.T) {
			
 
				 			tc := tc
			
 
				 			t.Run(tc.file, func(t *testing.T) {
			
 
				-				t.Parallel()
			
 
				-				n = run(tc, func(src, dst []byte) (int, error) {
			
 
				+				// t.Parallel()
			
 
				+				n = run(t, tc, func(src, dst []byte) (int, error) {
			
 
				 					var ht [1 << 16]int
			
 
				 					return lz4.CompressBlock(src, dst, ht[:])
			
 
				 				})
			
 
				 			})
			
 
				 			t.Run(fmt.Sprintf("%s HC", tc.file), func(t *testing.T) {
			
 
				-				t.Parallel()
			
 
				-				nhc = run(tc, func(src, dst []byte) (int, error) {
			
 
				+				// t.Parallel()
			
 
				+				nhc = run(t, tc, func(src, dst []byte) (int, error) {
			
 
				 					return lz4.CompressBlockHC(src, dst, -1)
			
 
				 				})
			
 
				 			})
			
 
				 		})
			
 
				-		fmt.Printf("%-40s: %8d / %8d / %8d\n", tc.file, n, nhc, len(src))
			
 
				+		if !t.Failed() {
			
 
				+			t.Logf("%-40s: %8d / %8d / %8d\n", tc.file, n, nhc, len(src))
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/decode_amd64.go
+++ b/decode_amd64.go
@@ -0,0 +1,8 @@
 
				+// +build !appengine
			
 
				+// +build gc
			
 
				+// +build !noasm
			
 
				+
			
 
				+package lz4
			
 
				+
			
 
				+//go:noescape
			
 
				+func decodeBlock(dst, src []byte) int
			
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -0,0 +1,375 @@
 
				+// +build !appengine
			
 
				+// +build gc
			
 
				+// +build !noasm
			
 
				+
			
 
				+#include "textflag.h"
			
 
				+
			
 
				+// AX scratch
			
 
				+// BX scratch
			
 
				+// CX scratch
			
 
				+// DX token
			
 
				+//
			
 
				+// DI &dst[di] (current write pointer)
			
 
				+// SI &src[si] (current read pointer)
			
 
				+// R8 &dst + len(dst)
			
 
				+// R9 &src + len(src)
			
 
				+// R11 &dst (base, used to compute the returned length)
			
 
				+// R12 short output end (&dst + len(dst) - 32)
			
 
				+// R13 short input end (&src + len(src) - 16)
			
 
				+// func decodeBlock(dst, src []byte) int
			
 
				+// uses 49 bytes (offsets 0..48) of the 64-byte frame for memmove arguments and spills
			
 
				+TEXT ·decodeBlock(SB), NOSPLIT, $64-56
			
 
				+	MOVQ dst_base+0(FP), DI
			
 
				+	MOVQ DI, R11
			
 
				+	MOVQ dst_len+8(FP), R8
			
 
				+	ADDQ DI, R8
			
 
				+
			
 
				+	MOVQ src_base+24(FP), SI
			
 
				+	MOVQ src_len+32(FP), R9
			
 
				+	ADDQ SI, R9
			
 
				+
			
 
				+	// shortcut ends
			
 
				+	// short output end
			
 
				+	MOVQ R8, R12
			
 
				+	SUBQ $32, R12
			
 
				+	// short input end
			
 
				+	MOVQ R9, R13
			
 
				+	SUBQ $16, R13
			
 
				+
			
 
				+loop:
			
 
				+	// for si < len(src)
			
 
				+	CMPQ SI, R9
			
 
				+	JGE end
			
 
				+
			
 
				+	// token := uint32(src[si])
			
 
				+	MOVBQZX (SI), DX
			
 
				+	INCQ SI
			
 
				+
			
 
				+	// lit_len = token >> 4
			
 
				+	// if lit_len > 0
			
 
				+	// CX = lit_len
			
 
				+	MOVQ DX, CX
			
 
				+	SHRQ $4, CX
			
 
				+
			
 
				+	// if lit_len != 0xF
			
 
				+	CMPQ CX, $0xF
			
 
				+	JEQ lit_len_loop_pre
			
 
				+	CMPQ DI, R12
			
 
				+	JGE lit_len_loop_pre
			
 
				+	CMPQ SI, R13
			
 
				+	JGE lit_len_loop_pre
			
 
				+
			
 
				+	// copy shortcut
			
 
				+
			
 
				+	// A two-stage shortcut for the most common case:
			
 
				+	// 1) If the literal length is 0..14, and there is enough space,
			
 
				+	// enter the shortcut and copy 16 bytes on behalf of the literals
			
 
				+	// (in the fast mode, only 8 bytes can be safely copied this way).
			
 
				+	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
			
 
				+	// manner; but we ensure that there's enough space in the output for
			
 
				+	// those 18 bytes earlier, upon entering the shortcut (in other words,
			
 
				+	// there is a combined check for both stages).
			
 
				+
			
 
				+	// copy literal
			
 
				+	MOVOU (SI), X0
			
 
				+	MOVOU X0, (DI)
			
 
				+	ADDQ CX, DI
			
 
				+	ADDQ CX, SI
			
 
				+
			
 
				+	MOVQ DX, CX
			
 
				+	ANDQ $0xF, CX
			
 
				+
			
 
				+	// The second stage: prepare for match copying, decode full info.
			
 
				+	// If it doesn't work out, the info won't be wasted.
			
 
				+	// offset := uint16(data[:2])
			
 
				+	MOVWQZX (SI), DX
			
 
				+	ADDQ $2, SI
			
 
				+
			
 
				+	MOVQ DI, AX
			
 
				+	SUBQ DX, AX
			
 
				+	CMPQ AX, DI
			
 
				+	JGT err_short_buf
			
 
				+
			
 
				+	// if we can't do the second stage then jump straight to read the
			
 
				+	// match length, we already have the offset.
			
 
				+	CMPQ CX, $0xF
			
 
				+	JEQ match_len_loop_pre
			
 
				+	CMPQ DX, $8
			
 
				+	JLT match_len_loop_pre
			
 
				+	CMPQ AX, R11
			
 
				+	JLT err_short_buf
			
 
				+
			
 
				+	// memcpy(op + 0, match + 0, 8);
			
 
				+	MOVQ (AX), BX
			
 
				+	MOVQ BX, (DI)
			
 
				+	// memcpy(op + 8, match + 8, 8);
			
 
				+	MOVQ 8(AX), BX
			
 
				+	MOVQ BX, 8(DI)
			
 
				+	// memcpy(op +16, match +16, 2);
			
 
				+	MOVW 16(AX), BX
			
 
				+	MOVW BX, 16(DI)
			
 
				+
			
 
				+	ADDQ $4, DI // minmatch
			
 
				+	ADDQ CX, DI
			
 
				+
			
 
				+	// shortcut complete, load next token
			
 
				+	JMP loop
			
 
				+
			
 
				+lit_len_loop_pre:
			
 
				+	// if lit_len > 0
			
 
				+	CMPQ CX, $0
			
 
				+	JEQ offset
			
 
				+	CMPQ CX, $0xF
			
 
				+	JNE copy_literal
			
 
				+
			
 
				+lit_len_loop:
			
 
				+	// for src[si] == 0xFF
			
 
				+	CMPB (SI), $0xFF
			
 
				+	JNE lit_len_finalise
			
 
				+
			
 
				+	// bounds check src[si+1]
			
 
				+	MOVQ SI, AX
			
 
				+	ADDQ $1, AX
			
 
				+	CMPQ AX, R9
			
 
				+	JGT err_short_buf
			
 
				+
			
 
				+	// lit_len += 0xFF
			
 
				+	ADDQ $0xFF, CX
			
 
				+	INCQ SI
			
 
				+	JMP lit_len_loop
			
 
				+
			
 
				+lit_len_finalise:
			
 
				+	// lit_len += int(src[si])
			
 
				+	// si++
			
 
				+	MOVBQZX (SI), AX
			
 
				+	ADDQ AX, CX
			
 
				+	INCQ SI
			
 
				+
			
 
				+copy_literal:
			
 
				+	// bounds check src and dst
			
 
				+	MOVQ SI, AX
			
 
				+	ADDQ CX, AX
			
 
				+	CMPQ AX, R9
			
 
				+	JGT err_short_buf
			
 
				+
			
 
				+	MOVQ DI, AX
			
 
				+	ADDQ CX, AX
			
 
				+	CMPQ AX, R8
			
 
				+	JGT err_short_buf
			
 
				+
			
 
				+	// what's a good cutoff to call memmove?
			
 
				+	CMPQ CX, $16
			
 
				+	JGT memmove_lit
			
 
				+
			
 
				+	// if len(dst[di:]) < 16
			
 
				+	MOVQ R8, AX
			
 
				+	SUBQ DI, AX
			
 
				+	CMPQ AX, $16
			
 
				+	JLT memmove_lit
			
 
				+
			
 
				+	// if len(src[si:]) < 16
			
 
				+	MOVQ R9, AX
			
 
				+	SUBQ SI, AX
			
 
				+	CMPQ AX, $16
			
 
				+	JLT memmove_lit
			
 
				+
			
 
				+	MOVOU (SI), X0
			
 
				+	MOVOU X0, (DI)
			
 
				+
			
 
				+	JMP finish_lit_copy
			
 
				+
			
 
				+memmove_lit:
			
 
				+	// memmove(to, from, len)
			
 
				+	MOVQ DI, 0(SP)
			
 
				+	MOVQ SI, 8(SP)
			
 
				+	MOVQ CX, 16(SP)
			
 
				+	// spill
			
 
				+	MOVQ DI, 24(SP)
			
 
				+	MOVQ SI, 32(SP)
			
 
				+	MOVQ CX, 40(SP) // need len to inc SI, DI after
			
 
				+	MOVB DX, 48(SP)
			
 
				+	CALL runtime·memmove(SB)
			
 
				+
			
 
				+	// restore registers
			
 
				+	MOVQ 24(SP), DI
			
 
				+	MOVQ 32(SP), SI
			
 
				+	MOVQ 40(SP), CX
			
 
				+	MOVB 48(SP), DX
			
 
				+
			
 
				+	// recalc initial values
			
 
				+	MOVQ dst_base+0(FP), R8
			
 
				+	MOVQ R8, R11
			
 
				+	ADDQ dst_len+8(FP), R8
			
 
				+	MOVQ src_base+24(FP), R9
			
 
				+	ADDQ src_len+32(FP), R9
			
 
				+	MOVQ R8, R12
			
 
				+	SUBQ $32, R12
			
 
				+	MOVQ R9, R13
			
 
				+	SUBQ $16, R13
			
 
				+
			
 
				+finish_lit_copy:
			
 
				+	ADDQ CX, SI
			
 
				+	ADDQ CX, DI
			
 
				+
			
 
				+	CMPQ SI, R9
			
 
				+	JGE end
			
 
				+
			
 
				+offset:
			
 
				+	// CX := mLen
			
 
				+	// free up DX to use for offset
			
 
				+	MOVQ DX, CX
			
 
				+
			
 
				+	MOVQ SI, AX
			
 
				+	ADDQ $2, AX
			
 
				+	CMPQ AX, R9
			
 
				+	JGT err_short_buf
			
 
				+
			
 
				+	// offset
			
 
				+	// DX := int(src[si]) | int(src[si+1])<<8
			
 
				+	MOVWQZX (SI), DX
			
 
				+	ADDQ $2, SI
			
 
				+
			
 
				+	// 0 offset is invalid
			
 
				+	CMPQ DX, $0
			
 
				+	JEQ err_corrupt
			
 
				+
			
 
				+	ANDB $0xF, CX
			
 
				+
			
 
				+match_len_loop_pre:
			
 
				+	// if mlen != 0xF
			
 
				+	CMPB CX, $0xF
			
 
				+	JNE copy_match
			
 
				+
			
 
				+match_len_loop:
			
 
				+	// for src[si] == 0xFF
			
 
				+	// match_len += 0xFF
			
 
				+	CMPB (SI), $0xFF
			
 
				+	JNE match_len_finalise
			
 
				+
			
 
				+	// bounds check src[si+1]
			
 
				+	MOVQ SI, AX
			
 
				+	ADDQ $1, AX
			
 
				+	CMPQ AX, R9
			
 
				+	JGT err_short_buf
			
 
				+
			
 
				+	ADDQ $0xFF, CX
			
 
				+	INCQ SI
			
 
				+	JMP match_len_loop
			
 
				+
			
 
				+match_len_finalise:
			
 
				+	// match_len += int(src[si])
			
 
				+	// si++
			
 
				+	MOVBQZX (SI), AX
			
 
				+	ADDQ AX, CX
			
 
				+	INCQ SI
			
 
				+
			
 
				+copy_match:
			
 
				+	// mLen += minMatch
			
 
				+	ADDQ $4, CX
			
 
				+
			
 
				+	// check we have match_len bytes left in dst
			
 
				+	// di+match_len <= len(dst)
			
 
				+	MOVQ DI, AX
			
 
				+	ADDQ CX, AX
			
 
				+	CMPQ AX, R8
			
 
				+	JGT err_short_buf
			
 
				+
			
 
				+	// DX = offset
			
 
				+	// CX = match_len
			
 
				+	// BX = &dst + (di - offset)
			
 
				+	MOVQ DI, BX
			
 
				+	SUBQ DX, BX
			
 
				+
			
 
				+	// check BX is within dst
			
 
				+	// if BX < &dst
			
 
				+	CMPQ BX, R11
			
 
				+	JLT err_short_buf
			
 
				+
			
 
				+	// if (di - offset) + match_len < di (the match source ends before the write position)
			
 
				+	MOVQ BX, AX
			
 
				+	ADDQ CX, AX
			
 
				+	CMPQ DI, AX
			
 
				+	JGT copy_interior_match
			
 
				+
			
 
				+	// AX := len(dst[:di])
			
 
				+	// MOVQ DI, AX
			
 
				+	// SUBQ R11, AX
			
 
				+
			
 
				+	// copy 16 bytes at a time
			
 
				+	// if di-offset < 16 copy 16-(di-offset) bytes to di
			
 
				+	// then do the remaining
			
 
				+
			
 
				+copy_match_loop:
			
 
				+	// for match_len > 0
			
 
				+	// dst[di] = dst[i]
			
 
				+	// di++
			
 
				+	// i++
			
 
				+	MOVB (BX), AX
			
 
				+	MOVB AX, (DI)
			
 
				+	INCQ DI
			
 
				+	INCQ BX
			
 
				+	DECQ CX
			
 
				+
			
 
				+	CMPQ CX, $0
			
 
				+	JGT copy_match_loop
			
 
				+
			
 
				+	JMP loop
			
 
				+
			
 
				+copy_interior_match:
			
 
				+	CMPQ CX, $16
			
 
				+	JGT memmove_match
			
 
				+
			
 
				+	// if len(dst[di:]) < 16
			
 
				+	MOVQ R8, AX
			
 
				+	SUBQ DI, AX
			
 
				+	CMPQ AX, $16
			
 
				+	JLT memmove_match
			
 
				+
			
 
				+	MOVOU (BX), X0
			
 
				+	MOVOU X0, (DI)
			
 
				+
			
 
				+	ADDQ CX, DI
			
 
				+	JMP loop
			
 
				+
			
 
				+memmove_match:
			
 
				+	// memmove(to, from, len)
			
 
				+	MOVQ DI, 0(SP)
			
 
				+	MOVQ BX, 8(SP)
			
 
				+	MOVQ CX, 16(SP)
			
 
				+	// spill
			
 
				+	MOVQ DI, 24(SP)
			
 
				+	MOVQ SI, 32(SP)
			
 
				+	MOVQ CX, 40(SP) // need len to inc DI after
			
 
				+	CALL runtime·memmove(SB)
			
 
				+
			
 
				+	// restore registers
			
 
				+	MOVQ 24(SP), DI
			
 
				+	MOVQ 32(SP), SI
			
 
				+	MOVQ 40(SP), CX
			
 
				+
			
 
				+	// recalc initial values
			
 
				+	MOVQ dst_base+0(FP), R8
			
 
				+	MOVQ R8, R11 // TODO: make these sensible numbers
			
 
				+	ADDQ dst_len+8(FP), R8
			
 
				+	MOVQ src_base+24(FP), R9
			
 
				+	ADDQ src_len+32(FP), R9
			
 
				+	MOVQ R8, R12
			
 
				+	SUBQ $32, R12
			
 
				+	MOVQ R9, R13
			
 
				+	SUBQ $16, R13
			
 
				+
			
 
				+	ADDQ CX, DI
			
 
				+	JMP loop
			
 
				+
			
 
				+err_corrupt:
			
 
				+	MOVQ $-1, ret+48(FP)
			
 
				+	RET
			
 
				+
			
 
				+err_short_buf:
			
 
				+	MOVQ $-2, ret+48(FP)
			
 
				+	RET
			
 
				+
			
 
				+end:
			
 
				+	SUBQ R11, DI
			
 
				+	MOVQ DI, ret+48(FP)
			
 
				+	RET
			
--- a/decode_other.go
+++ b/decode_other.go
@@ -0,0 +1,72 @@
 
				+// +build !amd64 appengine !gc noasm
			
 
				+
			
 
				+package lz4
			
 
				+
			
 
				+func decodeBlock(dst, src []byte) (ret int) {
			
 
				+	defer func() {
			
 
				+		// It is now faster to let the runtime panic and recover on out of bound slice access
			
 
				+		// than checking indices as we go along.
			
 
				+		if recover() != nil {
			
 
				+			ret = -2
			
 
				+		}
			
 
				+	}()
			
 
				+
			
 
				+	var si, di int
			
 
				+	for {
			
 
				+		// Literals and match lengths (token).
			
 
				+		b := int(src[si])
			
 
				+		si++
			
 
				+
			
 
				+		// Literals.
			
 
				+		if lLen := b >> 4; lLen > 0 {
			
 
				+			if lLen == 0xF {
			
 
				+				for src[si] == 0xFF {
			
 
				+					lLen += 0xFF
			
 
				+					si++
			
 
				+				}
			
 
				+				lLen += int(src[si])
			
 
				+				si++
			
 
				+			}
			
 
				+			i := si
			
 
				+			si += lLen
			
 
				+			di += copy(dst[di:di+si-i], src[i:si])
			
 
				+
			
 
				+			if si >= len(src) {
			
 
				+				return di
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		si++
			
 
				+		_ = src[si] // Bound check elimination.
			
 
				+		offset := int(src[si-1]) | int(src[si])<<8
			
 
				+		si++
			
 
				+
			
 
				+		// Match.
			
 
				+		mLen := b & 0xF
			
 
				+		if mLen == 0xF {
			
 
				+			for src[si] == 0xFF {
			
 
				+				mLen += 0xFF
			
 
				+				si++
			
 
				+			}
			
 
				+			mLen += int(src[si])
			
 
				+			si++
			
 
				+		}
			
 
				+		mLen += minMatch
			
 
				+
			
 
				+		// Copy the match.
			
 
				+		i := di - offset
			
 
				+		if offset > 0 && mLen >= offset {
			
 
				+			// Efficiently copy the match dst[di-offset:di] into the dst slice.
			
 
				+			bytesToCopy := offset * (mLen / offset)
			
 
				+			expanded := dst[i:]
			
 
				+			for n := offset; n <= bytesToCopy+offset; n *= 2 {
			
 
				+				copy(expanded[n:], expanded[:n])
			
 
				+			}
			
 
				+			di += bytesToCopy
			
 
				+			mLen -= bytesToCopy
			
 
				+		}
			
 
				+		di += copy(dst[di:di+mLen], dst[i:i+mLen])
			
 
				+	}
			
 
				+
			
 
				+	return di
			
 
				+}
			
--- a/decode_test.go
+++ b/decode_test.go
@@ -0,0 +1,137 @@
 
				+package lz4
			
 
				+
			
 
				+import (
			
 
				+	"bytes"
			
 
				+	"encoding/base64"
			
 
				+	"strings"
			
 
				+	"testing"
			
 
				+)
			
 
				+
			
 
				+func unbase64(in string) []byte {
			
 
				+	p, err := base64.StdEncoding.DecodeString(in)
			
 
				+	if err != nil {
			
 
				+		panic(err)
			
 
				+	}
			
 
				+	return p
			
 
				+}
			
 
				+
			
 
				+func TestBlockDecode(t *testing.T) {
			
 
				+	appendLen := func(p []byte, size int) []byte {
			
 
				+		for size > 0xFF {
			
 
				+			p = append(p, 0xFF)
			
 
				+			size -= 0xFF
			
 
				+		}
			
 
				+
			
 
				+		p = append(p, byte(size))
			
 
				+		return p
			
 
				+	}
			
 
				+
			
 
				+	emitSeq := func(lit string, offset uint16, matchLen int) []byte {
			
 
				+		var b byte
			
 
				+		litLen := len(lit)
			
 
				+		if litLen < 15 {
			
 
				+			b = byte(litLen << 4)
			
 
				+			litLen = -1
			
 
				+		} else {
			
 
				+			b = 0xF0
			
 
				+			litLen -= 15
			
 
				+		}
			
 
				+
			
 
				+		if matchLen < 4 || offset == 0 {
			
 
				+			out := []byte{b}
			
 
				+			if litLen >= 0 {
			
 
				+				out = appendLen(out, litLen)
			
 
				+			}
			
 
				+			return append(out, lit...)
			
 
				+		}
			
 
				+
			
 
				+		matchLen -= 4
			
 
				+		if matchLen < 15 {
			
 
				+			b |= byte(matchLen)
			
 
				+			matchLen = -1
			
 
				+		} else {
			
 
				+			b |= 0x0F
			
 
				+			matchLen -= 15
			
 
				+		}
			
 
				+
			
 
				+		out := []byte{b}
			
 
				+		if litLen >= 0 {
			
 
				+			out = appendLen(out, litLen)
			
 
				+		}
			
 
				+
			
 
				+		if len(lit) > 0 {
			
 
				+			out = append(out, lit...)
			
 
				+		}
			
 
				+
			
 
				+		out = append(out, byte(offset), byte(offset>>8))
			
 
				+
			
 
				+		if matchLen >= 0 {
			
 
				+			out = appendLen(out, matchLen)
			
 
				+		}
			
 
				+
			
 
				+		return out
			
 
				+	}
			
 
				+	concat := func(in ...[]byte) []byte {
			
 
				+		var p []byte
			
 
				+		for _, b := range in {
			
 
				+			p = append(p, b...)
			
 
				+		}
			
 
				+		return p
			
 
				+	}
			
 
				+
			
 
				+	tests := []struct {
			
 
				+		name string
			
 
				+		src  []byte
			
 
				+		exp  []byte
			
 
				+	}{
			
 
				+		{
			
 
				+			"literal_only_short",
			
 
				+			emitSeq("hello", 0, 0),
			
 
				+			[]byte("hello"),
			
 
				+		},
			
 
				+		{
			
 
				+			"literal_only_long",
			
 
				+			emitSeq(strings.Repeat("A", 15+255+255+1), 0, 0),
			
 
				+			bytes.Repeat([]byte("A"), 15+255+255+1),
			
 
				+		},
			
 
				+		{
			
 
				+			"literal_only_long_1",
			
 
				+			emitSeq(strings.Repeat("A", 15), 0, 0),
			
 
				+			bytes.Repeat([]byte("A"), 15),
			
 
				+		},
			
 
				+		{
			
 
				+			"repeat_match_len",
			
 
				+			emitSeq("a", 1, 4),
			
 
				+			[]byte("aaaaa"),
			
 
				+		},
			
 
				+		{
			
 
				+			"repeat_match_len_2_seq",
			
 
				+			concat(emitSeq("a", 1, 4), emitSeq("B", 1, 4)),
			
 
				+			[]byte("aaaaaBBBBB"),
			
 
				+		},
			
 
				+		{
			
 
				+			"long_match",
			
 
				+			emitSeq("A", 1, 16),
			
 
				+			bytes.Repeat([]byte("A"), 17),
			
 
				+		},
			
 
				+		{
			
 
				+			"repeat_match_log_len_2_seq",
			
 
				+			concat(emitSeq("a", 1, 15), emitSeq("B", 1, 15), emitSeq("end", 0, 0)),
			
 
				+			[]byte(strings.Repeat("a", 16) + strings.Repeat("B", 16) + "end"),
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, test := range tests {
			
 
				+		t.Run(test.name, func(t *testing.T) {
			
 
				+			buf := make([]byte, len(test.exp))
			
 
				+			n := decodeBlock(buf, test.src)
			
 
				+			if n <= 0 {
			
 
				+				t.Log(-n)
			
 
				+			}
			
 
				+
			
 
				+			if !bytes.Equal(buf, test.exp) {
			
 
				+				t.Fatalf("expected %q got %q", test.exp, buf)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}