
Write the encoder's encodeBlock in asm.

name              old speed      new speed      delta
WordsEncode1e1-8   665MB/s ± 0%   678MB/s ± 0%   +2.00%  (p=0.016 n=4+5)
WordsEncode1e2-8  85.0MB/s ± 0%  90.1MB/s ± 0%   +5.90%  (p=0.016 n=4+5)
WordsEncode1e3-8   234MB/s ± 2%   295MB/s ± 0%  +26.20%  (p=0.008 n=5+5)
WordsEncode1e4-8   233MB/s ± 0%   276MB/s ± 0%  +18.31%  (p=0.008 n=5+5)
WordsEncode1e5-8   214MB/s ± 1%   248MB/s ± 0%  +15.52%  (p=0.008 n=5+5)
WordsEncode1e6-8   258MB/s ± 0%   295MB/s ± 0%  +14.62%  (p=0.008 n=5+5)
RandomEncode-8    13.1GB/s ± 1%  14.4GB/s ± 1%  +10.27%  (p=0.008 n=5+5)
_ZFlat0-8          630MB/s ± 0%   749MB/s ± 0%  +18.96%  (p=0.016 n=4+5)
_ZFlat1-8          326MB/s ± 0%   405MB/s ± 0%  +24.41%  (p=0.029 n=4+4)
_ZFlat2-8         13.9GB/s ± 1%  16.2GB/s ± 1%  +16.04%  (p=0.008 n=5+5)
_ZFlat3-8          177MB/s ± 1%   202MB/s ± 1%  +14.51%  (p=0.008 n=5+5)
_ZFlat4-8         6.19GB/s ± 1%  7.59GB/s ± 1%  +22.64%  (p=0.008 n=5+5)
_ZFlat5-8          615MB/s ± 0%   728MB/s ± 1%  +18.45%  (p=0.008 n=5+5)
_ZFlat6-8          231MB/s ± 0%   266MB/s ± 1%  +15.00%  (p=0.008 n=5+5)
_ZFlat7-8          215MB/s ± 1%   248MB/s ± 0%  +15.30%  (p=0.008 n=5+5)
_ZFlat8-8          246MB/s ± 0%   282MB/s ± 0%  +14.73%  (p=0.016 n=5+4)
_ZFlat9-8          202MB/s ± 0%   231MB/s ± 0%  +14.13%  (p=0.008 n=5+5)
_ZFlat10-8         803MB/s ± 0%   970MB/s ± 0%  +20.90%  (p=0.008 n=5+5)
_ZFlat11-8         351MB/s ± 0%   402MB/s ± 0%  +14.29%  (p=0.008 n=5+5)
Nigel Tao, 9 years ago
parent
commit
6880122951
4 changed files with 474 additions and 143 deletions
  1. encode.go (+0, -143)
  2. encode_amd64.go (+5, -0)
  3. encode_amd64.s (+326, -0)
  4. encode_other.go (+143, -0)
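The MB/s and GB/s figures above are Go benchmark results in the usual benchstat old/new comparison layout. For context, here is a minimal, illustrative sketch of how such a throughput benchmark is typically written; the benchmark name and input are placeholders, not the repository's actual benchmark code:

package snappy_test

import (
	"bytes"
	"testing"

	"github.com/golang/snappy"
)

// BenchmarkEncodeSketch is illustrative only. b.SetBytes is what makes the
// testing package report throughput (MB/s), as in the table above.
func BenchmarkEncodeSketch(b *testing.B) {
	src := bytes.Repeat([]byte("some moderately compressible input "), 1000)
	dst := make([]byte, snappy.MaxEncodedLen(len(src)))
	b.SetBytes(int64(len(src)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		snappy.Encode(dst, src)
	}
}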

+ 0 - 143   encode.go

@@ -10,17 +10,6 @@ import (
 	"io"
 )
 
-func load32(b []byte, i int) uint32 {
-	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
-
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
@@ -82,138 +71,6 @@ const inputMargin = 16 - 1
 // TestSameEncodingAsCppShortCopies.
 const minNonLiteralBlockSize = 1 + 1 + inputMargin
 
-func hash(u, shift uint32) uint32 {
-	return (u * 0x1e35a7bd) >> shift
-}
-
-// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
-// assumes that the varint-encoded length of the decompressed bytes has already
-// been written.
-//
-// It also assumes that:
-//	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const (
-		maxTableSize = 1 << 14
-		// tableMask is redundant, but helps the compiler eliminate bounds
-		// checks.
-		tableMask = maxTableSize - 1
-	)
-	shift := uint32(32 - 8)
-	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
-		shift--
-	}
-	// In Go, all array elements are zero-initialized, so there is no advantage
-	// to a smaller tableSize per se. However, it matches the C++ algorithm,
-	// and in the asm versions of this code, we can get away with zeroing only
-	// the first tableSize elements.
-	var table [maxTableSize]uint16
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := len(src) - inputMargin
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := 0
-
-	// The encoded form must start with a literal, as there are no previous
-	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
-	nextHash := hash(load32(src, s), shift)
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := 32
-
-		nextS := s
-		candidate := 0
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidate = int(table[nextHash&tableMask])
-			table[nextHash&tableMask] = uint16(s)
-			nextHash = hash(load32(src, nextS), shift)
-			if load32(src, s) == load32(src, candidate) {
-				break
-			}
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		d += emitLiteral(dst[d:], src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-			base := s
-			// Extend the 4-byte match as long as possible.
-			s = extendMatch(src, candidate+4, s+4)
-			d += emitCopy(dst[d:], base-candidate, s-base)
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load64(src, s-1)
-			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash&tableMask] = uint16(s - 1)
-			currHash := hash(uint32(x>>8), shift)
-			candidate = int(table[currHash&tableMask])
-			table[currHash&tableMask] = uint16(s)
-			if uint32(x>>8) != load32(src, candidate) {
-				nextHash = hash(uint32(x>>16), shift)
-				s++
-				break
-			}
-		}
-	}
-
-emitRemainder:
-	if nextEmit < len(src) {
-		d += emitLiteral(dst[d:], src[nextEmit:])
-	}
-	return d
-}
-
 // MaxEncodedLen returns the maximum length of a snappy block, given its
 // uncompressed length.
 //
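As the comments on the removed load32 and load64 note (both functions move to encode_other.go below), the full slice expression b[i : i+4 : len(b)] re-slices b to exactly the bytes about to be read, which lets the compiler prove the subsequent indexing is in range and drop the per-element bounds checks. A toy sketch of the same idiom, outside this package and purely for illustration:

package demo

// sumPair re-slices before indexing, in the same way load32/load64 do, so the
// compiler can eliminate the bounds checks on b[0] and b[1].
func sumPair(b []byte, i int) int {
	b = b[i : i+2 : len(b)] // one bounds check here, none on the next line
	return int(b[0]) + int(b[1])
}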

+ 5 - 0   encode_amd64.go

@@ -22,3 +22,8 @@ func emitCopy(dst []byte, offset, length int) int
 //
 //go:noescape
 func extendMatch(src []byte, i, j int) int
+
+// encodeBlock has the same semantics as in encode_other.go.
+//
+//go:noescape
+func encodeBlock(dst, src []byte) (d int)
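The //go:noescape directive on these declarations tells the compiler that the assembly implementations do not let the pointers in their arguments (here, the dst and src slices) escape, so callers' buffers need not be moved to the heap. A minimal, hypothetical example of the same declaration-plus-assembly pattern, with made-up file and function names:

// load8.go
package demo

// load8 returns the first 8 bytes of b as a little-endian uint64. It assumes
// len(b) >= 8; the body lives in load8_amd64.s.
//
//go:noescape
func load8(b []byte) uint64

// load8_amd64.s
TEXT ·load8(SB), 0, $0-32
	MOVQ b_base+0(FP), AX
	MOVQ (AX), AX
	MOVQ AX, ret+24(FP)
	RET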

+ 326 - 0   encode_amd64.s

@@ -210,3 +210,329 @@ end:
 	SUBQ CX, DI
 	MOVQ DI, ret+40(FP)
 	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- AX	.	.
+//	- BX	.	.
+//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
+//	- DX	64	&src[0], tableSize
+//	- SI	72	&src[s]
+//	- DI	80	&dst[d]
+//	- R9	88	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	96	prevHash, currHash, nextHash, offset
+//	- R12	104	&src[base], skip
+//	- R13	.	&src[nextS]
+//	- R14	.	len(src), bytesBetweenHashLookups, x
+//	- R15	112	candidate
+//
+// The second column (56, 64, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
+TEXT ·encodeBlock(SB), 0, $32888-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVQ $24, CX
+	MOVQ $256, DX
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	CMPQ DX, $16384
+	JGE  varTable
+	CMPQ DX, R14
+	JGE  varTable
+	SUBQ $1, CX
+	SHLQ $1, DX
+	JMP  calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// sizeof(table) is 32768 bytes, which is 2048 16-byte writes.
+	MOVQ $2048, DX
+	LEAQ table-32768(SP), BX
+	PXOR X0, X0
+
+memclr:
+	MOVOU X0, 0(BX)
+	ADDQ  $16, BX
+	SUBQ  $1, DX
+	JNZ   memclr
+
+	// !!! DX = &src[0]
+	MOVQ SI, DX
+
+	// sLimit := len(src) - inputMargin
+	MOVQ R14, R9
+	SUBQ $15, R9
+
+	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVQ CX, 56(SP)
+	MOVQ DX, 64(SP)
+	MOVQ R9, 88(SP)
+
+	// nextEmit := 0
+	MOVQ DX, R10
+
+	// s := 1
+	ADDQ $1, SI
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVL  0(SI), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVQ $32, R12
+
+	// nextS := s
+	MOVQ SI, R13
+
+	// candidate := 0
+	MOVQ $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVQ R13, SI
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVQ R12, R14
+	SHRQ $5, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADDQ R14, R13
+
+	// skip += bytesBetweenHashLookups
+	ADDQ R14, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVQ R13, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JA   emitRemainder
+
+	// candidate = int(table[nextHash])
+	MOVWQZX table-32768(SP)(R11*2), R15
+
+	// table[nextHash] = uint16(s)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	MOVW AX, table-32768(SP)(R11*2)
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVL  0(R13), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVL 0(SI), AX
+	MOVL (DX)(R15*1), BX
+	CMPL AX, BX
+	JNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	MOVQ SI, AX
+	SUBQ R10, AX
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	MOVQ R15, 112(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+	MOVQ 112(SP), R15
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ 48(SP), DI
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVQ SI, R12
+
+	// !!! offset := base - candidate
+	MOVQ R12, R11
+	SUBQ R15, R11
+	SUBQ DX, R11
+
+	// s = extendMatch(src, candidate+4, s+4)
+	//
+	// Push args.
+	MOVQ DX, 0(SP)
+	MOVQ src_len+32(FP), R14
+	MOVQ R14, 8(SP)
+	MOVQ R14, 16(SP)         // Unnecessary, as the callee ignores it, but conservative.
+	ADDQ $4, R15
+	MOVQ R15, 24(SP)
+	ADDQ $4, SI
+	SUBQ DX, SI
+	MOVQ SI, 32(SP)
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// We don't need to unspill CX or R9 as we are just about to call another
+	// function.
+	MOVQ DI, 80(SP)
+	MOVQ R11, 96(SP)
+	MOVQ R12, 104(SP)
+	CALL ·extendMatch(SB)
+	MOVQ 64(SP), DX
+	MOVQ 80(SP), DI
+	MOVQ 96(SP), R11
+	MOVQ 104(SP), R12
+
+	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
+	// register holds &src[s], not s.
+	MOVQ 40(SP), SI
+	ADDQ DX, SI
+
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R11, 24(SP)
+	MOVQ SI, AX
+	SUBQ R12, AX
+	MOVQ AX, 32(SP)
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	CALL ·emitCopy(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+
+	// Finish the "d +=" part of "d += emitCopy(etc)".
+	ADDQ 40(SP), DI
+
+	// nextEmit = s
+	MOVQ SI, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVQ SI, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JAE  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVQ -1(SI), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	SUBQ $1, AX
+	MOVW AX, table-32768(SP)(R11*2)
+
+	// currHash := hash(uint32(x>>8), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// candidate = int(table[currHash])
+	MOVWQZX table-32768(SP)(R11*2), R15
+
+	// table[currHash] = uint16(s)
+	ADDQ $1, AX
+	MOVW AX, table-32768(SP)(R11*2)
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVL (DX)(R15*1), BX
+	CMPL R14, BX
+	JEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// s++
+	ADDQ $1, SI
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	JMP outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVQ src_len+32(FP), AX
+	ADDQ DX, AX
+	CMPQ R10, AX
+	JEQ  end
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	SUBQ R10, AX
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ DI, 80(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 80(SP), DI
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ 48(SP), DI
+
+end:
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, DI
+	MOVQ DI, d+48(FP)
+	RET
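Since encodeBlock now has two implementations (the assembly above on amd64, pure Go in encode_other.go otherwise), a cheap sanity check for whichever one is compiled in is a round trip through the exported API. A minimal, illustrative sketch; the test name and input are placeholders, not the repository's actual tests:

package snappy_test

import (
	"bytes"
	"testing"

	"github.com/golang/snappy"
)

// TestRoundTripSketch checks that whatever encodeBlock implementation is
// compiled in produces output that Decode reverses exactly.
func TestRoundTripSketch(t *testing.T) {
	src := bytes.Repeat([]byte("the quick brown fox jumps over the lazy dog. "), 100)
	got, err := snappy.Decode(nil, snappy.Encode(nil, src))
	if err != nil {
		t.Fatal(err)
	}
	if !bytes.Equal(got, src) {
		t.Fatalf("round trip mismatch: got %d bytes, want %d bytes", len(got), len(src))
	}
}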

+ 143 - 0   encode_other.go

@@ -6,6 +6,17 @@
 
 package snappy
 
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 //
 // It assumes that:
@@ -86,3 +97,135 @@ func extendMatch(src []byte, i, j int) int {
 	}
 	return j
 }
+
+func hash(u, shift uint32) uint32 {
+	return (u * 0x1e35a7bd) >> shift
+}
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	shift := uint32(32 - 8)
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+			// Extend the 4-byte match as long as possible.
+			s = extendMatch(src, candidate+4, s+4)
+			d += emitCopy(dst[d:], base-candidate, s-base)
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x>>16), shift)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
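For readers new to the algorithm, the interaction between the table-sizing loop and hash above is easy to check by hand: the shift shrinks as the input grows, so hash values always fit the table size the loop chose. The constant 0x1e35a7bd and the sizing rule below are copied from encodeBlock; the wrapper program itself is only illustrative:

package main

import "fmt"

// hashSketch mirrors encodeBlock's hash function.
func hashSketch(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}

func main() {
	for _, srcLen := range []int{100, 1000, 100000} {
		// Mirror encodeBlock's sizing loop: tableSize from 1<<8 up to 1<<14.
		shift, tableSize := uint32(32-8), 1<<8
		for ; tableSize < 1<<14 && tableSize < srcLen; tableSize *= 2 {
			shift--
		}
		h := hashSketch(0x64636261, shift) // load32 of the bytes "abcd"
		fmt.Printf("len(src)=%6d  tableSize=%5d  shift=%2d  hash=%d (always < tableSize)\n",
			srcLen, tableSize, shift, h)
	}
}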