@@ -10,17 +10,6 @@ import (
 	"io"
 )
 
-func load32(b []byte, i int) uint32 {
-	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
-}
-
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
@@ -82,138 +71,6 @@ const inputMargin = 16 - 1
 // TestSameEncodingAsCppShortCopies.
 const minNonLiteralBlockSize = 1 + 1 + inputMargin
 
-func hash(u, shift uint32) uint32 {
-	return (u * 0x1e35a7bd) >> shift
-}
-
-// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
-// assumes that the varint-encoded length of the decompressed bytes has already
-// been written.
-//
-// It also assumes that:
-//	len(dst) >= MaxEncodedLen(len(src)) &&
-//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
-	const (
-		maxTableSize = 1 << 14
-		// tableMask is redundant, but helps the compiler eliminate bounds
-		// checks.
-		tableMask = maxTableSize - 1
-	)
-	shift := uint32(32 - 8)
-	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
-		shift--
-	}
-	// In Go, all array elements are zero-initialized, so there is no advantage
-	// to a smaller tableSize per se. However, it matches the C++ algorithm,
-	// and in the asm versions of this code, we can get away with zeroing only
-	// the first tableSize elements.
-	var table [maxTableSize]uint16
-
-	// sLimit is when to stop looking for offset/length copies. The inputMargin
-	// lets us use a fast path for emitLiteral in the main loop, while we are
-	// looking for copies.
-	sLimit := len(src) - inputMargin
-
-	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := 0
-
-	// The encoded form must start with a literal, as there are no previous
-	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
-	nextHash := hash(load32(src, s), shift)
-
-	for {
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned (or skipped), look at every third byte, etc.. When a match
-		// is found, immediately go back to looking at every byte. This is a
-		// small loss (~5% performance, ~0.1% density) for compressible data
-		// due to more bookkeeping, but for non-compressible data (such as
-		// JPEG) it's a huge win since the compressor quickly "realizes" the
-		// data is incompressible and doesn't bother looking for matches
-		// everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip := 32
-
-		nextS := s
-		candidate := 0
-		for {
-			s = nextS
-			bytesBetweenHashLookups := skip >> 5
-			nextS = s + bytesBetweenHashLookups
-			skip += bytesBetweenHashLookups
-			if nextS > sLimit {
-				goto emitRemainder
-			}
-			candidate = int(table[nextHash&tableMask])
-			table[nextHash&tableMask] = uint16(s)
-			nextHash = hash(load32(src, nextS), shift)
-			if load32(src, s) == load32(src, candidate) {
-				break
-			}
-		}
-
-		// A 4-byte match has been found. We'll later see if more than 4 bytes
-		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
-		// them as literal bytes.
-		d += emitLiteral(dst[d:], src[nextEmit:s])
-
-		// Call emitCopy, and then see if another emitCopy could be our next
-		// move. Repeat until we find no match for the input immediately after
-		// what was consumed by the last emitCopy call.
-		//
-		// If we exit this loop normally then we need to call emitLiteral next,
-		// though we don't yet know how big the literal will be. We handle that
-		// by proceeding to the next iteration of the main loop. We also can
-		// exit this loop via goto if we get close to exhausting the input.
-		for {
-			// Invariant: we have a 4-byte match at s, and no need to emit any
-			// literal bytes prior to s.
-			base := s
-			// Extend the 4-byte match as long as possible.
-			s = extendMatch(src, candidate+4, s+4)
-			d += emitCopy(dst[d:], base-candidate, s-base)
-			nextEmit = s
-			if s >= sLimit {
-				goto emitRemainder
-			}
-
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-1 and at s. If
-			// another emitCopy is not our next move, also calculate nextHash
-			// at s+1. At least on GOARCH=amd64, these three hash calculations
-			// are faster as one load64 call (with some shifts) instead of
-			// three load32 calls.
-			x := load64(src, s-1)
-			prevHash := hash(uint32(x>>0), shift)
-			table[prevHash&tableMask] = uint16(s - 1)
-			currHash := hash(uint32(x>>8), shift)
-			candidate = int(table[currHash&tableMask])
-			table[currHash&tableMask] = uint16(s)
-			if uint32(x>>8) != load32(src, candidate) {
-				nextHash = hash(uint32(x>>16), shift)
-				s++
-				break
-			}
-		}
-	}
-
-emitRemainder:
-	if nextEmit < len(src) {
-		d += emitLiteral(dst[d:], src[nextEmit:])
-	}
-	return d
-}
-
 // MaxEncodedLen returns the maximum length of a snappy block, given its
 // uncompressed length.
 //