9 yıl önce · 8c7c9dec59
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -94,6 +94,47 @@ doLit:
 
				 	MOVQ R13, BX
			
 
				 	SUBQ SI, BX
			
 
				 
			
 
				+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
			
 
				+	//
			
 
				+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
			
 
				+	//   goto callMemmove // Fall back on calling runtime·memmove.
			
 
				+	// }
			
 
				+	//
			
 
				+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
			
 
				+	// against 21 instead of 16, because it cannot assume that all of its input
			
 
				+	// is contiguous in memory and so it needs to leave enough source bytes to
			
 
				+	// read the next tag without refilling buffers, but Go's Decode assumes
			
 
				+	// contiguousness (the src argument is a []byte).
			
 
				+	CMPQ CX, $16
			
 
				+	JGT  callMemmove
			
 
				+	CMPQ AX, $16
			
 
				+	JLT  callMemmove
			
 
				+	CMPQ BX, $16
			
 
				+	JLT  callMemmove
			
 
				+
			
 
				+	// !!! Implement the copy from src to dst as two 8-byte loads and stores.
			
 
				+	// (Decode's documentation says that dst and src must not overlap.)
			
 
				+	//
			
 
				+	// This always copies 16 bytes, instead of only length bytes, but that's
			
 
				+	// OK. If the input is a valid Snappy encoding then subsequent iterations
			
 
				+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
			
 
				+	// non-nil error), so the overrun will be ignored.
			
 
				+	//
			
 
				+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte
			
 
				+	// loads and stores. This technique probably wouldn't be as effective on
			
 
				+	// architectures that are fussier about alignment.
			
 
				+	MOVQ 0(SI), AX
			
 
				+	MOVQ AX, 0(DI)
			
 
				+	MOVQ 8(SI), BX
			
 
				+	MOVQ BX, 8(DI)
			
 
				+
			
 
				+	// d += length
			
 
				+	// s += length
			
 
				+	ADDQ CX, DI
			
 
				+	ADDQ CX, SI
			
 
				+	JMP  loop
			
 
				+
			
 
				+callMemmove:
			
 
				 	// if length > len(dst)-d || length > len(src)-s { etc }
			
 
				 	CMPQ CX, AX
			
 
				 	JGT  errCorrupt