9 years ago · 4c1fc8e426
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -23,7 +23,7 @@
 
				 //	+ R11	src_base
			
 
				 //	+ R12	src_len
			
 
				 //	+ R13	src_base + src_len
			
 
				-//	- R14	unused
			
 
				+//	- R14	used by doCopy
			
 
				 //	- R15	used by doCopy
			
 
				 //
			
 
				 // The registers R8-R13 (marked with a "+") are set at the start of the
			
@@ -299,10 +299,37 @@ doCopy:
 
				 	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
			
 
				 	//
			
 
				 	// Set:
			
 
				+	//	- R14 = len(dst)-d
			
 
				 	//	- R15 = &dst[d-offset]
			
 
				+	MOVQ R10, R14
			
 
				+	SUBQ DI, R14
			
 
				 	MOVQ DI, R15
			
 
				 	SUBQ DX, R15
			
 
				 
			
 
				+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
			
 
				+	//
			
 
				+	// First, try using two 8-byte load/stores, similar to the doLit technique
			
 
				+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
			
 
				+	// still OK if offset >= 8.
			
 
				+	//
			
 
				+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
			
 
				+	//   goto verySlowForwardCopy
			
 
				+	// }
			
 
				+	// copy 16 bytes
			
 
				+	// d += length
			
 
				+	CMPQ CX, $16
			
 
				+	JGT  verySlowForwardCopy
			
 
				+	CMPQ DX, $8
			
 
				+	JLT  verySlowForwardCopy
			
 
				+	CMPQ R14, $16
			
 
				+	JLT  verySlowForwardCopy
			
 
				+	MOVQ 0(R15), AX
			
 
				+	MOVQ AX, 0(DI)
			
 
				+	MOVQ 8(R15), BX
			
 
				+	MOVQ BX, 8(DI)
			
 
				+	ADDQ CX, DI
			
 
				+	JMP  loop
			
 
				+
			
 
				 verySlowForwardCopy:
			
 
				 	// verySlowForwardCopy is a simple implementation of forward copy. In C
			
 
				 	// parlance, this is a do/while loop instead of a while loop, since we know