7 years ago · 7590f738d0
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -13,6 +13,9 @@
 
															 // SI &src
														
 
															 // R8 &dst + len(dst)
														
 
															 // R9 &src + len(src)
														
 
															+// R11 &dst
														
 
															+// R12 short output end
														
 
															+// R13 short input end
														
 
															 // func decodeBlock(dst, src []byte) int
														
 
															 // using 50 bytes of stack currently
														
 
															 TEXT ·decodeBlock(SB), NOSPLIT, $64-56
														
@@ -25,6 +28,14 @@ TEXT ·decodeBlock(SB), NOSPLIT, $64-56
 
															 	MOVQ src_len+32(FP), R9
														
 
															 	ADDQ SI, R9
														
 
															+	// shortcut ends
														
 
															+	// short output end
														
 
															+	MOVQ R8, R12
														
 
															+	SUBQ $32, R12
														
 
															+	// short input end
														
 
															+	MOVQ R9, R13
														
 
															+	SUBQ $16, R13
														
 
															+
														
 
															 loop:
														
 
															 	// for si < len(src)
														
 
															 	CMPQ SI, R9
														
@@ -40,11 +51,74 @@ loop:
 
															 	MOVQ DX, CX
														
 
															 	SHRQ $4, CX
														
 
															+	// if lit_len != 0xF
														
 
															+	CMPQ CX, $0xF
														
 
															+	JEQ lit_len_loop_pre
														
 
															+	CMPQ DI, R12
														
 
															+	JGE lit_len_loop_pre
														
 
															+	CMPQ SI, R13
														
 
															+	JGE lit_len_loop_pre
														
 
															+
														
 
															+	// copy shortcut
														
 
															+
														
 
															+	// A two-stage shortcut for the most common case:
														
 
															+	// 1) If the literal length is 0..14, and there is enough space,
														
 
															+	// enter the shortcut and copy 16 bytes on behalf of the literals
														
 
															+	// (in the fast mode, only 8 bytes can be safely copied this way).
														
 
															+	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
														
 
															+	// manner; but we ensure that there's enough space in the output for
														
 
															+	// those 18 bytes earlier, upon entering the shortcut (in other words,
														
 
															+	// there is a combined check for both stages).
														
 
															+
														
 
															+	// copy literal
														
 
															+	MOVOU (SI), X0
														
 
															+	MOVOU X0, (DI)
														
 
															+	ADDQ CX, DI
														
 
															+	ADDQ CX, SI
														
 
															+
														
 
															+	MOVQ DX, CX
														
 
															+	ANDQ $0xF, CX
														
 
															+
														
 
															+	// The second stage: prepare for match copying, decode full info.
														
 
															+	// If it doesn't work out, the info won't be wasted.
														
 
															+	// offset := uint16(data[:2])
														
 
															+	MOVWQZX (SI), DX
														
 
															+	ADDQ $2, SI
														
 
															+
														
 
															+	MOVQ DI, AX
														
 
															+	SUBQ DX, AX
														
 
															+	CMPQ AX, DI
														
 
															+	JGT err_short_buf
														
 
															+
														
 
															+	// if we can't do the second stage then jump straight to read the
														
 
															+	// match length, we already have the offset.
														
 
															+	CMPQ CX, $0xF
														
 
															+	JEQ match_len_loop_pre
														
 
															+	CMPQ DX, $8
														
 
															+	JLT match_len_loop_pre
														
 
															+	CMPQ AX, R11
														
 
															+	JLT err_short_buf
														
 
															+
														
 
															+	// memcpy(op + 0, match + 0, 8);
														
 
															+	MOVQ (AX), BX
														
 
															+	MOVQ BX, (DI)
														
 
															+	// memcpy(op + 8, match + 8, 8);
														
 
															+	MOVQ 8(AX), BX
														
 
															+	MOVQ BX, 8(DI)
														
 
															+	// memcpy(op +16, match +16, 2);
														
 
															+	MOVW 16(AX), BX
														
 
															+	MOVW BX, 16(DI)
														
 
															+
														
 
															+	ADDQ $4, DI // minmatch
														
 
															+	ADDQ CX, DI
														
 
															+
														
 
															+	// shortcut complete, load next token
														
 
															+	JMP loop
														
 
															+
														
 
															+lit_len_loop_pre:
														
 
															 	// if lit_len > 0
														
 
															 	CMPQ CX, $0
														
 
															 	JEQ offset
														
 
															-
														
 
															-	// if lit_len != 0xF
														
 
															 	CMPQ CX, $0xF
														
 
															 	JNE copy_literal
														
@@ -128,6 +202,10 @@ memmove_lit:
 
															 	ADDQ dst_len+8(FP), R8
														
 
															 	MOVQ src_base+24(FP), R9
														
 
															 	ADDQ src_len+32(FP), R9
														
 
															+	MOVQ R8, R12
														
 
															+	SUBQ $32, R12
														
 
															+	MOVQ R9, R13
														
 
															+	SUBQ $16, R13
														
 
															 finish_lit_copy:
														
 
															 	ADDQ CX, SI
														
@@ -155,8 +233,10 @@ offset:
 
															 	CMPQ DX, $0
														
 
															 	JEQ err_corrupt
														
 
															-	// if mlen != 0xF
														
 
															 	ANDB $0xF, CX
														
 
															+
														
 
															+match_len_loop_pre:
														
 
															+	// if mlen != 0xF
														
 
															 	CMPB CX, $0xF
														
 
															 	JNE copy_match
														
@@ -273,6 +353,10 @@ memmove_match:
 
															 	ADDQ dst_len+8(FP), R8
														
 
															 	MOVQ src_base+24(FP), R9
														
 
															 	ADDQ src_len+32(FP), R9
														
 
															+	MOVQ R8, R12
														
 
															+	SUBQ $32, R12
														
 
															+	MOVQ R9, R13
														
 
															+	SUBQ $16, R13
														
 
															 	ADDQ CX, DI
														
 
															 	JMP loop