Browse Source

implement copy shortcut from c

Chris Bannister 6 years ago
parent
commit
7590f738d0
1 changed files with 87 additions and 3 deletions
  1. 87 3
      decode_amd64.s

+ 87 - 3
decode_amd64.s

@@ -13,6 +13,9 @@
 // SI &src
 // SI &src
 // R8 &dst + len(dst)
 // R8 &dst + len(dst)
 // R9 &src + len(src)
 // R9 &src + len(src)
+// R11 &dst
+// R12 short output end: &dst + len(dst) - 32
+// R13 short input end: &src + len(src) - 16
 // func decodeBlock(dst, src []byte) int
 // func decodeBlock(dst, src []byte) int
 // using 50 bytes of stack currently
 // using 50 bytes of stack currently
 TEXT ·decodeBlock(SB), NOSPLIT, $64-56
 TEXT ·decodeBlock(SB), NOSPLIT, $64-56
@@ -25,6 +28,14 @@ TEXT ·decodeBlock(SB), NOSPLIT, $64-56
 	MOVQ src_len+32(FP), R9
 	MOVQ src_len+32(FP), R9
 	ADDQ SI, R9
 	ADDQ SI, R9
 
 
+	// shortcut ends
+	// short output end
+	MOVQ R8, R12
+	SUBQ $32, R12
+	// short input end
+	MOVQ R9, R13
+	SUBQ $16, R13
+
 loop:
 loop:
 	// for si < len(src)
 	// for si < len(src)
 	CMPQ SI, R9
 	CMPQ SI, R9
@@ -40,11 +51,74 @@ loop:
 	MOVQ DX, CX
 	MOVQ DX, CX
 	SHRQ $4, CX
 	SHRQ $4, CX
 
 
+	// if lit_len != 0xF
+	CMPQ CX, $0xF
+	JEQ lit_len_loop_pre
+	CMPQ DI, R12
+	JGE lit_len_loop_pre
+	CMPQ SI, R13
+	JGE lit_len_loop_pre
+
+	// copy shortcut
+
+	// A two-stage shortcut for the most common case:
+	// 1) If the literal length is 0..14, and there is enough space,
+	// enter the shortcut and copy 16 bytes on behalf of the literals
+	// (in the fast mode, only 8 bytes can be safely copied this way).
+	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
+	// manner; but we ensure that there's enough space in the output for
+	// those 18 bytes earlier, upon entering the shortcut (in other words,
+	// there is a combined check for both stages).
+
+	// copy literal
+	MOVOU (SI), X0
+	MOVOU X0, (DI)
+	ADDQ CX, DI
+	ADDQ CX, SI
+
+	MOVQ DX, CX
+	ANDQ $0xF, CX
+
+	// The second stage: prepare for match copying, decode full info.
+	// If it doesn't work out, the info won't be wasted.
+	// offset := uint16(data[:2])
+	MOVWQZX (SI), DX
+	ADDQ $2, SI
+
+	MOVQ DI, AX
+	SUBQ DX, AX
+	CMPQ AX, DI
+	JGT err_short_buf
+
+	// If we can't do the second stage, jump straight to reading the
+	// match length; we already have the offset.
+	CMPQ CX, $0xF
+	JEQ match_len_loop_pre
+	CMPQ DX, $8
+	JLT match_len_loop_pre
+	CMPQ AX, R11
+	JLT err_short_buf
+
+	// memcpy(op + 0, match + 0, 8);
+	MOVQ (AX), BX
+	MOVQ BX, (DI)
+	// memcpy(op + 8, match + 8, 8);
+	MOVQ 8(AX), BX
+	MOVQ BX, 8(DI)
+	// memcpy(op +16, match +16, 2);
+	MOVW 16(AX), BX
+	MOVW BX, 16(DI)
+
+	ADDQ $4, DI // minmatch
+	ADDQ CX, DI
+
+	// shortcut complete, load next token
+	JMP loop
+
+lit_len_loop_pre:
 	// if lit_len > 0
 	// if lit_len > 0
 	CMPQ CX, $0
 	CMPQ CX, $0
 	JEQ offset
 	JEQ offset
-
-	// if lit_len != 0xF
 	CMPQ CX, $0xF
 	CMPQ CX, $0xF
 	JNE copy_literal
 	JNE copy_literal
 
 
@@ -128,6 +202,10 @@ memmove_lit:
 	ADDQ dst_len+8(FP), R8
 	ADDQ dst_len+8(FP), R8
 	MOVQ src_base+24(FP), R9
 	MOVQ src_base+24(FP), R9
 	ADDQ src_len+32(FP), R9
 	ADDQ src_len+32(FP), R9
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
 
 
 finish_lit_copy:
 finish_lit_copy:
 	ADDQ CX, SI
 	ADDQ CX, SI
@@ -155,8 +233,10 @@ offset:
 	CMPQ DX, $0
 	CMPQ DX, $0
 	JEQ err_corrupt
 	JEQ err_corrupt
 
 
-	// if mlen != 0xF
 	ANDB $0xF, CX
 	ANDB $0xF, CX
+
+match_len_loop_pre:
+	// if mlen != 0xF
 	CMPB CX, $0xF
 	CMPB CX, $0xF
 	JNE copy_match
 	JNE copy_match
 
 
@@ -273,6 +353,10 @@ memmove_match:
 	ADDQ dst_len+8(FP), R8
 	ADDQ dst_len+8(FP), R8
 	MOVQ src_base+24(FP), R9
 	MOVQ src_base+24(FP), R9
 	ADDQ src_len+32(FP), R9
 	ADDQ src_len+32(FP), R9
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
 
 
 	ADDQ CX, DI
 	ADDQ CX, DI
 	JMP loop
 	JMP loop