Ver código fonte

Merge pull request #81 from greatroar/lea

lz4block: Optimize amd64 assembly decoder
Pierre Curto 5 anos atrás
pai
commit
4782965c40
1 arquivo alterado com 8 adições e 16 exclusões
  1. +8 -16
      internal/lz4block/decode_amd64.s

+ 8 - 16
internal/lz4block/decode_amd64.s

@@ -109,8 +109,7 @@ loop:
 	MOVW 16(AX), BX
 	MOVW BX, 16(DI)
 
-	ADDQ $4, DI // minmatch
-	ADDQ CX, DI
+	LEAQ 4(DI)(CX*1), DI // minmatch
 
 	// shortcut complete, load next token
 	JMP loop
@@ -128,8 +127,7 @@ lit_len_loop:
 	JNE lit_len_finalise
 
 	// bounds check src[si+1]
-	MOVQ SI, AX
-	ADDQ $1, AX
+	LEAQ 1(SI), AX
 	CMPQ AX, R9
 	JGT err_short_buf
 
@@ -147,13 +145,11 @@ lit_len_finalise:
 
 copy_literal:
 	// bounds check src and dst
-	MOVQ SI, AX
-	ADDQ CX, AX
+	LEAQ (SI)(CX*1), AX
 	CMPQ AX, R9
 	JGT err_short_buf
 
-	MOVQ DI, AX
-	ADDQ CX, AX
+	LEAQ (DI)(CX*1), AX
 	CMPQ AX, R8
 	JGT err_short_buf
 
@@ -219,8 +215,7 @@ offset:
 	// free up DX to use for offset
 	MOVQ DX, CX
 
-	MOVQ SI, AX
-	ADDQ $2, AX
+	LEAQ 2(SI), AX
 	CMPQ AX, R9
 	JGT err_short_buf
 
@@ -247,8 +242,7 @@ match_len_loop:
 	JNE match_len_finalise
 
 	// bounds check src[si+1]
-	MOVQ SI, AX
-	ADDQ $1, AX
+	LEAQ 1(SI), AX
 	CMPQ AX, R9
 	JGT err_short_buf
 
@@ -269,8 +263,7 @@ copy_match:
 
 	// check we have match_len bytes left in dst
 	// di+match_len < len(dst)
-	MOVQ DI, AX
-	ADDQ CX, AX
+	LEAQ (DI)(CX*1), AX
 	CMPQ AX, R8
 	JGT err_short_buf
 
@@ -286,8 +279,7 @@ copy_match:
 	JLT err_short_buf
 
 	// if offset + match_len < di
-	MOVQ BX, AX
-	ADDQ CX, AX
+	LEAQ (BX)(CX*1), AX
 	CMPQ DI, AX
 	JGT copy_interior_match