Просмотр исходного кода

lz4block: Optimize amd64 assembly decoder

Replaced all occurrences of MOVQ x, z; ADDQ y, z with LEAQ (x)(y*1), z.
The ADDQ $4, DI; ADDQ CX, DI pair likewise becomes LEAQ 4(DI)(CX*1), DI.
There are also a few instances of the same pattern but with SUBQ.
Replacing these with LEAQ with a negative offset seems to hurt
performance, at least on the i7-3770K, so they are left unchanged.

Benchmark results:

name                old time/op    new time/op    delta
Uncompress-8          8.03ns ± 1%    7.92ns ± 1%   -1.36%  (p=0.000 n=30+39)
UncompressPg1661-8     301µs ± 1%     299µs ± 1%   -0.66%  (p=0.000 n=30+39)
UncompressDigits-8    47.8µs ± 1%    43.2µs ± 1%   -9.67%  (p=0.000 n=30+40)
UncompressTwain-8      199µs ± 1%     198µs ± 1%   -0.61%  (p=0.000 n=30+39)
UncompressRand-8      3.77µs ± 2%    3.76µs ± 2%     ~     (p=0.465 n=30+40)

name                old alloc/op   new alloc/op   delta
Uncompress-8           0.00B          0.00B          ~     (all equal)
UncompressPg1661-8      160B ± 0%      160B ± 0%     ~     (all equal)
UncompressDigits-8      160B ± 0%      162B ± 4%   +1.41%  (p=0.010 n=23+40)
UncompressTwain-8       160B ± 0%      160B ± 0%     ~     (all equal)
UncompressRand-8        161B ± 1%      161B ± 0%     ~     (p=0.384 n=30+39)

name                old allocs/op  new allocs/op  delta
Uncompress-8            0.00           0.00          ~     (all equal)
UncompressPg1661-8      3.00 ± 0%      3.00 ± 0%     ~     (all equal)
UncompressDigits-8      3.00 ± 0%      3.00 ± 0%     ~     (all equal)
UncompressTwain-8       3.00 ± 0%      3.00 ± 0%     ~     (all equal)
UncompressRand-8        3.00 ± 0%      3.00 ± 0%     ~     (all equal)

name                old speed      new speed      delta
UncompressPg1661-8  1.98GB/s ± 1%  1.99GB/s ± 1%   +0.66%  (p=0.000 n=30+39)
UncompressDigits-8  2.09GB/s ± 1%  2.32GB/s ± 1%  +10.70%  (p=0.000 n=30+40)
UncompressTwain-8   1.95GB/s ± 1%  1.96GB/s ± 1%   +0.61%  (p=0.000 n=30+39)
UncompressRand-8    4.35GB/s ± 2%  4.35GB/s ± 2%     ~     (p=0.475 n=30+40)
greatroar 5 лет назад
Родитель
Commit
f2e79e789a
1 изменённый файл: 8 добавлено и 16 удалено
  1. 8 16
      internal/lz4block/decode_amd64.s

+ 8 - 16
internal/lz4block/decode_amd64.s

@@ -109,8 +109,7 @@ loop:
 	MOVW 16(AX), BX
 	MOVW 16(AX), BX
 	MOVW BX, 16(DI)
 	MOVW BX, 16(DI)
 
 
-	ADDQ $4, DI // minmatch
-	ADDQ CX, DI
+	LEAQ 4(DI)(CX*1), DI // minmatch
 
 
 	// shortcut complete, load next token
 	// shortcut complete, load next token
 	JMP loop
 	JMP loop
@@ -128,8 +127,7 @@ lit_len_loop:
 	JNE lit_len_finalise
 	JNE lit_len_finalise
 
 
 	// bounds check src[si+1]
 	// bounds check src[si+1]
-	MOVQ SI, AX
-	ADDQ $1, AX
+	LEAQ 1(SI), AX
 	CMPQ AX, R9
 	CMPQ AX, R9
 	JGT err_short_buf
 	JGT err_short_buf
 
 
@@ -147,13 +145,11 @@ lit_len_finalise:
 
 
 copy_literal:
 copy_literal:
 	// bounds check src and dst
 	// bounds check src and dst
-	MOVQ SI, AX
-	ADDQ CX, AX
+	LEAQ (SI)(CX*1), AX
 	CMPQ AX, R9
 	CMPQ AX, R9
 	JGT err_short_buf
 	JGT err_short_buf
 
 
-	MOVQ DI, AX
-	ADDQ CX, AX
+	LEAQ (DI)(CX*1), AX
 	CMPQ AX, R8
 	CMPQ AX, R8
 	JGT err_short_buf
 	JGT err_short_buf
 
 
@@ -219,8 +215,7 @@ offset:
 	// free up DX to use for offset
 	// free up DX to use for offset
 	MOVQ DX, CX
 	MOVQ DX, CX
 
 
-	MOVQ SI, AX
-	ADDQ $2, AX
+	LEAQ 2(SI), AX
 	CMPQ AX, R9
 	CMPQ AX, R9
 	JGT err_short_buf
 	JGT err_short_buf
 
 
@@ -247,8 +242,7 @@ match_len_loop:
 	JNE match_len_finalise
 	JNE match_len_finalise
 
 
 	// bounds check src[si+1]
 	// bounds check src[si+1]
-	MOVQ SI, AX
-	ADDQ $1, AX
+	LEAQ 1(SI), AX
 	CMPQ AX, R9
 	CMPQ AX, R9
 	JGT err_short_buf
 	JGT err_short_buf
 
 
@@ -269,8 +263,7 @@ copy_match:
 
 
 	// check we have match_len bytes left in dst
 	// check we have match_len bytes left in dst
 	// di+match_len < len(dst)
 	// di+match_len < len(dst)
-	MOVQ DI, AX
-	ADDQ CX, AX
+	LEAQ (DI)(CX*1), AX
 	CMPQ AX, R8
 	CMPQ AX, R8
 	JGT err_short_buf
 	JGT err_short_buf
 
 
@@ -286,8 +279,7 @@ copy_match:
 	JLT err_short_buf
 	JLT err_short_buf
 
 
 	// if offset + match_len < di
 	// if offset + match_len < di
-	MOVQ BX, AX
-	ADDQ CX, AX
+	LEAQ (BX)(CX*1), AX
 	CMPQ DI, AX
 	CMPQ DI, AX
 	JGT copy_interior_match
 	JGT copy_interior_match