Forráskód Böngészése

Inline the emitLiteral call.

name              old speed      new speed      delta
WordsEncode1e1-8   712MB/s ± 1%   700MB/s ± 1%   -1.65%  (p=0.000 n=10+10)
WordsEncode1e2-8   467MB/s ± 0%   460MB/s ± 1%   -1.53%   (p=0.000 n=9+10)
WordsEncode1e3-8   483MB/s ± 0%   478MB/s ± 2%   -0.98%   (p=0.007 n=9+10)
WordsEncode1e4-8   353MB/s ± 1%   414MB/s ± 0%  +17.03%   (p=0.000 n=10+9)
WordsEncode1e5-8   293MB/s ± 0%   296MB/s ± 1%   +1.03%    (p=0.000 n=8+9)
WordsEncode1e6-8   345MB/s ± 0%   345MB/s ± 0%     ~       (p=0.332 n=9+8)
RandomEncode-8    14.4GB/s ± 2%  14.4GB/s ± 2%     ~      (p=1.000 n=10+9)
_ZFlat0-8          863MB/s ± 0%   888MB/s ± 1%   +2.86%   (p=0.000 n=9+10)
_ZFlat1-8          471MB/s ± 0%   471MB/s ± 1%     ~      (p=0.897 n=8+10)
_ZFlat2-8         16.2GB/s ± 2%  16.2GB/s ± 3%     ~     (p=0.631 n=10+10)
_ZFlat3-8          659MB/s ± 1%   675MB/s ± 1%   +2.32%    (p=0.000 n=9+9)
_ZFlat4-8         8.29GB/s ± 1%  8.31GB/s ± 1%     ~     (p=0.315 n=10+10)
_ZFlat5-8          836MB/s ± 1%   850MB/s ± 0%   +1.78%    (p=0.000 n=9+9)
_ZFlat6-8          315MB/s ± 0%   316MB/s ± 0%   +0.39%   (p=0.002 n=9+10)
_ZFlat7-8          293MB/s ± 1%   294MB/s ± 1%     ~      (p=0.139 n=10+9)
_ZFlat8-8          331MB/s ± 1%   330MB/s ± 1%     ~      (p=0.356 n=10+9)
_ZFlat9-8          273MB/s ± 1%   273MB/s ± 0%     ~     (p=0.280 n=10+10)
_ZFlat10-8        1.12GB/s ± 1%  1.17GB/s ± 1%   +4.12%  (p=0.000 n=10+10)
_ZFlat11-8         460MB/s ± 0%   461MB/s ± 0%   +0.34%   (p=0.006 n=8+10)
Nigel Tao 9 éve
szülő
commit
5a44a9da21
1 módosított fájl, 47 hozzáadás és 15 törlés
  1. 47 15
      encode_amd64.s

+ 47 - 15
encode_amd64.s

@@ -57,14 +57,14 @@ threeBytes:
 	MOVW BX, 1(DI)
 	ADDQ $3, DI
 	ADDQ $3, DX
-	JMP  emitLiteralEnd
+	JMP  memmove
 
 twoBytes:
 	MOVB $0xf0, 0(DI)
 	MOVB BX, 1(DI)
 	ADDQ $2, DI
 	ADDQ $2, DX
-	JMP  emitLiteralEnd
+	JMP  memmove
 
 oneByte:
 	SHLB $2, BX
@@ -72,7 +72,7 @@ oneByte:
 	ADDQ $1, DI
 	ADDQ $1, DX
 
-emitLiteralEnd:
+memmove:
 	MOVQ DX, ret+48(FP)
 
 	// copy(dst[i:], lit)
@@ -400,32 +400,64 @@ fourByteMatch:
 	CMPQ AX, $16
 	JLE  emitLiteralFastPath
 
-	// d += emitLiteral(dst[d:], src[nextEmit:s])
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
 	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R10, 24(SP)
-	MOVQ AX, 32(SP)
-	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
 
+inlineEmitLiteralMemmove:
 	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ AX, DI
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
-	CALL ·emitLiteral(SB)
+	CALL runtime·memmove(SB)
 	MOVQ 56(SP), CX
 	MOVQ 64(SP), DX
 	MOVQ 72(SP), SI
 	MOVQ 80(SP), DI
 	MOVQ 88(SP), R9
 	MOVQ 112(SP), R15
-
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ 48(SP), DI
 	JMP  inner1
 
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
 emitLiteralFastPath:
 	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
 	MOVB AX, BX