|
|
@@ -364,7 +364,7 @@ GLOBL counter<>(SB), (NOPTR+RODATA), $16
|
|
|
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \
|
|
|
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14)
|
|
|
|
|
|
-#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, stack_size, BLAKE2s_FUNC) \
|
|
|
+#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \
|
|
|
MOVQ h, AX; \
|
|
|
MOVQ c, BX; \
|
|
|
MOVL flag, CX; \
|
|
|
@@ -372,8 +372,10 @@ GLOBL counter<>(SB), (NOPTR+RODATA), $16
|
|
|
MOVQ blocks_len, DX; \
|
|
|
\
|
|
|
MOVQ SP, BP; \
|
|
|
- ANDQ $0xFFFFFFFFFFFFFFF0, SP; \
|
|
|
- SUBQ $(16+16+stack_size), SP; \
|
|
|
+ MOVQ SP, R9; \
|
|
|
+ ADDQ $15, R9; \
|
|
|
+ ANDQ $~15, R9; \
|
|
|
+ MOVQ R9, SP; \
|
|
|
\
|
|
|
MOVQ 0(BX), R9; \
|
|
|
MOVQ R9, 0(SP); \
|
|
|
@@ -421,18 +423,18 @@ GLOBL counter<>(SB), (NOPTR+RODATA), $16
|
|
|
MOVQ BP, SP
|
|
|
|
|
|
// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
|
|
|
-TEXT ·hashBlocksSSE2(SB), 4, $0-48
|
|
|
- HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), 640, BLAKE2s_SSE2)
|
|
|
+TEXT ·hashBlocksSSE2(SB), 4, $672-48 // frame = 656 bytes of scratch (the former 16+640) + 16 bytes of slack, because HASH_BLOCKS rounds SP up to a 16-byte boundary
|
|
|
+ HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2)
|
|
|
RET
|
|
|
|
|
|
// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
|
|
|
-TEXT ·hashBlocksSSSE3(SB), 4, $0-48
|
|
|
- HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), 640, BLAKE2s_SSSE3)
|
|
|
+TEXT ·hashBlocksSSSE3(SB), 4, $672-48 // frame = 656 bytes of scratch (the former 16+640) + 16 bytes of slack, because HASH_BLOCKS rounds SP up to a 16-byte boundary
|
|
|
+ HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3)
|
|
|
RET
|
|
|
|
|
|
// func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
|
|
|
-TEXT ·hashBlocksSSE4(SB), 4, $0-48
|
|
|
- HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), 0, BLAKE2s_SSE4)
|
|
|
+TEXT ·hashBlocksSSE4(SB), 4, $32-48 // frame = 16 bytes of scratch (HASH_BLOCKS stores at 0(SP) after aligning) + 16 bytes of slack for rounding SP up to a 16-byte boundary
|
|
|
+ HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4)
|
|
|
RET
|
|
|
|
|
|
// func supportSSE4() bool
|