// blake2b_amd64.s
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build amd64,!gccgo,!appengine

  5. #include "textflag.h"
  6. DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
  7. DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
  8. GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
  9. DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
  10. DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
  11. GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
  12. DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
  13. DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
  14. GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
  15. DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
  16. DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
  17. GLOBL ·iv3<>(SB), (NOPTR+RODATA), $32
  18. DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
  19. DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
  20. GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
  21. DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
  22. DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
  23. GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
  24. #define SHUFFLE(v2, v3, v4, v5, v6, v7, t0, t1, t2) \
  25. MOVO v4, t0; \
  26. MOVO v5, v4; \
  27. MOVO t0, v5; \
  28. MOVO v6, t0; \
  29. PUNPCKLQDQ v6, t2; \
  30. PUNPCKHQDQ v7, v6; \
  31. PUNPCKHQDQ t2, v6; \
  32. PUNPCKLQDQ v7, t2; \
  33. MOVO t0, v7; \
  34. MOVO v2, t1; \
  35. PUNPCKHQDQ t2, v7; \
  36. PUNPCKLQDQ v3, t2; \
  37. PUNPCKHQDQ t2, v2; \
  38. PUNPCKLQDQ t1, t2; \
  39. PUNPCKHQDQ t2, v3
  40. #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t0, t1, t2) \
  41. MOVO v4, t0; \
  42. MOVO v5, v4; \
  43. MOVO t0, v5; \
  44. MOVO v2, t0; \
  45. PUNPCKLQDQ v2, t2; \
  46. PUNPCKHQDQ v3, v2; \
  47. PUNPCKHQDQ t2, v2; \
  48. PUNPCKLQDQ v3, t2; \
  49. MOVO t0, v3; \
  50. MOVO v6, t1; \
  51. PUNPCKHQDQ t2, v3; \
  52. PUNPCKLQDQ v7, t2; \
  53. PUNPCKHQDQ t2, v6; \
  54. PUNPCKLQDQ t1, t2; \
  55. PUNPCKHQDQ t2, v7
  56. #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, t1, t2, c40, c48) \
  57. PADDQ m0, v0; \
  58. PADDQ m1, v1; \
  59. PADDQ v2, v0; \
  60. PADDQ v3, v1; \
  61. PXOR v0, v6; \
  62. PXOR v1, v7; \
  63. PSHUFD $0xB1, v6, v6; \
  64. PSHUFD $0xB1, v7, v7; \
  65. PADDQ v6, v4; \
  66. PADDQ v7, v5; \
  67. PXOR v4, v2; \
  68. PXOR v5, v3; \
  69. PSHUFB c40, v2; \
  70. PSHUFB c40, v3; \
  71. PADDQ m2, v0; \
  72. PADDQ m3, v1; \
  73. PADDQ v2, v0; \
  74. PADDQ v3, v1; \
  75. PXOR v0, v6; \
  76. PXOR v1, v7; \
  77. PSHUFB c48, v6; \
  78. PSHUFB c48, v7; \
  79. PADDQ v6, v4; \
  80. PADDQ v7, v5; \
  81. PXOR v4, v2; \
  82. PXOR v5, v3; \
  83. MOVOU v2, t2; \
  84. PADDQ v2, t2; \
  85. PSRLQ $63, v2; \
  86. PXOR t2, v2; \
  87. MOVOU v3, t2; \
  88. PADDQ v3, t2; \
  89. PSRLQ $63, v3; \
  90. PXOR t2, v3
  91. #define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
  92. MOVQ i0*8(src), m0; \
  93. PINSRQ $1, i1*8(src), m0; \
  94. MOVQ i2*8(src), m1; \
  95. PINSRQ $1, i3*8(src), m1; \
  96. MOVQ i4*8(src), m2; \
  97. PINSRQ $1, i5*8(src), m2; \
  98. MOVQ i6*8(src), m3; \
  99. PINSRQ $1, i7*8(src), m3
  100. // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
  101. TEXT ·hashBlocksSSE4(SB), 4, $32-48 // frame size = 16 + 16 byte alignment
  102. MOVQ h+0(FP), AX
  103. MOVQ c+8(FP), BX
  104. MOVQ flag+16(FP), CX
  105. MOVQ blocks_base+24(FP), SI
  106. MOVQ blocks_len+32(FP), DI
  107. MOVQ SP, BP
  108. MOVQ SP, R9
  109. ADDQ $15, R9
  110. ANDQ $~15, R9
  111. MOVQ R9, SP
  112. MOVOU ·iv3<>(SB), X0
  113. MOVO X0, 0(SP)
  114. XORQ CX, 0(SP) // 0(SP) = ·iv3 ^ (CX || 0)
  115. MOVOU ·c40<>(SB), X13
  116. MOVOU ·c48<>(SB), X14
  117. MOVQ 0(BX), R8
  118. MOVQ 8(BX), R9
  119. loop:
  120. ADDQ $128, R8
  121. CMPQ R8, $128
  122. JGE noinc
  123. INCQ R9
  124. noinc:
  125. MOVQ R8, X15
  126. PINSRQ $1, R9, X15
  127. MOVOU 0(AX), X0
  128. MOVOU 16(AX), X1
  129. MOVOU 32(AX), X2
  130. MOVOU 48(AX), X3
  131. MOVOU ·iv0<>(SB), X4
  132. MOVOU ·iv1<>(SB), X5
  133. MOVOU ·iv2<>(SB), X6
  134. PXOR X15, X6
  135. MOVO 0(SP), X7
  136. LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
  137. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  138. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  139. LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
  140. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  141. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  142. LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
  143. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  144. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  145. LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
  146. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  147. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  148. LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
  149. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  150. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  151. LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
  152. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  153. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  154. LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
  155. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  156. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  157. LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
  158. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  159. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  160. LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
  161. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  162. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  163. LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
  164. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  165. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  166. LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
  167. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  168. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  169. LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
  170. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  171. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  172. LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
  173. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  174. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  175. LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
  176. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  177. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  178. LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
  179. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  180. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  181. LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
  182. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  183. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  184. LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
  185. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  186. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  187. LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
  188. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  189. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  190. LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
  191. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  192. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  193. LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
  194. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  195. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  196. LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
  197. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  198. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  199. LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
  200. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  201. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  202. LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
  203. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  204. SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  205. LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
  206. HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X8, X9, X12, X13, X14)
  207. SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9, X10)
  208. MOVOU 0(AX), X8
  209. MOVOU 16(AX), X9
  210. MOVOU 32(AX), X10
  211. MOVOU 48(AX), X11
  212. PXOR X0, X8
  213. PXOR X1, X9
  214. PXOR X2, X10
  215. PXOR X3, X11
  216. PXOR X4, X8
  217. PXOR X5, X9
  218. PXOR X6, X10
  219. PXOR X7, X11
  220. MOVOU X8, 0(AX)
  221. MOVOU X9, 16(AX)
  222. MOVOU X10, 32(AX)
  223. MOVOU X11, 48(AX)
  224. LEAQ 128(SI), SI
  225. SUBQ $128, DI
  226. JNE loop
  227. MOVOU X15, 0(BX)
  228. MOVQ BP, SP
  229. RET
  230. // func supportSSE4() bool
  231. TEXT ·supportSSE4(SB), 4, $0-1
  232. MOVL $1, AX
  233. CPUID
  234. SHRL $15, CX // Bit 15 indicates SSE4 support
  235. ANDL $1, CX // CX != 0 if support SSE4
  236. MOVB CX, ret+0(FP)
  237. RET