// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.7,amd64,!gccgo,!appengine

#include "textflag.h"

DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32

DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16

// unfortunately the BYTE representation of VPERMQ must be used
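// ROUND_AVX2 performs one BLAKE2b round on the 4x4 state held in Y0-Y3, one
// row per register: the G function is applied to the four columns, the rows
// are rotated with VPERMQ to bring the diagonals into column position, G is
// applied again, and the rotation is undone. The 64-bit word rotations use
// VPSHUFD $-79 (>>>32), VPSHUFB with c40 (>>>24) and c48 (>>>16), and the
// VPADDQ/VPSRLQ $63/VPXOR sequence (>>>63).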
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
    VPADDQ m0, Y0, Y0; \
    VPADDQ Y1, Y0, Y0; \
    VPXOR Y0, Y3, Y3; \
    VPSHUFD $-79, Y3, Y3; \
    VPADDQ Y3, Y2, Y2; \
    VPXOR Y2, Y1, Y1; \
    VPSHUFB c40, Y1, Y1; \
    VPADDQ m1, Y0, Y0; \
    VPADDQ Y1, Y0, Y0; \
    VPXOR Y0, Y3, Y3; \
    VPSHUFB c48, Y3, Y3; \
    VPADDQ Y3, Y2, Y2; \
    VPXOR Y2, Y1, Y1; \
    VPADDQ Y1, Y1, t; \
    VPSRLQ $63, Y1, Y1; \
    VPXOR t, Y1, Y1; \
    BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39 \ // VPERMQ 0x39, Y1, Y1
    BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e \ // VPERMQ 0x4e, Y2, Y2
    BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93 \ // VPERMQ 0x93, Y3, Y3
    VPADDQ m2, Y0, Y0; \
    VPADDQ Y1, Y0, Y0; \
    VPXOR Y0, Y3, Y3; \
    VPSHUFD $-79, Y3, Y3; \
    VPADDQ Y3, Y2, Y2; \
    VPXOR Y2, Y1, Y1; \
    VPSHUFB c40, Y1, Y1; \
    VPADDQ m3, Y0, Y0; \
    VPADDQ Y1, Y0, Y0; \
    VPXOR Y0, Y3, Y3; \
    VPSHUFB c48, Y3, Y3; \
    VPADDQ Y3, Y2, Y2; \
    VPXOR Y2, Y1, Y1; \
    VPADDQ Y1, Y1, t; \
    VPSRLQ $63, Y1, Y1; \
    VPXOR t, Y1, Y1; \
    BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39 \ // VPERMQ 0x39, Y3, Y3
    BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e \ // VPERMQ 0x4e, Y2, Y2
    BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93 \ // VPERMQ 0x93, Y1, Y1

// load msg into Y12, Y13, Y14, Y15
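// The index arguments i0..i15 are the message word positions selected by the
// BLAKE2b sigma permutation for the current round; the sixteen 64-bit words
// are gathered into Y12-Y15 as lane pairs (i0,i1 | i2,i3), (i4,i5 | i6,i7),
// and so on, with X11 as scratch for the upper 128-bit halves.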
#define LOAD_MSG_AVX2(src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \
    MOVQ i0*8(src), X12; \
    PINSRQ $1, i1*8(src), X12; \
    MOVQ i2*8(src), X11; \
    PINSRQ $1, i3*8(src), X11; \
    VINSERTI128 $1, X11, Y12, Y12; \
    MOVQ i4*8(src), X13; \
    PINSRQ $1, i5*8(src), X13; \
    MOVQ i6*8(src), X11; \
    PINSRQ $1, i7*8(src), X11; \
    VINSERTI128 $1, X11, Y13, Y13; \
    MOVQ i8*8(src), X14; \
    PINSRQ $1, i9*8(src), X14; \
    MOVQ i10*8(src), X11; \
    PINSRQ $1, i11*8(src), X11; \
    VINSERTI128 $1, X11, Y14, Y14; \
    MOVQ i12*8(src), X15; \
    PINSRQ $1, i13*8(src), X15; \
    MOVQ i14*8(src), X11; \
    PINSRQ $1, i15*8(src), X11; \
    VINSERTI128 $1, X11, Y15, Y15

// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
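// h is the 8-word chaining value, c the 128-bit byte counter, flag the
// final-block flag, and blocks a sequence of full 128-byte message blocks.
// The prologue rounds SP up to a 32-byte boundary so that the VMOVDQA spills
// of YMM registers stay aligned; the original SP is kept in DX and restored
// before returning.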
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
    MOVQ h+0(FP), AX
    MOVQ c+8(FP), BX
    MOVQ flag+16(FP), CX
    MOVQ blocks_base+24(FP), SI
    MOVQ blocks_len+32(FP), DI

    MOVQ SP, DX
    MOVQ SP, R9
    ADDQ $31, R9
    ANDQ $~31, R9
    MOVQ R9, SP

    MOVQ CX, 16(SP)
    XORQ CX, CX
    MOVQ CX, 24(SP)

    VMOVDQU ·AVX2_c40<>(SB), Y4
    VMOVDQU ·AVX2_c48<>(SB), Y5

    VMOVDQU 0(AX), Y8
    VMOVDQU 32(AX), Y9
    VMOVDQU ·AVX2_iv0<>(SB), Y6
    VMOVDQU ·AVX2_iv1<>(SB), Y7

    MOVQ 0(BX), R8
    MOVQ 8(BX), R9
    MOVQ R9, 8(SP)

loop:
    ADDQ $128, R8
    MOVQ R8, 0(SP)
    CMPQ R8, $128
    JGE noinc
    INCQ R9
    MOVQ R9, 8(SP)

noinc:
    VMOVDQA Y8, Y0
    VMOVDQA Y9, Y1
    VMOVDQA Y6, Y2
    VPXOR 0(SP), Y7, Y3

    LOAD_MSG_AVX2(SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15)
    VMOVDQA Y12, 32(SP)
    VMOVDQA Y13, 64(SP)
    VMOVDQA Y14, 96(SP)
    VMOVDQA Y15, 128(SP)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3)
    VMOVDQA Y12, 160(SP)
    VMOVDQA Y13, 192(SP)
    VMOVDQA Y14, 224(SP)
    VMOVDQA Y15, 256(SP)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

    LOAD_MSG_AVX2(SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
    LOAD_MSG_AVX2(SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0)
    ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

    ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
    ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)

    VPXOR Y0, Y8, Y8
    VPXOR Y1, Y9, Y9
    VPXOR Y2, Y8, Y8
    VPXOR Y3, Y9, Y9

    LEAQ 128(SI), SI
    SUBQ $128, DI
    JNE loop

    MOVQ R8, 0(BX)
    MOVQ R9, 8(BX)

    VMOVDQU Y8, 0(AX)
    VMOVDQU Y9, 32(AX)

    MOVQ DX, SP
    RET

// unfortunately the BYTE representation of VPUNPCKLQDQ and VPUNPCKHQDQ must be used
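// The macro names list the operands in Go assembler order, sources first and
// destination last; for example VPUNPCKLQDQ_X8_X8_X10 encodes
// VPUNPCKLQDQ X8, X8, X10.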
#define VPUNPCKLQDQ_X8_X8_X10 BYTE $0xC4; BYTE $0x41; BYTE $0x39; BYTE $0x6C; BYTE $0xD0
#define VPUNPCKHQDQ_X7_X10_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF2
#define VPUNPCKLQDQ_X7_X7_X10 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xD7
#define VPUNPCKHQDQ_X8_X10_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x39; BYTE $0x6D; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X10 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xD3
#define VPUNPCKHQDQ_X2_X10_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD2
#define VPUNPCKLQDQ_X9_X9_X10 BYTE $0xC4; BYTE $0x41; BYTE $0x31; BYTE $0x6C; BYTE $0xD1
#define VPUNPCKHQDQ_X3_X10_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDA
#define VPUNPCKLQDQ_X2_X2_X10 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xD2
#define VPUNPCKHQDQ_X3_X10_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD2
#define VPUNPCKHQDQ_X8_X10_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x39; BYTE $0x6D; BYTE $0xDA
#define VPUNPCKHQDQ_X6_X10_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF2
#define VPUNPCKHQDQ_X7_X10_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFA

// shuffle X2 and X6 using the temp registers X8, X9, X10
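// With the 4x4 state split across register pairs (X2:X3, X4:X5, X6:X7 hold
// rows b, c and d), SHUFFLE_AVX rotates those rows left by one, two and three
// 64-bit lanes respectively so that the next HALF_ROUND_AVX operates on the
// state's diagonals; SHUFFLE_AVX_INV restores the column layout.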
#define SHUFFLE_AVX() \
    VMOVDQA X4, X9; \
    VMOVDQA X5, X4; \
    VMOVDQA X9, X5; \
    VMOVDQA X6, X8; \
    VPUNPCKLQDQ_X8_X8_X10; \
    VPUNPCKHQDQ_X7_X10_X6; \
    VPUNPCKLQDQ_X7_X7_X10; \
    VPUNPCKHQDQ_X8_X10_X7; \
    VPUNPCKLQDQ_X3_X3_X10; \
    VMOVDQA X2, X9; \
    VPUNPCKHQDQ_X2_X10_X2; \
    VPUNPCKLQDQ_X9_X9_X10; \
    VPUNPCKHQDQ_X3_X10_X3

// inverse shuffle X2 and X6 using the temp registers X8, X9, X10
#define SHUFFLE_AVX_INV() \
    VMOVDQA X4, X9; \
    VMOVDQA X5, X4; \
    VMOVDQA X9, X5; \
    VMOVDQA X2, X8; \
    VPUNPCKLQDQ_X2_X2_X10; \
    VPUNPCKHQDQ_X3_X10_X2; \
    VPUNPCKLQDQ_X3_X3_X10; \
    VPUNPCKHQDQ_X8_X10_X3; \
    VPUNPCKLQDQ_X7_X7_X10; \
    VMOVDQA X6, X9; \
    VPUNPCKHQDQ_X6_X10_X6; \
    VPUNPCKLQDQ_X9_X9_X10; \
    VPUNPCKHQDQ_X7_X10_X7

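// HALF_ROUND_AVX applies the BLAKE2b G function to all four columns (or, after
// SHUFFLE_AVX, to all four diagonals) of the state held in v0-v7, two state
// words per 128-bit register, consuming the four message vectors m0-m3.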
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
    VPADDQ m0, v0, v0; \
    VPADDQ v2, v0, v0; \
    VPADDQ m1, v1, v1; \
    VPADDQ v3, v1, v1; \
    VPXOR v0, v6, v6; \
    VPXOR v1, v7, v7; \
    VPSHUFD $-79, v6, v6; \
    VPSHUFD $-79, v7, v7; \
    VPADDQ v6, v4, v4; \
    VPADDQ v7, v5, v5; \
    VPXOR v4, v2, v2; \
    VPXOR v5, v3, v3; \
    VPSHUFB c40, v2, v2; \
    VPSHUFB c40, v3, v3; \
    VPADDQ m2, v0, v0; \
    VPADDQ v2, v0, v0; \
    VPADDQ m3, v1, v1; \
    VPADDQ v3, v1, v1; \
    VPXOR v0, v6, v6; \
    VPXOR v1, v7, v7; \
    VPSHUFB c48, v6, v6; \
    VPSHUFB c48, v7, v7; \
    VPADDQ v6, v4, v4; \
    VPADDQ v7, v5, v5; \
    VPXOR v4, v2, v2; \
    VPXOR v5, v3, v3; \
    VPADDQ v2, v2, t0; \
    VPSRLQ $63, v2, v2; \
    VPXOR t0, v2, v2; \
    VPADDQ v3, v3, t0; \
    VPSRLQ $63, v3, v3; \
    VPXOR t0, v3, v3

// unfortunately the BYTE representation of VPINSRQ must be used
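// Each macro encodes VPINSRQ $1, <gp reg>, <xmm>, <xmm>, i.e. it places the
// named 64-bit general-purpose register into the upper qword of the XMM
// register.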
#define VPINSRQ_1_R10_X8_X8 BYTE $0xC4; BYTE $0x43; BYTE $0xB9; BYTE $0x22; BYTE $0xC2; BYTE $0x01
#define VPINSRQ_1_R11_X9_X9 BYTE $0xC4; BYTE $0x43; BYTE $0xB1; BYTE $0x22; BYTE $0xCB; BYTE $0x01
#define VPINSRQ_1_R12_X10_X10 BYTE $0xC4; BYTE $0x43; BYTE $0xA9; BYTE $0x22; BYTE $0xD4; BYTE $0x01
#define VPINSRQ_1_R13_X11_X11 BYTE $0xC4; BYTE $0x43; BYTE $0xA1; BYTE $0x22; BYTE $0xDD; BYTE $0x01
#define VPINSRQ_1_R9_X8_X8 BYTE $0xC4; BYTE $0x43; BYTE $0xB9; BYTE $0x22; BYTE $0xC1; BYTE $0x01

// load src into X8, X9, X10 and X11 using R10, R11, R12 and R13 for temp registers
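// i0..i7 are sigma-permuted message word indices for the current round; the
// even-positioned arguments land in the low qwords of X8-X11 and the
// odd-positioned ones are inserted into the high qwords via the GP registers.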
#define LOAD_MSG_AVX(src, i0, i1, i2, i3, i4, i5, i6, i7) \
    MOVQ i0*8(src), X8; \
    MOVQ i1*8(src), R10; \
    MOVQ i2*8(src), X9; \
    MOVQ i3*8(src), R11; \
    MOVQ i4*8(src), X10; \
    MOVQ i5*8(src), R12; \
    MOVQ i6*8(src), X11; \
    MOVQ i7*8(src), R13; \
    VPINSRQ_1_R10_X8_X8; \
    VPINSRQ_1_R11_X9_X9; \
    VPINSRQ_1_R12_X10_X10; \
    VPINSRQ_1_R13_X11_X11

// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
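// Same interface as hashBlocksAVX2: h is the chaining value, c the 128-bit
// counter, flag the final-block flag, and blocks a sequence of full 128-byte
// blocks. Here the state lives in 128-bit registers, so every half round
// needs an explicit shuffle between the column and diagonal steps; the
// original SP is saved in BP and SP is rounded up to a 16-byte boundary.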
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
    MOVQ h+0(FP), AX
    MOVQ c+8(FP), BX
    MOVQ flag+16(FP), CX
    MOVQ blocks_base+24(FP), SI
    MOVQ blocks_len+32(FP), DI

    MOVQ SP, BP
    MOVQ SP, R9
    ADDQ $15, R9
    ANDQ $~15, R9
    MOVQ R9, SP

    MOVOU ·AVX_c40<>(SB), X13
    MOVOU ·AVX_c48<>(SB), X14

    VMOVDQU ·AVX_iv3<>(SB), X0
    VMOVDQA X0, 0(SP)
    XORQ CX, 0(SP) // 0(SP) = ·AVX_iv3 ^ (CX || 0)

    VMOVDQU 0(AX), X12
    VMOVDQU 16(AX), X15
    VMOVDQU 32(AX), X2
    VMOVDQU 48(AX), X3

    MOVQ 0(BX), R8
    MOVQ 8(BX), R9

loop:
    ADDQ $128, R8
    CMPQ R8, $128
    JGE noinc
    INCQ R9

noinc:
    MOVQ R8, X8
    VPINSRQ_1_R9_X8_X8

    VMOVDQA X12, X0
    VMOVDQA X15, X1
    VMOVDQU ·AVX_iv0<>(SB), X4
    VMOVDQU ·AVX_iv1<>(SB), X5
    VMOVDQU ·AVX_iv2<>(SB), X6

    VPXOR X8, X6, X6
    VMOVDQA 0(SP), X7

    LOAD_MSG_AVX(SI, 0, 2, 4, 6, 1, 3, 5, 7)
    VMOVDQA X8, 16(SP)
    VMOVDQA X9, 32(SP)
    VMOVDQA X10, 48(SP)
    VMOVDQA X11, 64(SP)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 8, 10, 12, 14, 9, 11, 13, 15)
    VMOVDQA X8, 80(SP)
    VMOVDQA X9, 96(SP)
    VMOVDQA X10, 112(SP)
    VMOVDQA X11, 128(SP)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 14, 4, 9, 13, 10, 8, 15, 6)
    VMOVDQA X8, 144(SP)
    VMOVDQA X9, 160(SP)
    VMOVDQA X10, 176(SP)
    VMOVDQA X11, 192(SP)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 1, 0, 11, 5, 12, 2, 7, 3)
    VMOVDQA X8, 208(SP)
    VMOVDQA X9, 224(SP)
    VMOVDQA X10, 240(SP)
    VMOVDQA X11, 256(SP)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 11, 12, 5, 15, 8, 0, 2, 13)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 10, 3, 7, 9, 14, 6, 1, 4)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 7, 3, 13, 11, 9, 1, 12, 14)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 2, 5, 4, 15, 6, 10, 0, 8)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 9, 5, 2, 10, 0, 7, 4, 15)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 14, 11, 6, 3, 1, 12, 8, 13)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 2, 6, 0, 8, 12, 10, 11, 3)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 4, 7, 15, 1, 13, 5, 14, 9)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 12, 1, 14, 4, 5, 15, 13, 10)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 0, 6, 9, 8, 7, 3, 2, 11)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 13, 7, 12, 3, 11, 14, 1, 9)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 5, 15, 8, 2, 0, 4, 6, 10)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 6, 14, 11, 0, 15, 9, 3, 8)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 12, 13, 1, 10, 2, 7, 4, 5)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    LOAD_MSG_AVX(SI, 10, 8, 7, 1, 2, 4, 6, 5)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX()
    LOAD_MSG_AVX(SI, 15, 9, 3, 13, 11, 14, 12, 0)
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
    SHUFFLE_AVX_INV()

    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X11, X13, X14)
    SHUFFLE_AVX()
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X11, X13, X14)
    SHUFFLE_AVX_INV()
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X11, X13, X14)
    SHUFFLE_AVX()
    HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X11, X13, X14)
    SHUFFLE_AVX_INV()

    VMOVDQU 32(AX), X10
    VMOVDQU 48(AX), X11
    VPXOR X0, X12, X12
    VPXOR X1, X15, X15
    VPXOR X2, X10, X10
    VPXOR X3, X11, X11
    VPXOR X4, X12, X12
    VPXOR X5, X15, X15
    VPXOR X6, X10, X2
    VPXOR X7, X11, X3
    VMOVDQU X2, 32(AX)
    VMOVDQU X3, 48(AX)

    LEAQ 128(SI), SI
    SUBQ $128, DI
    JNE loop

    VMOVDQU X12, 0(AX)
    VMOVDQU X15, 16(AX)

    MOVQ R8, 0(BX)
    MOVQ R9, 8(BX)

    VZEROUPPER

    MOVQ BP, SP
    RET

// func supportsAVX2() bool
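// runtime·support_avx2 and runtime·support_avx are feature flags the Go
// runtime sets during startup from CPUID detection, so these wrappers simply
// forward them to Go code.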
TEXT ·supportsAVX2(SB), 4, $0-1
    MOVQ runtime·support_avx2(SB), AX
    MOVB AX, ret+0(FP)
    RET

// func supportsAVX() bool
TEXT ·supportsAVX(SB), 4, $0-1
    MOVQ runtime·support_avx(SB), AX
    MOVB AX, ret+0(FP)
    RET