
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The perl script that generates this ppc64 assembler can be found in
// the cryptogams repository below; it is based on the original from
// openssl:
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
//
// The differences between this and the original implementation are due
// to the Go calling conventions and the initialization of constants.

// +build ppc64le,!gccgo,!appengine

#include "textflag.h"
#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE R16

#define X0  R11
#define X1  R12
#define X2  R14
#define X3  R15
#define X4  R16
#define X5  R17
#define X6  R18
#define X7  R19
#define X8  R20
#define X9  R21
#define X10 R22
#define X11 R23
#define X12 R24
#define X13 R25
#define X14 R26
#define X15 R27
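// Layout of the constant table below (my reading of the values):
// offsets 0x00-0x0f hold the ChaCha "expand 32-byte k" sigma constant;
// 0x10-0x2f hold the {1,0,0,0} and {4,0,0,0} counter increments;
// 0x30-0x4f appear to be byte-permutation masks for VPERM-based rotates
// in the generator's scalar path; 0x50-0x8f hold each sigma word
// splatted across a full vector for the four-block VSX path; and
// 0x90-0x9f holds the per-lane counter offsets {0,1,2,3}.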
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0
// func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB), NOSPLIT, $64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD cnt+32(FP), CNT

	// Addressing for constants: R8-R11 hold the byte offsets 16, 32,
	// 48 and 64 used as indexes by the vector loads and stores below.
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	// Load the sigma constant into V16 (VS48).
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80, CONSTBASE

	// Load the key into V17, V18.
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load the counter and nonce into V19.
	LXVW4X (CNT)(R0), VS51

	// Clear V27.
	VXOR V27, V27, V27

	// Load the lane offsets {0, 1, 2, 3} into V28 (VS60).
	LXVW4X (CONSTBASE)(R11), VS60

	// Splat the 32-bit counter from V19 into V26, shift the counter
	// word out of V19, and give each lane of V26 its own block counter
	// by adding the offsets in V28.
	VSPLTW $0, V19, V26
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19
	VADDUWM V26, V28, V26

	// Ten double rounds per block.
	MOVD $10, R14
	MOVD R14, CTR
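// The loop below generates four 64-byte ChaCha blocks per outer
// iteration. Each of V0-V15 holds one of the 16 state words, with the
// four vector lanes carrying the four blocks in parallel.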
loop_outer_vsx:
	// Load the splatted sigma words into V0-V3.
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// Splat the key words from V17, V18 into V4-V11.
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// Copy the per-block counters into V12.
	VOR V26, V26, V12

	// Splat the nonce words from V19 into V13, V14, V15.
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Splat the rotation counts 16, 12, 8 and 7 ($-16 splats to 16 in
	// the low five bits used by VRLW).
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
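// Each iteration of loop_vsx performs one double round on all four
// blocks: a column round followed by a diagonal round, each made up of
// four parallel quarter rounds.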
loop_vsx:
	// Column round: quarter rounds on (0,4,8,12), (1,5,9,13),
	// (2,6,10,14) and (3,7,11,15).
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	// Diagonal round: quarter rounds on (0,5,10,15), (1,6,11,12),
	// (2,7,8,13) and (3,4,9,14).
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4

	// Decrement CTR and loop while it is nonzero (bdnz).
	BC 16, LT, loop_vsx
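	// The state words are lane-sliced: V0-V15 each hold one word of
	// all four blocks. The VMRGEW/VMRGOW and XXPERMDI sequence below
	// transposes them so that V0,V4,V8,V12 hold the first block,
	// V1,V5,V9,V13 the second, and so on.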
	// Add the initial per-block counters into word 12 before the
	// transpose; the transposed V12 then only needs the nonce words
	// from V19 added.
	VADDUWM V12, V26, V12

	WORD $0x13600F8C // VMRGEW V0, V1, V27
	WORD $0x13821F8C // VMRGEW V2, V3, V28

	WORD $0x10000E8C // VMRGOW V0, V1, V0
	WORD $0x10421E8C // VMRGOW V2, V3, V2

	WORD $0x13A42F8C // VMRGEW V4, V5, V29
	WORD $0x13C63F8C // VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C // VMRGOW V4, V5, V4
	WORD $0x10C63E8C // VMRGOW V6, V7, V6

	WORD $0x13684F8C // VMRGEW V8, V9, V27
	WORD $0x138A5F8C // VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C // VMRGOW V8, V9, V8
	WORD $0x114A5E8C // VMRGOW V10, V11, V10

	WORD $0x13AC6F8C // VMRGEW V12, V13, V29
	WORD $0x13CE7F8C // VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C // VMRGOW V12, V13, V12
	WORD $0x11CE7E8C // VMRGOW V14, V15, V14

	// Bump each block counter by 4 for the next outer iteration.
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	// Add the input words (sigma, key, counter/nonce) to the first block.
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx
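	// Up to four full 64-byte blocks are written out below, one per
	// transposed register set: each block of input is XORed with the
	// keystream and stored, BEQ exits once LEN reaches zero, and BLT
	// drops into tail_vsx for a final partial block.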
	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	// Reset the round counter and loop while input remains.
	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx
done_vsx:
	// Increment counter by 4
	MOVD (CNT), R14
	ADD $4, R14
	MOVD R14, (CNT)
	RET
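// Partial final block: write the whole keystream block to a scratch
// area on the stack, XOR the remaining input against it one byte at a
// time, then overwrite the scratch area so no keystream is left behind.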
tail_vsx:
	ADD $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)

	// Step back one byte so the pre-incrementing accesses below start
	// at the first byte.
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copy the result to OUT one byte at a time.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BC 16, LT, looptail_vsx

	// Clear the stack values
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx