sum_vmsl_s390x.s 24 KB


  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build s390x,go1.11,!gccgo,!appengine
  5. #include "textflag.h"
  6. // Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
  // Register aliases used throughout this file. V0 is deliberately not aliased:
  // the function zeroes it on entry and the macros use it as a zero operand.
  7. // constants: VPERM byte-selection vectors, loaded from ·constants<> or ·c<>
  8. #define EX0 V1
  9. #define EX1 V2
  10. #define EX2 V3
  11. // temporaries (scratch vector registers handed to the macros)
  12. #define T_0 V4
  13. #define T_1 V5
  14. #define T_2 V6
  15. #define T_3 V7
  16. #define T_4 V8
  17. #define T_5 V9
  18. #define T_6 V10
  19. #define T_7 V11
  20. #define T_8 V12
  21. #define T_9 V13
  22. #define T_10 V14
  23. // r**2 & r**4: three limbs (R_0..R_2) plus the 20x multiples of limbs 1 and 2 (R5_1, R5_2)
  24. #define R_0 V15
  25. #define R_1 V16
  26. #define R_2 V17
  27. #define R5_1 V18
  28. #define R5_2 V19
  29. // key (r): limbs of r and of 20*r kept in general registers for the final blocks
  30. #define RSAVE_0 R7
  31. #define RSAVE_1 R8
  32. #define RSAVE_2 R9
  33. #define R5SAVE_1 R10
  34. #define R5SAVE_2 R11
  35. // message block (also reused as scratch by REDUCE/REDUCE2 call sites)
  36. #define M0 V20
  37. #define M1 V21
  38. #define M2 V22
  39. #define M3 V23
  40. #define M4 V24
  41. #define M5 V25
  42. // accumulator: limbs 0..2 of the two running sums (suffix _0 and _1)
  43. #define H0_0 V26
  44. #define H1_0 V27
  45. #define H2_0 V28
  46. #define H0_1 V29
  47. #define H1_1 V30
  48. #define H2_1 V31
  // keyMask is ANDed with the first 16 bytes of the key to form r (see "setup r").
  49. GLOBL ·keyMask<>(SB), RODATA, $16
  50. DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
  51. DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
  // bswapMask is a VPERM selector listing byte indexes 15..0, i.e. a 16-byte reversal.
  52. GLOBL ·bswapMask<>(SB), RODATA, $16
  53. DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
  54. DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
  // constants holds the EX0..EX2 selectors used by EXPACC/EXPACC2. In each entry the
  // first doubleword (0x18..0x1f) selects bytes 8-15 of the second VPERM source and
  // the second doubleword selects message bytes, in reverse order, for one limb.
  55. GLOBL ·constants<>(SB), RODATA, $48
  56. // EX0
  57. DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
  58. DATA ·constants<>+8(SB)/8, $0x0000050403020100
  59. // EX1
  60. DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
  61. DATA ·constants<>+24(SB)/8, $0x00000a0908070605
  62. // EX2
  63. DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
  64. DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
  // c holds the EX0..EX2 selectors used by the in-lined expansion in the main loop:
  // the first doubleword selects reversed bytes of the first VPERM source (0x00..0x0f),
  // the second doubleword the same bytes of the second source (0x10..0x1f).
  65. GLOBL ·c<>(SB), RODATA, $48
  66. // EX0
  67. DATA ·c<>+0(SB)/8, $0x0000050403020100
  68. DATA ·c<>+8(SB)/8, $0x0000151413121110
  69. // EX1
  70. DATA ·c<>+16(SB)/8, $0x00000a0908070605
  71. DATA ·c<>+24(SB)/8, $0x00001a1918171615
  72. // EX2
  73. DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
  74. DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
  // reduce holds the two carry-clearing masks that REDUCE loads through R12.
  // Only the second doubleword of each entry is populated.
  75. GLOBL ·reduce<>(SB), RODATA, $32
  76. // 44 bit
  77. DATA ·reduce<>+0(SB)/8, $0x0
  78. DATA ·reduce<>+8(SB)/8, $0xfffffffffff
  79. // 42 bit
  80. DATA ·reduce<>+16(SB)/8, $0x0
  81. DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
  // h = (f*g) % (2**130-5) [partial reduction]
  //
  // MULTIPLY forms two independent three-limb products with VMSLG, each added
  // to a caller-supplied addend. Shown for the m02/m4 operands (m13/m5 are
  // handled identically, using t5...t9 instead of t0...t4):
  //   m02_0 = m02_0*r_0 + m02_1*r5_2 + m02_2*r5_1 + m4_0
  //   m02_1 = m02_0*r_1 + m02_1*r_0  + m02_2*r5_2 + m4_1
  //   m02_2 = m02_0*r_2 + m02_1*r_1  + m02_2*r_0  + m4_2
  // Every product is a VMSLG multiply-sum over both doubleword lanes of its
  // operands. No carries are propagated here; call sites follow up with
  // REDUCE or REDUCE2.
  //
  // input:    m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2,
  //           r5_1, r5_2 (call sites pass 20x the limbs r_1 and r_2),
  //           m4_0, m4_1, m4_2, m5_0, m5_1, m5_2 (addends)
  // temp:     t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
  // output:   m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
  // clobbers: m4_*, m5_* (used as accumulators) and t0...t9
  // requires: V0 == 0, it is the zero addend for the products kept in temps
  #define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
      \ // Eliminate the dependency for the last 2 VMSLs
      VMSLG m02_0, r_2, m4_2, m4_2 \
      VMSLG m13_0, r_2, m5_2, m5_2 \ // 8 VMSLs pipelined
      VMSLG m02_0, r_0, m4_0, m4_0 \
      VMSLG m02_1, r5_2, V0, t0    \
      VMSLG m02_0, r_1, m4_1, m4_1 \
      VMSLG m02_1, r_0, V0, t1     \
      VMSLG m02_1, r_1, V0, t2     \
      VMSLG m02_2, r5_1, V0, t3    \
      VMSLG m02_2, r5_2, V0, t4    \
      VMSLG m13_0, r_0, m5_0, m5_0 \
      VMSLG m13_1, r5_2, V0, t5    \
      VMSLG m13_0, r_1, m5_1, m5_1 \
      VMSLG m13_1, r_0, V0, t6     \
      VMSLG m13_1, r_1, V0, t7     \
      VMSLG m13_2, r5_1, V0, t8    \
      VMSLG m13_2, r5_2, V0, t9    \
      VMSLG m02_2, r_0, m4_2, m4_2 \
      VMSLG m13_2, r_0, m5_2, m5_2 \
      VAQ   m4_0, t0, m02_0        \ // sum the partial products per limb
      VAQ   m4_1, t1, m02_1        \
      VAQ   m5_0, t5, m13_0        \
      VAQ   m5_1, t6, m13_1        \
      VAQ   m02_0, t3, m02_0       \
      VAQ   m02_1, t4, m02_1       \
      VAQ   m13_0, t8, m13_0       \
      VAQ   m13_1, t9, m13_1       \
      VAQ   m4_2, t2, m02_2        \
      VAQ   m5_2, t7, m13_2
  // SQUARE adds the unreduced square of the three-limb value (a0, a1, a2) to
  // (sq0, sq1, sq2):
  //   sq0 += a0*a0  + 2*(a1*a2w)
  //   sq1 += a2*a2w + 2*(a0*a1)
  //   sq2 += a1*a1  + 2*(a0*a2)
  // Every product is a VMSLG multiply-sum over both doubleword lanes.
  // input:    a0, a1, a2, a2w (call sites pass 20x limb a2)
  // in/out:   sq0, sq1, sq2 (call sites zero them first)
  // temp:     x0, x1, x2 (hold the cross terms)
  // requires: V0 == 0, it is the zero addend for the cross terms
  #define SQUARE(a0, a1, a2, a2w, sq0, sq1, sq2, x0, x1, x2) \
      VMSLG a0, a0, sq0, sq0  \ // sq0 += a0*a0
      VMSLG a1, a2w, V0, x0   \ // x0 = a1*a2w
      VMSLG a2, a2w, sq1, sq1 \ // sq1 += a2*a2w
      VMSLG a0, a1, V0, x1    \ // x1 = a0*a1
      VMSLG a1, a1, sq2, sq2  \ // sq2 += a1*a1
      VMSLG a0, a2, V0, x2    \ // x2 = a0*a2
      VAQ   x0, sq0, sq0      \ // first copy of each cross term
      VAQ   x1, sq1, sq1      \
      VAQ   x2, sq2, sq2      \
      VAQ   x0, sq0, sq0      \ // second copy of each cross term
      VAQ   x1, sq1, sq1      \
      VAQ   x2, sq2, sq2
  // REDUCE propagates carries for two three-limb accumulators at once.
  135. // carry h0->h1->h2->h0 || h3->h4->h5->h3
  // The first pass works on the six registers separately; the h0..h2 limbs are then
  // moved into the other doubleword of h3..h5 ("[h0/1/2, h3/4/5]") and a second,
  // per-doubleword carry pass is applied to h3..h5 only.
  136. // implicit input: R12 must point at ·reduce<> (44 bit and 42 bit masks)
  137. // the carry out of the top limb is multiplied by 5 before it is added to the bottom limb
  138. // input: h0, h1, h2, h3, h4, h5
  139. // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
  140. // output: h3, h4, h5 hold both accumulators, one per doubleword; h0, h1, h2 are left as scratch
  141. #define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
  142. VLM (R12), t6, t7 \ // 44 and 42 bit clear mask
  143. VLEIB $7, $0x28, t10 \ // 5 byte shift mask
  144. VREPIB $4, t8 \ // 4 bit shift mask
  145. VREPIB $2, t11 \ // 2 bit shift mask
  146. VSRLB t10, h0, t0 \ // h0 byte shift
  147. VSRLB t10, h1, t1 \ // h1 byte shift
  148. VSRLB t10, h2, t2 \ // h2 byte shift
  149. VSRLB t10, h3, t3 \ // h3 byte shift
  150. VSRLB t10, h4, t4 \ // h4 byte shift
  151. VSRLB t10, h5, t5 \ // h5 byte shift
  152. VSRL t8, t0, t0 \ // h0 bit shift
  153. VSRL t8, t1, t1 \ // h1 bit shift
  154. VSRL t11, t2, t2 \ // h2 bit shift
  155. VSRL t8, t3, t3 \ // h3 bit shift
  156. VSRL t8, t4, t4 \ // h4 bit shift
  157. VESLG $2, t2, t9 \ // h2 carry x5
  158. VSRL t11, t5, t5 \ // h5 bit shift
  159. VN t6, h0, h0 \ // h0 clear carry
  160. VAQ t2, t9, t2 \ // h2 carry x5
  161. VESLG $2, t5, t9 \ // h5 carry x5
  162. VN t6, h1, h1 \ // h1 clear carry
  163. VN t7, h2, h2 \ // h2 clear carry
  164. VAQ t5, t9, t5 \ // h5 carry x5
  165. VN t6, h3, h3 \ // h3 clear carry
  166. VN t6, h4, h4 \ // h4 clear carry
  167. VN t7, h5, h5 \ // h5 clear carry
  168. VAQ t0, h1, h1 \ // h0->h1
  169. VAQ t3, h4, h4 \ // h3->h4
  170. VAQ t1, h2, h2 \ // h1->h2
  171. VAQ t4, h5, h5 \ // h4->h5
  172. VAQ t2, h0, h0 \ // h2->h0
  173. VAQ t5, h3, h3 \ // h5->h3
  174. VREPG $1, t6, t6 \ // 44 and 42 bit masks across both halves
  175. VREPG $1, t7, t7 \
  176. VSLDB $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
  177. VSLDB $8, h1, h1, h1 \
  178. VSLDB $8, h2, h2, h2 \
  179. VO h0, h3, h3 \
  180. VO h1, h4, h4 \
  181. VO h2, h5, h5 \
  182. VESRLG $44, h3, t0 \ // 44 bit shift right
  183. VESRLG $44, h4, t1 \
  184. VESRLG $42, h5, t2 \ // 42 bit shift right (top limb)
  185. VN t6, h3, h3 \ // clear carry bits
  186. VN t6, h4, h4 \
  187. VN t7, h5, h5 \
  188. VESLG $2, t2, t9 \ // multiply carry by 5
  189. VAQ t9, t2, t2 \
  190. VAQ t0, h4, h4 \
  191. VAQ t1, h5, h5 \
  192. VAQ t2, h3, h3 \
  // REDUCE2 propagates carries for a single three-limb accumulator. It runs the
  // carry chain twice and builds its masks with VGBM instead of loading them.
  193. // carry h0->h1->h2->h0
  194. // input: h0, h1, h2
  195. // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
  196. // output: h0, h1, h2
  197. #define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
  198. VLEIB $7, $0x28, t3 \ // 5 byte shift mask
  199. VREPIB $4, t4 \ // 4 bit shift mask
  200. VREPIB $2, t7 \ // 2 bit shift mask
  201. VGBM $0x003F, t5 \ // mask to clear carry bits
  202. VSRLB t3, h0, t0 \ // first pass: extract the carries (byte shift, then bit shift)
  203. VSRLB t3, h1, t1 \
  204. VSRLB t3, h2, t2 \
  205. VESRLG $4, t5, t5 \ // 44 bit clear mask
  206. VSRL t4, t0, t0 \
  207. VSRL t4, t1, t1 \
  208. VSRL t7, t2, t2 \
  209. VESRLG $2, t5, t6 \ // 42 bit clear mask
  210. VESLG $2, t2, t8 \ // h2 carry x4 ...
  211. VAQ t8, t2, t2 \ // ... plus itself: h2 carry x5
  212. VN t5, h0, h0 \ // clear carry bits
  213. VN t5, h1, h1 \
  214. VN t6, h2, h2 \
  215. VAQ t0, h1, h1 \ // h0->h1
  216. VAQ t1, h2, h2 \ // h1->h2
  217. VAQ t2, h0, h0 \ // h2->h0
  218. VSRLB t3, h0, t0 \ // second pass: same steps again
  219. VSRLB t3, h1, t1 \
  220. VSRLB t3, h2, t2 \
  221. VSRL t4, t0, t0 \
  222. VSRL t4, t1, t1 \
  223. VSRL t7, t2, t2 \
  224. VN t5, h0, h0 \
  225. VN t5, h1, h1 \
  226. VESLG $2, t2, t8 \
  227. VN t6, h2, h2 \
  228. VAQ t0, h1, h1 \
  229. VAQ t8, t2, t2 \
  230. VAQ t1, h2, h2 \
  231. VAQ t2, h0, h0 \
  // EXPACC splits two 16-byte message blocks into limbs with VPERM and the
  // EX0..EX2 selectors (which must hold the ·constants<> values).
  232. // expands two message blocks into the lower halfs of the d registers
  233. // moves the contents of the d registers into upper halfs
  // in1 goes to d0, d1, d2 and in2 goes to d3, d4, d5 (limbs 0, 1, 2 respectively).
  // The 2**128 padding bit is not set here; call sites add it with VLEIB.
  234. // input: in1, in2, d0, d1, d2, d3, d4, d5
  235. // temp: TEMP0, TEMP1, TEMP2, TEMP3
  236. // output: d0, d1, d2, d3, d4, d5
  237. #define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
  238. VGBM $0xff3f, TEMP0 \ // becomes the limb 0/1 mask after the shift below
  239. VGBM $0xff1f, TEMP1 \ // limb 2 mask: keeps the low 40 bits of the lower half
  240. VESLG $4, d1, TEMP2 \ // pre-shift old limb 1 so the VESRLG $4 below restores it
  241. VESLG $4, d4, TEMP3 \
  242. VESRLG $4, TEMP0, TEMP0 \ // keeps the low 44 bits of the lower half
  243. VPERM in1, d0, EX0, d0 \
  244. VPERM in2, d3, EX0, d3 \
  245. VPERM in1, d2, EX2, d2 \
  246. VPERM in2, d5, EX2, d5 \
  247. VPERM in1, TEMP2, EX1, d1 \
  248. VPERM in2, TEMP3, EX1, d4 \
  249. VN TEMP0, d0, d0 \
  250. VN TEMP0, d3, d3 \
  251. VESRLG $4, d1, d1 \ // limb 1 starts 4 bits into its first selected byte
  252. VESRLG $4, d4, d4 \
  253. VN TEMP1, d2, d2 \
  254. VN TEMP1, d5, d5 \
  255. VN TEMP0, d1, d1 \
  256. VN TEMP0, d4, d4 \
  // EXPACC2 is the single-block form of EXPACC; EX0..EX2 must hold the
  // ·constants<> values. The 2**128 padding bit is left to the call site.
  257. // expands one message block into the lower halfs of the d registers
  258. // moves the contents of the d registers into upper halfs
  259. // input: in, d0, d1, d2
  260. // temp: TEMP0, TEMP1, TEMP2
  261. // output: d0, d1, d2
  262. #define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
  263. VGBM $0xff3f, TEMP0 \ // becomes the limb 0/1 mask after the shift below
  264. VESLG $4, d1, TEMP2 \ // pre-shift old limb 1 so the VESRLG $4 below restores it
  265. VGBM $0xff1f, TEMP1 \ // limb 2 mask: keeps the low 40 bits of the lower half
  266. VPERM in, d0, EX0, d0 \
  267. VESRLG $4, TEMP0, TEMP0 \ // keeps the low 44 bits of the lower half
  268. VPERM in, d2, EX2, d2 \
  269. VPERM in, TEMP2, EX1, d1 \
  270. VN TEMP0, d0, d0 \
  271. VN TEMP1, d2, d2 \
  272. VESRLG $4, d1, d1 \ // limb 1 starts 4 bits into its first selected byte
  273. VN TEMP0, d1, d1 \
  274. // pack h2:h0 into h1:h0 (no carry)
  275. // input: h0, h1, h2
  276. // output: h0, h1, h2
  274. // pack h2:h0 into h1:h0 (no carry)
  275. // input: h0, h1, h2
  276. // output: h0, h1, h2
  // PACK joins the three limbs (limb 0 and limb 1 are 44 bits wide, limb 2
  // sits above them at bit 88) into one integer.
  // in:  limb0, limb1, limb2 -- one limb per register, in the low doubleword,
  //      with no carries pending
  // out: limb0 = low 128 bits of the value
  //      limb1 = the bits above bit 127, in the low doubleword
  //      limb2 is clobbered
  #define PACK(limb0, limb1, limb2) \
      VMRLG  limb1, limb2, limb2  \ // limb2 = [limb 1, limb 2]
      VESLG  $44, limb1, limb1    \ // low 20 bits of limb 1 move to bits 44-63
      VO     limb0, limb1, limb0  \ // low doubleword: limb 0 plus 20 bits of limb 1
      VESRLG $20, limb2, limb1    \ // upper doubleword: top 24 bits of limb 1
      VLEIG  $1, $0, limb1        \ // drop the limb 2 residue from the low doubleword
      VO     limb0, limb1, limb0  \ // limb0 now holds 88 bits (limbs 0 and 1)
      VLEIG  $0, $0, limb2        \ // limb2 = [0, limb 2]
      VESRLG $40, limb2, limb1    \ // limb1 = bits of limb 2 that land above bit 127
      VLEIB  $7, $88, limb1       \ // byte 7 doubles as the shift count: 88 bits = 11 bytes
      VSLB   limb1, limb2, limb2  \ // limb2 = limb 2 shifted left by 11 bytes
      VO     limb0, limb2, limb0  \ // limb0 now holds the low 128 bits
      VLEIG  $0, $0, limb1          // remove the shift count again
  // MOD performs the final conditional subtraction:
  //   if h >= 2**130-5 then h -= 2**130-5
  // It adds 5 to the low 128 bits, checks whether that addition carries the
  // high bits up to 2**130, and selects between the two candidates with masks
  // instead of branching.
  // in:   lo (low 128 bits of h), hi (bits of h above bit 127)
  // temp: sum, sel, tmp
  // out:  lo = low 128 bits of the reduced value
  #define MOD(lo, hi, sum, sel, tmp) \
      VZERO sum           \
      VLEIG $1, $5, sum   \ // sum = 5
      VACCQ lo, sum, sel  \ // sel = carry out of lo + 5
      VAQ   lo, sum, sum  \ // sum = lo + 5 (mod 2**128)
      VONE  tmp           \
      VLEIG $1, $-4, tmp  \ // tmp = -4 as a 128 bit value
      VAQ   tmp, sel, sel \ // sel = carry - 4
      VACCQ hi, sel, sel  \ // sel = 1 when hi + carry >= 4, i.e. h + 5 >= 2**130
      VONE  tmp           \
      VAQ   tmp, sel, sel \ // sel = 0 when h >= 2**130-5, all ones otherwise
      VN    lo, sel, tmp  \ // tmp = lo where h is kept as is
      VNC   sum, sel, sel \ // sel = lo + 5 where h is reduced
      VO    sel, tmp, lo    // merge the two candidates
  308. // func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
  //
  // Register roles as set up below: R1 = out, R2 = m (advanced as input is
  // consumed), R3 = remaining length, R4 = key, R12 = address of ·reduce<>
  // (read by REDUCE), RSAVE_*/R5SAVE_* = limbs of r and 20*r, V0 = zero.
  309. TEXT ·poly1305vmsl(SB), $0-32
  310. // This code processes 6 blocks up front and then 4 blocks (64 bytes) per loop iteration
  311. // using the algorithm described in:
  312. // NEON crypto, Daniel J. Bernstein & Peter Schwabe
  313. // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
  314. // And as modified for VMSL as described in
  315. // Accelerating Poly1305 Cryptographic Message Authentication on the z14
  316. // O'Farrell et al, CASCON 2017, p48-55
  317. // https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
  318. LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
  319. VZERO V0 // V0 = 0: zero addend for VMSLG and zero source for VMRHG/VMRLG
  320. // load EX0, EX1 and EX2
  321. MOVD $·constants<>(SB), R5
  322. VLM (R5), EX0, EX2 // selectors for EXPACC/EXPACC2
  323. // setup r
  324. VL (R4), T_0
  325. MOVD $·keyMask<>(SB), R6
  326. VL (R6), T_1
  327. VN T_0, T_1, T_0
  328. VZERO T_2 // limbs for r
  329. VZERO T_3
  330. VZERO T_4
  331. EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
  332. // T_2, T_3, T_4: [0, r]
  333. // setup r*20
  334. VLEIG $0, $0, T_0
  335. VLEIG $1, $20, T_0 // T_0: [0, 20]
  336. VZERO T_5
  337. VZERO T_6
  338. VMSLG T_0, T_3, T_5, T_5
  339. VMSLG T_0, T_4, T_6, T_6
  340. // store r for final block in GR
  341. VLGVG $1, T_2, RSAVE_0 // limb 0 of r
  342. VLGVG $1, T_3, RSAVE_1 // limb 1 of r
  343. VLGVG $1, T_4, RSAVE_2 // limb 2 of r
  344. VLGVG $1, T_5, R5SAVE_1 // 20 * limb 1 of r
  345. VLGVG $1, T_6, R5SAVE_2 // 20 * limb 2 of r
  346. // initialize h
  347. VZERO H0_0
  348. VZERO H1_0
  349. VZERO H2_0
  350. VZERO H0_1
  351. VZERO H1_1
  352. VZERO H2_1
  353. // initialize pointer for reduce constants
  354. MOVD $·reduce<>(SB), R12
  355. // calculate r**2 and 20*(r**2)
  356. VZERO R_0
  357. VZERO R_1
  358. VZERO R_2
  // T_5 is reused as scratch from here on; its value was saved in R5SAVE_1 above
  359. SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
  360. REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
  361. VZERO R5_1
  362. VZERO R5_2
  363. VMSLG T_0, R_1, R5_1, R5_1
  364. VMSLG T_0, R_2, R5_2, R5_2
  365. // skip r**4 calculation if 3 blocks or less
  366. CMPBLE R3, $48, b4
  367. // calculate r**4 and 20*(r**4)
  368. VZERO T_8
  369. VZERO T_9
  370. VZERO T_10
  371. SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
  372. REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
  373. VZERO T_2
  374. VZERO T_3
  375. VMSLG T_0, T_9, T_2, T_2
  376. VMSLG T_0, T_10, T_3, T_3
  377. // put r**2 to the right and r**4 to the left of R_0, R_1, R_2
  378. VSLDB $8, T_8, T_8, T_8
  379. VSLDB $8, T_9, T_9, T_9
  380. VSLDB $8, T_10, T_10, T_10
  381. VSLDB $8, T_2, T_2, T_2
  382. VSLDB $8, T_3, T_3, T_3
  383. VO T_8, R_0, R_0
  384. VO T_9, R_1, R_1
  385. VO T_10, R_2, R_2
  386. VO T_2, R5_1, R5_1
  387. VO T_3, R5_2, R5_2
  388. CMPBLE R3, $80, load // less than or equal to 5 blocks in message
  389. // 6(or 5+1) blocks
  // VLL takes the index of the last byte to load, hence the -81 / +1 adjustment;
  // afterwards R3 is the number of bytes beyond the first five blocks
  390. SUB $81, R3
  391. VLM (R2), M0, M4
  392. VLL R3, 80(R2), M5
  393. ADD $1, R3
  394. MOVBZ $1, R0
  395. CMPBGE R3, $16, 2(PC)
  // partial sixth block: insert the padding byte 1 at index R3
  396. VLVGB R3, R0, M5
  397. MOVD $96(R2), R2
  398. EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
  399. EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
  // set the padding bit in limb 2 of both doublewords (bytes 2 and 10)
  400. VLEIB $2, $1, H2_0
  401. VLEIB $2, $1, H2_1
  402. VLEIB $10, $1, H2_0
  403. VLEIB $10, $1, H2_1
  404. VZERO M0
  405. VZERO M1
  406. VZERO M2
  407. VZERO M3
  408. VZERO T_4
  409. VZERO T_10
  // blocks 5 and 6 become the addends of MULTIPLY (limbs in M0, M1, M2 and M3, M4, T_10)
  410. EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
  411. VLR T_4, M4
  412. VLEIB $10, $1, M2
  // the sixth block only gets the padding bit here when it is a full block
  413. CMPBLT R3, $16, 2(PC)
  414. VLEIB $10, $1, T_10
  415. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  416. REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
  // REDUCE leaves both sums in H*_1; split them into H*_0 (upper doubleword)
  // and H*_1 (lower doubleword), each with a zero upper half
  417. VMRHG V0, H0_1, H0_0
  418. VMRHG V0, H1_1, H1_0
  419. VMRHG V0, H2_1, H2_0
  420. VMRLG V0, H0_1, H0_1
  421. VMRLG V0, H1_1, H1_1
  422. VMRLG V0, H2_1, H2_1
  423. SUB $16, R3
  424. CMPBLE R3, $0, square
  // load: switch EX0..EX2 to the ·c<> selectors used by the in-lined expansion
  // in loop, then fall through into loop.
  425. load:
  426. // load EX0, EX1 and EX2
  427. MOVD $·c<>(SB), R5
  428. VLM (R5), EX0, EX2
  // loop: consume four full blocks (64 bytes) per iteration while more than
  // 64 bytes remain. Expects EX0..EX2 to hold the ·c<> selectors.
  429. loop:
  430. CMPBLE R3, $64, add // b4 // last 4 or less blocks left
  431. // next 4 full blocks
  432. VLM (R2), M2, M5
  433. SUB $64, R3
  434. MOVD $64(R2), R2
  435. REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
  436. // expacc in-lined to create [m2, m3] limbs
  437. VGBM $0x3f3f, T_0 // 44 bit clear mask
  438. VGBM $0x1f1f, T_1 // 40 bit clear mask
  439. VPERM M2, M3, EX0, T_3
  440. VESRLG $4, T_0, T_0 // 44 bit clear mask ready
  441. VPERM M2, M3, EX1, T_4
  442. VPERM M2, M3, EX2, T_5
  443. VN T_0, T_3, T_3
  444. VESRLG $4, T_4, T_4
  445. VN T_1, T_5, T_5
  446. VN T_0, T_4, T_4
  // pair each reduced sum (held in H*_1 after REDUCE) with one new block:
  // H*_0 takes the upper doublewords, H*_1 the lower doublewords
  447. VMRHG H0_1, T_3, H0_0
  448. VMRHG H1_1, T_4, H1_0
  449. VMRHG H2_1, T_5, H2_0
  450. VMRLG H0_1, T_3, H0_1
  451. VMRLG H1_1, T_4, H1_1
  452. VMRLG H2_1, T_5, H2_1
  // padding bit for the two blocks just merged in
  453. VLEIB $10, $1, H2_0
  454. VLEIB $10, $1, H2_1
  // the other two blocks (M4, M5) become the addends of MULTIPLY
  455. VPERM M4, M5, EX0, T_3
  456. VPERM M4, M5, EX1, T_4
  457. VPERM M4, M5, EX2, T_5
  458. VN T_0, T_3, T_3
  459. VESRLG $4, T_4, T_4
  460. VN T_1, T_5, T_5
  461. VN T_0, T_4, T_4
  462. VMRHG V0, T_3, M0
  463. VMRHG V0, T_4, M1
  464. VMRHG V0, T_5, M2
  465. VMRLG V0, T_3, M3
  466. VMRLG V0, T_4, M4
  467. VMRLG V0, T_5, M5
  468. VLEIB $10, $1, M2
  469. VLEIB $10, $1, M5
  470. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  // NOTE(review): the body above is only entered with R3 > 64, so R3 is still > 0
  // after the SUB; this branch therefore looks always taken and the code from here
  // to "next" unreachable by fall-through -- confirm before relying on it
  471. CMPBNE R3, $0, loop
  472. REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
  473. VMRHG V0, H0_1, H0_0
  474. VMRHG V0, H1_1, H1_0
  475. VMRHG V0, H2_1, H2_0
  476. VMRLG V0, H0_1, H0_1
  477. VMRLG V0, H1_1, H1_1
  478. VMRLG V0, H2_1, H2_1
  479. // load EX0, EX1, EX2
  480. MOVD $·constants<>(SB), R5
  481. VLM (R5), EX0, EX2
  482. // sum vectors
  483. VAQ H0_0, H0_1, H0_0
  484. VAQ H1_0, H1_1, H1_0
  485. VAQ H2_0, H2_1, H2_0
  486. // h may be >= 2*(2**130-5) so we need to reduce it again
  487. // M0...M4 are used as temps here
  488. REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
  // next: finalization. Expects the accumulator limbs in H0_0, H1_0, H2_0.
  // Does one more h1->h2 carry, packs the limbs, applies the conditional
  // subtraction, adds the second half of the key and stores the 16 byte tag.
  489. next: // carry h1->h2
  490. VLEIB $7, $0x28, T_1
  491. VREPIB $4, T_2
  492. VGBM $0x003F, T_3
  493. VESRLG $4, T_3
  494. // byte shift
  495. VSRLB T_1, H1_0, T_4
  496. // bit shift
  497. VSRL T_2, T_4, T_4
  498. // clear h1 carry bits
  499. VN T_3, H1_0, H1_0
  500. // add carry
  501. VAQ T_4, H2_0, H2_0
  502. // h is now < 2*(2**130-5)
  503. // pack h into h1 (hi) and h0 (lo)
  504. PACK(H0_0, H1_0, H2_0)
  505. // if h >= 2**130-5 then h -= 2**130-5
  506. MOD(H0_0, H1_0, T_0, T_1, T_2)
  507. // h += s
  508. MOVD $·bswapMask<>(SB), R5
  509. VL (R5), T_1
  510. VL 16(R4), T_0 // s = key[16:32]
  511. VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
  512. VAQ T_0, H0_0, H0_0
  513. VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
  514. VST H0_0, (R1)
  515. RET
  // add: reached from loop once 64 bytes or fewer remain. Carries the pending
  // MULTIPLY result, splits the two sums into separate registers and restores
  // the EXPACC selectors, then falls through to b4.
  add:
      // load EX0, EX1, EX2 (loop used the ·c<> selectors; EXPACC needs ·constants<>)
      MOVD $·constants<>(SB), R5
      VLM (R5), EX0, EX2
      REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
      // REDUCE leaves both sums in H*_1: upper doublewords go to H*_0,
      // lower doublewords stay in H*_1, each with a zero upper half
      VMRHG V0, H0_1, H0_0
      VMRHG V0, H1_1, H1_0
      VMRHG V0, H2_1, H2_0
      VMRLG V0, H0_1, H0_1
      VMRLG V0, H1_1, H1_1
      VMRLG V0, H2_1, H2_1
      // fall through to b4; the former "CMPBLE R3, $64, b4" here was a no-op
      // because b4 is the next instruction either way
  // b4: handles a tail of more than 48 bytes (four blocks, the last possibly
  // partial); shorter tails are passed on to b3.
  528. b4:
  529. CMPBLE R3, $48, b3 // 3 blocks or less
  530. // 4(3+1) blocks remaining
  // VLL takes the index of the last byte to load, hence the -49 / +1 adjustment;
  // afterwards R3 is the length of the fourth block
  531. SUB $49, R3
  532. VLM (R2), M0, M2
  533. VLL R3, 48(R2), M3
  534. ADD $1, R3
  535. MOVBZ $1, R0
  536. CMPBEQ R3, $16, 2(PC)
  // partial fourth block: insert the padding byte 1 at index R3
  537. VLVGB R3, R0, M3
  538. MOVD $64(R2), R2
  539. EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
  540. VLEIB $10, $1, H2_0
  541. VLEIB $10, $1, H2_1
  542. VZERO M0
  543. VZERO M1
  544. VZERO M4
  545. VZERO M5
  546. VZERO T_4
  547. VZERO T_10
  // blocks 3 and 4 become the addends of MULTIPLY (limbs in M0, M1, M4 and M5, M2, T_10)
  548. EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
  549. VLR T_4, M2
  550. VLEIB $10, $1, M4
  // the fourth block only gets the padding bit here when it is a full block
  551. CMPBNE R3, $16, 2(PC)
  552. VLEIB $10, $1, T_10
  553. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  554. REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
  555. VMRHG V0, H0_1, H0_0
  556. VMRHG V0, H1_1, H1_0
  557. VMRHG V0, H2_1, H2_0
  558. VMRLG V0, H0_1, H0_1
  559. VMRLG V0, H1_1, H1_1
  560. VMRLG V0, H2_1, H2_1
  561. SUB $16, R3
  562. CMPBLE R3, $0, square // this condition must always hold true!
  // b3: handles a tail of more than 32 bytes (three blocks, the last possibly
  // partial); shorter tails are passed on to b2.
  563. b3:
  564. CMPBLE R3, $32, b2
  565. // 3 blocks remaining
  566. // setup [r²,r]
  // rotate the r**2 limbs into the upper doubleword, then insert the saved
  // limbs of r (and 20*r) into the lower doubleword
  567. VSLDB $8, R_0, R_0, R_0
  568. VSLDB $8, R_1, R_1, R_1
  569. VSLDB $8, R_2, R_2, R_2
  570. VSLDB $8, R5_1, R5_1, R5_1
  571. VSLDB $8, R5_2, R5_2, R5_2
  572. VLVGG $1, RSAVE_0, R_0
  573. VLVGG $1, RSAVE_1, R_1
  574. VLVGG $1, RSAVE_2, R_2
  575. VLVGG $1, R5SAVE_1, R5_1
  576. VLVGG $1, R5SAVE_2, R5_2
  577. // setup [h0, h1]
  578. VSLDB $8, H0_0, H0_0, H0_0
  579. VSLDB $8, H1_0, H1_0, H1_0
  580. VSLDB $8, H2_0, H2_0, H2_0
  581. VO H0_1, H0_0, H0_0
  582. VO H1_1, H1_0, H1_0
  583. VO H2_1, H2_0, H2_0
  584. VZERO H0_1
  585. VZERO H1_1
  586. VZERO H2_1
  587. VZERO M0
  588. VZERO M1
  589. VZERO M2
  590. VZERO M3
  591. VZERO M4
  592. VZERO M5
  593. // H*[r**2, r]
  594. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  595. REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
  // VLL takes the index of the last byte to load, hence the -33 / +1 adjustment;
  // afterwards R3 is the length of the third block
  596. SUB $33, R3
  597. VLM (R2), M0, M1
  598. VLL R3, 32(R2), M2
  599. ADD $1, R3
  600. MOVBZ $1, R0
  601. CMPBEQ R3, $16, 2(PC)
  // partial third block: insert the padding byte 1 at index R3
  602. VLVGB R3, R0, M2
  603. // H += m0
  604. VZERO T_1
  605. VZERO T_2
  606. VZERO T_3
  607. EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
  608. VLEIB $10, $1, T_3
  609. VAG H0_0, T_1, H0_0
  610. VAG H1_0, T_2, H1_0
  611. VAG H2_0, T_3, H2_0
  612. VZERO M0
  613. VZERO M3
  614. VZERO M4
  615. VZERO M5
  616. VZERO T_10
  617. // (H+m0)*r
  // V0 is passed as the m5_1 accumulator, so MULTIPLY overwrites it; only the
  // H*_1 outputs depend on it afterwards and V0 is zeroed again after REDUCE2
  618. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  619. REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
  620. // H += m1
  621. VZERO V0
  622. VZERO T_1
  623. VZERO T_2
  624. VZERO T_3
  625. EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
  626. VLEIB $10, $1, T_3
  627. VAQ H0_0, T_1, H0_0
  628. VAQ H1_0, T_2, H1_0
  629. VAQ H2_0, T_3, H2_0
  630. REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
  631. // [H, m2] * [r**2, r]
  632. EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
  // the third block only gets the padding bit here when it is a full block
  633. CMPBNE R3, $16, 2(PC)
  634. VLEIB $10, $1, H2_0
  635. VZERO M0
  636. VZERO M1
  637. VZERO M2
  638. VZERO M3
  639. VZERO M4
  640. VZERO M5
  641. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  642. REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
  643. SUB $16, R3
  644. CMPBLE R3, $0, next // this condition must always hold true!
  // b2: handles a tail of more than 16 bytes (two blocks, the last possibly
  // partial); shorter tails are passed on to b1.
  645. b2:
  646. CMPBLE R3, $16, b1
  647. // 2 blocks remaining
  648. // setup [r²,r]
  // rotate the r**2 limbs into the upper doubleword, then insert the saved
  // limbs of r (and 20*r) into the lower doubleword
  649. VSLDB $8, R_0, R_0, R_0
  650. VSLDB $8, R_1, R_1, R_1
  651. VSLDB $8, R_2, R_2, R_2
  652. VSLDB $8, R5_1, R5_1, R5_1
  653. VSLDB $8, R5_2, R5_2, R5_2
  654. VLVGG $1, RSAVE_0, R_0
  655. VLVGG $1, RSAVE_1, R_1
  656. VLVGG $1, RSAVE_2, R_2
  657. VLVGG $1, R5SAVE_1, R5_1
  658. VLVGG $1, R5SAVE_2, R5_2
  659. // setup [h0, h1]
  660. VSLDB $8, H0_0, H0_0, H0_0
  661. VSLDB $8, H1_0, H1_0, H1_0
  662. VSLDB $8, H2_0, H2_0, H2_0
  663. VO H0_1, H0_0, H0_0
  664. VO H1_1, H1_0, H1_0
  665. VO H2_1, H2_0, H2_0
  666. VZERO H0_1
  667. VZERO H1_1
  668. VZERO H2_1
  669. VZERO M0
  670. VZERO M1
  671. VZERO M2
  672. VZERO M3
  673. VZERO M4
  674. VZERO M5
  675. // H*[r**2, r]
  676. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  677. REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
  678. VMRHG V0, H0_1, H0_0
  679. VMRHG V0, H1_1, H1_0
  680. VMRHG V0, H2_1, H2_0
  681. VMRLG V0, H0_1, H0_1
  682. VMRLG V0, H1_1, H1_1
  683. VMRLG V0, H2_1, H2_1
  684. // move h to the left and 0s at the right
  685. VSLDB $8, H0_0, H0_0, H0_0
  686. VSLDB $8, H1_0, H1_0, H1_0
  687. VSLDB $8, H2_0, H2_0, H2_0
  688. // get message blocks and append 1 to start
  // VLL takes the index of the last byte to load, hence the -17 / +1 adjustment;
  // afterwards R3 is the length of the second block
  689. SUB $17, R3
  690. VL (R2), M0
  691. VLL R3, 16(R2), M1
  692. ADD $1, R3
  693. MOVBZ $1, R0
  694. CMPBEQ R3, $16, 2(PC)
  // partial second block: insert the padding byte 1 at index R3
  695. VLVGB R3, R0, M1
  696. VZERO T_6
  697. VZERO T_7
  698. VZERO T_8
  // after both expansions m0 sits in the upper and m1 in the lower doublewords
  699. EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
  700. EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
  701. VLEIB $2, $1, T_8
  // m1 only gets the padding bit here when it is a full block
  702. CMPBNE R3, $16, 2(PC)
  703. VLEIB $10, $1, T_8
  704. // add [m0, m1] to h
  705. VAG H0_0, T_6, H0_0
  706. VAG H1_0, T_7, H1_0
  707. VAG H2_0, T_8, H2_0
  708. VZERO M2
  709. VZERO M3
  710. VZERO M4
  711. VZERO M5
  712. VZERO T_10
  713. VZERO M0
  714. // at this point R_0 .. R5_2 look like [r**2, r]
  715. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  716. REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
  717. SUB $16, R3, R3
  718. CMPBLE R3, $0, next
  // b1: handles a tail of 1 to 16 bytes (a single, possibly partial block);
  // with nothing left it goes straight to next.
  719. b1:
  720. CMPBLE R3, $0, next
  721. // 1 block remaining
  722. // setup [r²,r]
  // rotate the r**2 limbs into the upper doubleword, then insert the saved
  // limbs of r (and 20*r) into the lower doubleword
  723. VSLDB $8, R_0, R_0, R_0
  724. VSLDB $8, R_1, R_1, R_1
  725. VSLDB $8, R_2, R_2, R_2
  726. VSLDB $8, R5_1, R5_1, R5_1
  727. VSLDB $8, R5_2, R5_2, R5_2
  728. VLVGG $1, RSAVE_0, R_0
  729. VLVGG $1, RSAVE_1, R_1
  730. VLVGG $1, RSAVE_2, R_2
  731. VLVGG $1, R5SAVE_1, R5_1
  732. VLVGG $1, R5SAVE_2, R5_2
  733. // setup [h0, h1]
  734. VSLDB $8, H0_0, H0_0, H0_0
  735. VSLDB $8, H1_0, H1_0, H1_0
  736. VSLDB $8, H2_0, H2_0, H2_0
  737. VO H0_1, H0_0, H0_0
  738. VO H1_1, H1_0, H1_0
  739. VO H2_1, H2_0, H2_0
  740. VZERO H0_1
  741. VZERO H1_1
  742. VZERO H2_1
  743. VZERO M0
  744. VZERO M1
  745. VZERO M2
  746. VZERO M3
  747. VZERO M4
  748. VZERO M5
  749. // H*[r**2, r]
  750. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  751. REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
  752. // set up [0, m0] limbs
  // VLL takes the index of the last byte to load, hence the -1 / +1 adjustment
  753. SUB $1, R3
  754. VLL R3, (R2), M0
  755. ADD $1, R3
  756. MOVBZ $1, R0
  757. CMPBEQ R3, $16, 2(PC)
  // partial block: insert the padding byte 1 at index R3
  758. VLVGB R3, R0, M0
  759. VZERO T_1
  760. VZERO T_2
  761. VZERO T_3
  762. EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m]
  // the block only gets the padding bit here when it is a full block
  763. CMPBNE R3, $16, 2(PC)
  764. VLEIB $10, $1, T_3
  765. // h+m0
  766. VAQ H0_0, T_1, H0_0
  767. VAQ H1_0, T_2, H1_0
  768. VAQ H2_0, T_3, H2_0
  769. VZERO M0
  770. VZERO M1
  771. VZERO M2
  772. VZERO M3
  773. VZERO M4
  774. VZERO M5
  775. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  776. REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
  777. BR next
  // square: reached when the input is exhausted right after a multi-block step.
  // Combines the two running sums into one with a final multiply by [r**2, r],
  // then continues at next.
  778. square:
  779. // setup [r²,r]
  // rotate the r**2 limbs into the upper doubleword, then insert the saved
  // limbs of r (and 20*r) into the lower doubleword
  780. VSLDB $8, R_0, R_0, R_0
  781. VSLDB $8, R_1, R_1, R_1
  782. VSLDB $8, R_2, R_2, R_2
  783. VSLDB $8, R5_1, R5_1, R5_1
  784. VSLDB $8, R5_2, R5_2, R5_2
  785. VLVGG $1, RSAVE_0, R_0
  786. VLVGG $1, RSAVE_1, R_1
  787. VLVGG $1, RSAVE_2, R_2
  788. VLVGG $1, R5SAVE_1, R5_1
  789. VLVGG $1, R5SAVE_2, R5_2
  790. // setup [h0, h1]
  791. VSLDB $8, H0_0, H0_0, H0_0
  792. VSLDB $8, H1_0, H1_0, H1_0
  793. VSLDB $8, H2_0, H2_0, H2_0
  794. VO H0_1, H0_0, H0_0
  795. VO H1_1, H1_0, H1_0
  796. VO H2_1, H2_0, H2_0
  797. VZERO H0_1
  798. VZERO H1_1
  799. VZERO H2_1
  800. VZERO M0
  801. VZERO M1
  802. VZERO M2
  803. VZERO M3
  804. VZERO M4
  805. VZERO M5
  806. // (h0*r**2) + (h1*r)
  807. MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
  808. REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
  809. BR next