sum_s390x.s 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build s390x,go1.11,!gccgo,!appengine
  5. #include "textflag.h"
  6. // Implementation of Poly1305 using the vector facility (vx).
  7. // constants
  // Vector registers V0-V3; poly1305vx fills all four with a single VLM from
  // ·constants<>, so their numbering must stay consecutive and in this order.
  8. #define MOD26 V0
  9. #define EX0 V1
  10. #define EX1 V2
  11. #define EX2 V3
  12. // temporaries
  // Vector scratch registers; the MULTIPLY and REDUCE macros overwrite all of
  // them, and poly1305vx also uses T_0..T_3 directly.
  13. #define T_0 V4
  14. #define T_1 V5
  15. #define T_2 V6
  16. #define T_3 V7
  17. #define T_4 V8
  18. // key (r)
  // R_0..R_4 are the five limbs of the multiplier used by the main multiply;
  // R5_1..R5_4 hold 5*R_1..5*R_4 (all vector registers).
  19. #define R_0 V9
  20. #define R_1 V10
  21. #define R_2 V11
  22. #define R_3 V12
  23. #define R_4 V13
  24. #define R5_1 V14
  25. #define R5_2 V15
  26. #define R5_3 V16
  27. #define R5_4 V17
  // Saved copies of r for the final block(s). RSAVE_x are GENERAL-PURPOSE
  // registers R5-R9 (not to be confused with the vector aliases R5_x above);
  // R5SAVE_x are vector registers holding 5*r limbs.
  28. #define RSAVE_0 R5
  29. #define RSAVE_1 R6
  30. #define RSAVE_2 R7
  31. #define RSAVE_3 R8
  32. #define RSAVE_4 R9
  33. #define R5SAVE_1 V28
  34. #define R5SAVE_2 V29
  35. #define R5SAVE_3 V30
  36. #define R5SAVE_4 V31
  37. // message block
  // Five limb vectors produced by EXPAND from the current message block(s).
  38. #define F_0 V18
  39. #define F_1 V19
  40. #define F_2 V20
  41. #define F_3 V21
  42. #define F_4 V22
  43. // accumulator
  // Five limb vectors of the running hash h.
  44. #define H_0 V23
  45. #define H_1 V24
  46. #define H_2 V25
  47. #define H_3 V26
  48. #define H_4 V27
  // 16-byte mask that poly1305vx ANDs with key[0:16] to form r.
  49. GLOBL ·keyMask<>(SB), RODATA, $16
  50. DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
  51. DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
  // VPERM selector listing byte indices 15 down to 0 (a full 16-byte reversal);
  // poly1305vx uses it to byte-swap s before the final add and the sum after it.
  52. GLOBL ·bswapMask<>(SB), RODATA, $16
  53. DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
  54. DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
  // Four 16-byte entries loaded into MOD26, EX0, EX1, EX2 (V0-V3) by one VLM,
  // so the order of the entries below must match that register order.
  55. GLOBL ·constants<>(SB), RODATA, $64
  56. // MOD26: low 26 bits set in each doubleword (limb mask)
  57. DATA ·constants<>+0(SB)/8, $0x3ffffff
  58. DATA ·constants<>+8(SB)/8, $0x3ffffff
  59. // EX0: VPERM selector used by EXPAND to build d0 (d1 is derived from d0)
  60. DATA ·constants<>+16(SB)/8, $0x0006050403020100
  61. DATA ·constants<>+24(SB)/8, $0x1016151413121110
  62. // EX1: VPERM selector used by EXPAND to build d2 (d3 is derived from d2)
  63. DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
  64. DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
  65. // EX2: VPERM selector used by EXPAND to build d4
  66. DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
  67. DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
  68. // h = (f*g) % (2**130-5) [partial reduction]
  //
  // f0..f4 and g0..g4 are the limb vectors of the two operands; g51..g54 must
  // hold 5*g1..5*g4 (the callers precompute them with VMLOF by 5). Each output
  // limb is
  //   h[k] = sum(f[i]*g[k-i] for i <= k) + sum(f[i]*g5[5+k-i] for i > k)
  // where g5[n] is the 5*g[n] argument. Products involving f0, f2 and f4 are
  // accumulated in h0..h4, those involving f1 and f3 in T_0..T_4, and the two
  // halves are added at the end. No carries are propagated here; the callers
  // in this file follow MULTIPLY with REDUCE.
  //
  // Clobbers T_0..T_4. h0..h4 are written while f and g are still being read,
  // so the outputs must not alias any of the inputs.
  69. #define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
  70. VMLOF f0, g0, h0 \
  71. VMLOF f0, g1, h1 \
  72. VMLOF f0, g2, h2 \
  73. VMLOF f0, g3, h3 \
  74. VMLOF f0, g4, h4 \
  75. VMLOF f1, g54, T_0 \
  76. VMLOF f1, g0, T_1 \
  77. VMLOF f1, g1, T_2 \
  78. VMLOF f1, g2, T_3 \
  79. VMLOF f1, g3, T_4 \
  80. VMALOF f2, g53, h0, h0 \
  81. VMALOF f2, g54, h1, h1 \
  82. VMALOF f2, g0, h2, h2 \
  83. VMALOF f2, g1, h3, h3 \
  84. VMALOF f2, g2, h4, h4 \
  85. VMALOF f3, g52, T_0, T_0 \
  86. VMALOF f3, g53, T_1, T_1 \
  87. VMALOF f3, g54, T_2, T_2 \
  88. VMALOF f3, g0, T_3, T_3 \
  89. VMALOF f3, g1, T_4, T_4 \
  90. VMALOF f4, g51, h0, h0 \
  91. VMALOF f4, g52, h1, h1 \
  92. VMALOF f4, g53, h2, h2 \
  93. VMALOF f4, g54, h3, h3 \
  94. VMALOF f4, g0, h4, h4 \
  95. VAG T_0, h0, h0 \
  96. VAG T_1, h1, h1 \
  97. VAG T_2, h2, h2 \
  98. VAG T_3, h3, h3 \
  99. VAG T_4, h4, h4
  100. // carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
  //
  // Operates in place on the limb vectors h0..h4. Each carry is the source limb
  // shifted right by 26 bits; the source limb is then masked with MOD26. The
  // h4->h0 carry is multiplied by 5 (computed as T_3 + (T_3 << 2)) before it is
  // added, because it wraps around the 2**130-5 modulus. The final additions
  // into h1 and h4 are not followed by a mask, so those two limbs may be left
  // slightly wider than 26 bits.
  //
  // Clobbers T_0..T_4 and requires MOD26 to be loaded.
  101. #define REDUCE(h0, h1, h2, h3, h4) \
  102. VESRLG $26, h0, T_0 \
  103. VESRLG $26, h3, T_1 \
  104. VN MOD26, h0, h0 \
  105. VN MOD26, h3, h3 \
  106. VAG T_0, h1, h1 \
  107. VAG T_1, h4, h4 \
  108. VESRLG $26, h1, T_2 \
  109. VESRLG $26, h4, T_3 \
  110. VN MOD26, h1, h1 \
  111. VN MOD26, h4, h4 \
  112. VESLG $2, T_3, T_4 \
  113. VAG T_3, T_4, T_4 \
  114. VAG T_2, h2, h2 \
  115. VAG T_4, h0, h0 \
  116. VESRLG $26, h2, T_0 \
  117. VESRLG $26, h0, T_1 \
  118. VN MOD26, h2, h2 \
  119. VN MOD26, h0, h0 \
  120. VAG T_0, h3, h3 \
  121. VAG T_1, h1, h1 \
  122. VESRLG $26, h3, T_2 \
  123. VN MOD26, h3, h3 \
  124. VAG T_2, h4, h4
  125. // expand in0 into d[0] and in1 into d[1]
  //
  // Splits two 16-byte inputs into five limb vectors d0..d4; doubleword 0 of
  // each output is built from in0 and doubleword 1 from in1.
  //   d0, d1: from the EX0 permutation (d1 is that value shifted right by 26)
  //   d2, d3: from the EX1 permutation (shifted right by 4 and by 30)
  //   d4:     from the EX2 permutation
  // d0..d3 are masked with MOD26. d4 is masked with the byte mask generated by
  // VGBM $0x0707 instead; d1 is borrowed to hold that mask before it receives
  // its real value.
  //
  // Requires MOD26, EX0, EX1 and EX2 to be loaded. The outputs are written
  // before the inputs are fully consumed, so d0..d4 must not alias in0 or in1.
  126. #define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
  127. VGBM $0x0707, d1 \ // d1=tmp
  128. VPERM in0, in1, EX2, d4 \
  129. VPERM in0, in1, EX0, d0 \
  130. VPERM in0, in1, EX1, d2 \
  131. VN d1, d4, d4 \
  132. VESRLG $26, d0, d1 \
  133. VESRLG $30, d2, d3 \
  134. VESRLG $4, d2, d2 \
  135. VN MOD26, d0, d0 \
  136. VN MOD26, d1, d1 \
  137. VN MOD26, d2, d2 \
  138. VN MOD26, d3, d3
  139. // pack h4:h0 into h1:h0 (no carry)
  //
  // Merges the five limbs into one value
  //   h0 | h1<<26 | h2<<52 | h3<<78 | h4<<104
  // leaving the low 128 bits in h0 and the bits of h4 that fall above bit 127
  // (h4 >> 24) in h1. The limbs are combined with OR rather than addition, so
  // h0..h3 must already fit in 26 bits (hence "no carry").
  //
  // h1 doubles as the shift-count register: the values placed in its byte 7
  // (48, 104, 24) are the shift distances in bits for VSLB/VSRLB.
  // Clobbers h2 and h3; h1 is overwritten with the high part of the result.
  140. #define PACK(h0, h1, h2, h3, h4) \
  141. VESLG $26, h1, h1 \
  142. VESLG $26, h3, h3 \
  143. VO h0, h1, h0 \
  144. VO h2, h3, h2 \
  145. VESLG $4, h2, h2 \
  146. VLEIB $7, $48, h1 \
  147. VSLB h1, h2, h2 \
  148. VO h0, h2, h0 \
  149. VLEIB $7, $104, h1 \
  150. VSLB h1, h4, h3 \
  151. VO h3, h0, h0 \
  152. VLEIB $7, $24, h1 \
  153. VSRLB h1, h4, h1
  154. // if h >= 2**130-5 then h -= 2**130-5
  //
  // h0 holds the low 128 bits of h and h1 the bits above them (the layout
  // produced by PACK). The macro computes t0 = h0 + 5 and turns "h + 5 carries
  // out of bit 130" into a select mask in t1 (all zeros if it carries, all
  // ones otherwise). The result left in h0 is t0 when the carry occurred and
  // the original h0 when it did not; only the low 128 bits are produced.
  //
  // Clobbers t0, t1 and t2. h1 is read but not modified.
  155. #define MOD(h0, h1, t0, t1, t2) \
  156. VZERO t0 \
  157. VLEIG $1, $5, t0 \
  158. VACCQ h0, t0, t1 \
  159. VAQ h0, t0, t0 \
  160. VONE t2 \
  161. VLEIG $1, $-4, t2 \
  162. VAQ t2, t1, t1 \
  163. VACCQ h1, t1, t1 \
  164. VONE t2 \
  165. VAQ t2, t1, t1 \
  166. VN h0, t1, t2 \
  167. VNC t0, t1, t1 \
  168. VO t1, t2, h0
  169. // func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
  //
  // Computes the Poly1305 tag of the mlen bytes at m and stores the 16-byte
  // result at out. key[0:16] is masked with ·keyMask<> to form r; key[16:32]
  // (s) is added to the reduced accumulator just before the store.
  //
  // General-purpose register use:
  //   R1 = out, R2 = m (advanced by 32 per loop iteration), R3 = bytes remaining,
  //   R4 = key, R5-R9 = RSAVE_0..RSAVE_4 (doubleword 0 of each r limb),
  //   R0 = the value 1, used to insert the padding byte on partial blocks.
  //   R5 and R6 also briefly hold table addresses outside the span in which
  //   they carry RSAVE values.
  170. TEXT ·poly1305vx(SB), $0-32
  171. // This code processes up to 2 blocks (32 bytes) per iteration
  172. // using the algorithm described in:
  173. // NEON crypto, Daniel J. Bernstein & Peter Schwabe
  174. // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
  175. LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
  176. // load MOD26, EX0, EX1 and EX2
  177. MOVD $·constants<>(SB), R5
  178. VLM (R5), MOD26, EX2
  179. // setup r
  180. VL (R4), T_0
  181. MOVD $·keyMask<>(SB), R6
  182. VL (R6), T_1
  183. VN T_0, T_1, T_0
  // both EXPAND inputs are T_0, so each R_x holds the same r limb in both doublewords
  184. EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
  185. // setup r*5
  186. VLEIG $0, $5, T_0
  187. VLEIG $1, $5, T_0
  188. // store 5*r in R5SAVE_x and r in RSAVE_x (for final block)
  189. VMLOF T_0, R_1, R5SAVE_1
  190. VMLOF T_0, R_2, R5SAVE_2
  191. VMLOF T_0, R_3, R5SAVE_3
  192. VMLOF T_0, R_4, R5SAVE_4
  193. VLGVG $0, R_0, RSAVE_0
  194. VLGVG $0, R_1, RSAVE_1
  195. VLGVG $0, R_2, RSAVE_2
  196. VLGVG $0, R_3, RSAVE_3
  197. VLGVG $0, R_4, RSAVE_4
  198. // skip r**2 calculation when at most one block (mlen <= 16) is present
  199. CMPBLE R3, $16, skip
  200. // calculate r**2
  201. MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
  202. REDUCE(H_0, H_1, H_2, H_3, H_4)
  // R5_x = 5 * (r**2 limbs), then R_x = r**2 limbs
  203. VLEIG $0, $5, T_0
  204. VLEIG $1, $5, T_0
  205. VMLOF T_0, H_1, R5_1
  206. VMLOF T_0, H_2, R5_2
  207. VMLOF T_0, H_3, R5_3
  208. VMLOF T_0, H_4, R5_4
  209. VLR H_0, R_0
  210. VLR H_1, R_1
  211. VLR H_2, R_2
  212. VLR H_3, R_3
  213. VLR H_4, R_4
  214. // initialize h
  215. VZERO H_0
  216. VZERO H_1
  217. VZERO H_2
  218. VZERO H_3
  219. VZERO H_4
  220. loop:
  // 32 or fewer bytes left: handle them on the tail path; otherwise consume
  // two full 16-byte blocks here
  221. CMPBLE R3, $32, b2
  222. VLM (R2), T_0, T_1
  223. SUB $32, R3
  224. MOVD $32(R2), R2
  225. EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
  // set byte 4 and byte 12 of F_4 to 1, i.e. bit 24 of limb 4 in each
  // doubleword: the 2**128 bit appended to each full block
  226. VLEIB $4, $1, F_4
  227. VLEIB $12, $1, F_4
  228. multiply:
  // h = (h + block) * R, where R_x/R5_x hold [r**2, r**2] in the main loop,
  // [r**2, r] when coming from b2 and [r, 1] when coming from b1
  229. VAG H_0, F_0, F_0
  230. VAG H_1, F_1, F_1
  231. VAG H_2, F_2, F_2
  232. VAG H_3, F_3, F_3
  233. VAG H_4, F_4, F_4
  234. MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
  235. REDUCE(H_0, H_1, H_2, H_3, H_4)
  // the tail paths (b2, b1) set R3 to 0 before branching to multiply, so they
  // fall through to finish here
  236. CMPBNE R3, $0, loop
  237. finish:
  238. // sum vectors
  239. VZERO T_0
  240. VSUMQG H_0, T_0, H_0
  241. VSUMQG H_1, T_0, H_1
  242. VSUMQG H_2, T_0, H_2
  243. VSUMQG H_3, T_0, H_3
  244. VSUMQG H_4, T_0, H_4
  245. // h may be >= 2*(2**130-5) so we need to reduce it again
  246. REDUCE(H_0, H_1, H_2, H_3, H_4)
  247. // carry h1->h4
  248. VESRLG $26, H_1, T_1
  249. VN MOD26, H_1, H_1
  250. VAQ T_1, H_2, H_2
  251. VESRLG $26, H_2, T_2
  252. VN MOD26, H_2, H_2
  253. VAQ T_2, H_3, H_3
  254. VESRLG $26, H_3, T_3
  255. VN MOD26, H_3, H_3
  256. VAQ T_3, H_4, H_4
  257. // h is now < 2*(2**130-5)
  258. // pack h into h1 (hi) and h0 (lo)
  259. PACK(H_0, H_1, H_2, H_3, H_4)
  260. // if h >= 2**130-5 then h -= 2**130-5
  261. MOD(H_0, H_1, T_0, T_1, T_2)
  262. // h += s, where s = key[16:32]
  263. MOVD $·bswapMask<>(SB), R5
  264. VL (R5), T_1
  265. VL 16(R4), T_0
  266. VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
  267. VAQ T_0, H_0, H_0
  268. VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
  269. VST H_0, (R1)
  270. RET
  271. b2:
  // reached with at most 32 bytes left; 16 or fewer are handled by b1
  272. CMPBLE R3, $16, b1
  273. // 2 blocks remaining
  // the first block is full; R3 becomes the index of the last valid byte of
  // the second block, which is what VLL takes as its length operand
  274. SUB $17, R3
  275. VL (R2), T_0
  276. VLL R3, 16(R2), T_1
  // R3 = number of bytes in the second block (1..16)
  277. ADD $1, R3
  278. MOVBZ $1, R0
  // partial second block: insert a 1 byte at index R3, right after the message
  // bytes; skipped when the block is full
  279. CMPBEQ R3, $16, 2(PC)
  280. VLVGB R3, R0, T_1
  281. EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
  // full second block: set its 2**128 bit (byte 12); skipped when the padding
  // byte was inserted above
  282. CMPBNE R3, $16, 2(PC)
  283. VLEIB $12, $1, F_4
  // the first block is always full, so its 2**128 bit (byte 4) is always set
  284. VLEIB $4, $1, F_4
  285. // setup [r²,r]
  // doubleword 0 keeps r**2 (and 5*r**2); doubleword 1 is replaced by the saved
  // r limbs (and 5*r limbs from R5SAVE_x)
  286. VLVGG $1, RSAVE_0, R_0
  287. VLVGG $1, RSAVE_1, R_1
  288. VLVGG $1, RSAVE_2, R_2
  289. VLVGG $1, RSAVE_3, R_3
  290. VLVGG $1, RSAVE_4, R_4
  291. VPDI $0, R5_1, R5SAVE_1, R5_1
  292. VPDI $0, R5_2, R5SAVE_2, R5_2
  293. VPDI $0, R5_3, R5SAVE_3, R5_3
  294. VPDI $0, R5_4, R5SAVE_4, R5_4
  // mark the input as fully consumed so multiply falls through to finish
  295. MOVD $0, R3
  296. BR multiply
  297. skip:
  // entered when mlen <= 16: r**2 was not computed, so h is initialized here
  298. VZERO H_0
  299. VZERO H_1
  300. VZERO H_2
  301. VZERO H_3
  302. VZERO H_4
  // empty message: go straight to the final reduction with h = 0
  303. CMPBEQ R3, $0, finish
  304. b1:
  305. // 1 block remaining
  // R3 becomes the index of the last valid byte for VLL, then is restored to
  // the byte count (1..16)
  306. SUB $1, R3
  307. VLL R3, (R2), T_0
  308. ADD $1, R3
  309. MOVBZ $1, R0
  // partial block: insert a 1 byte at index R3; skipped when the block is full
  310. CMPBEQ R3, $16, 2(PC)
  311. VLVGB R3, R0, T_0
  // no second block: doubleword 1 of every F_x comes from a zero vector
  312. VZERO T_1
  313. EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
  // full block: set its 2**128 bit (byte 4); skipped for a partial block
  314. CMPBNE R3, $16, 2(PC)
  315. VLEIB $4, $1, F_4
  // build the constant 1 in doubleword 1: R_0 = [?, 1], all other limbs and
  // all 5*limb registers zero; doubleword 0 is filled in below
  316. VLEIG $1, $1, R_0
  317. VZERO R_1
  318. VZERO R_2
  319. VZERO R_3
  320. VZERO R_4
  321. VZERO R5_1
  322. VZERO R5_2
  323. VZERO R5_3
  324. VZERO R5_4
  325. // setup [r, 1]
  326. VLVGG $0, RSAVE_0, R_0
  327. VLVGG $0, RSAVE_1, R_1
  328. VLVGG $0, RSAVE_2, R_2
  329. VLVGG $0, RSAVE_3, R_3
  330. VLVGG $0, RSAVE_4, R_4
  331. VPDI $0, R5SAVE_1, R5_1, R5_1
  332. VPDI $0, R5SAVE_2, R5_2, R5_2
  333. VPDI $0, R5SAVE_3, R5_3, R5_3
  334. VPDI $0, R5SAVE_4, R5_4, R5_4
  // mark the input as fully consumed so multiply falls through to finish
  335. MOVD $0, R3
  336. BR multiply