/*
Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved.

The redistribution and use of this software (with or without changes)
is allowed without the payment of fees or royalties provided that:

  source code distributions include the above copyright notice, this
  list of conditions and the following disclaimer;

  binary distributions include the above copyright notice, this list
  of conditions and the following disclaimer in their documentation.

This software is provided 'as is' with no explicit or implied warranties
in respect of its operation, including, but not limited to, correctness
and fitness for purpose.
---------------------------------------------------------------------------
Issue Date: 09/09/2014
*/
#include "aes_ni.h"

#if defined( USE_INTEL_AES_IF_PRESENT )
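
/* Runtime AES-NI detection: CPUID leaf 1 reports the AESNI feature in
   bit 25 of ECX. The result is cached in a function-local static so the
   CPUID instruction is executed only on the first call.
*/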
#if defined(_MSC_VER)

#include <intrin.h>
#pragma intrinsic(__cpuid)
#define INLINE  __inline

INLINE int has_aes_ni(void)
{
    static int test = -1;
    if(test < 0)
    {
        int cpu_info[4];
        __cpuid(cpu_info, 1);
        test = cpu_info[2] & 0x02000000;
    }
    return test;
}
#elif defined( __GNUC__ )

#include <cpuid.h>

#if !defined(__clang__)
#pragma GCC target ("ssse3")
#pragma GCC target ("sse4.1")
#pragma GCC target ("aes")
#endif

#include <x86intrin.h>
#define INLINE  static __inline

INLINE int has_aes_ni(void)
{
    static int test = -1;
    if(test < 0)
    {
        unsigned int a, b, c, d;
        if(!__get_cpuid(1, &a, &b, &c, &d))
            test = 0;
        else
            test = (c & 0x2000000);
    }
    return test;
}
#else
#error AES New Instructions require Microsoft, Intel, GNU C, or CLANG
#endif
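
/* One step of the AES-128 key expansion. On entry t2 holds the output
   of _mm_aeskeygenassist_si128, which has already applied SubWord and
   RotWord with the round constant; the shuffle broadcasts the needed
   word, and the three shift-and-XOR pairs accumulate the running XOR of
   the schedule words, yielding the complete next round key in t1.
*/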
INLINE __m128i aes_128_assist(__m128i t1, __m128i t2)
{
    __m128i t3;
    t2 = _mm_shuffle_epi32(t2, 0xff);
    t3 = _mm_slli_si128(t1, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t3 = _mm_slli_si128(t3, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t3 = _mm_slli_si128(t3, 0x4);
    t1 = _mm_xor_si128(t1, t3);
    t1 = _mm_xor_si128(t1, t2);
    return t1;
}
AES_RETURN aes_ni(encrypt_key128)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
    __m128i t1, t2;
    __m128i *ks = (__m128i*)cx->ks;

    if(!has_aes_ni())
    {
        return aes_xi(encrypt_key128)(key, cx);
    }

    t1 = _mm_loadu_si128((__m128i*)key);
    ks[0] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x1);
    t1 = aes_128_assist(t1, t2);
    ks[1] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x2);
    t1 = aes_128_assist(t1, t2);
    ks[2] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x4);
    t1 = aes_128_assist(t1, t2);
    ks[3] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x8);
    t1 = aes_128_assist(t1, t2);
    ks[4] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x10);
    t1 = aes_128_assist(t1, t2);
    ks[5] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x20);
    t1 = aes_128_assist(t1, t2);
    ks[6] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x40);
    t1 = aes_128_assist(t1, t2);
    ks[7] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x80);
    t1 = aes_128_assist(t1, t2);
    ks[8] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x1b);
    t1 = aes_128_assist(t1, t2);
    ks[9] = t1;

    t2 = _mm_aeskeygenassist_si128(t1, 0x36);
    t1 = aes_128_assist(t1, t2);
    ks[10] = t1;

    cx->inf.l = 0;
    cx->inf.b[0] = 10 * 16;
    return EXIT_SUCCESS;
}
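
/* The AES-192 key schedule produces six 32-bit words per step, i.e. one
   and a half 128-bit round keys, so the expansion below stitches
   consecutive outputs together with _mm_shuffle_pd: half of a round key
   comes from one aes_192_assist step and half from the next.
*/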
INLINE void aes_192_assist(__m128i *t1, __m128i *t2, __m128i *t3)
{
    __m128i t4;
    *t2 = _mm_shuffle_epi32(*t2, 0x55);
    t4 = _mm_slli_si128(*t1, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    *t1 = _mm_xor_si128(*t1, *t2);
    *t2 = _mm_shuffle_epi32(*t1, 0xff);
    t4 = _mm_slli_si128(*t3, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    *t3 = _mm_xor_si128(*t3, *t2);
}
AES_RETURN aes_ni(encrypt_key192)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
    __m128i t1, t2, t3;
    __m128i *ks = (__m128i*)cx->ks;

    if(!has_aes_ni())
    {
        return aes_xi(encrypt_key192)(key, cx);
    }

    t1 = _mm_loadu_si128((__m128i*)key);
    t3 = _mm_loadu_si128((__m128i*)(key + 16));

    ks[0] = t1;
    ks[1] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x1);
    aes_192_assist(&t1, &t2, &t3);
    ks[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[1]), _mm_castsi128_pd(t1), 0));
    ks[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x2);
    aes_192_assist(&t1, &t2, &t3);
    ks[3] = t1;
    ks[4] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x4);
    aes_192_assist(&t1, &t2, &t3);
    ks[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[4]), _mm_castsi128_pd(t1), 0));
    ks[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x8);
    aes_192_assist(&t1, &t2, &t3);
    ks[6] = t1;
    ks[7] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x10);
    aes_192_assist(&t1, &t2, &t3);
    ks[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[7]), _mm_castsi128_pd(t1), 0));
    ks[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x20);
    aes_192_assist(&t1, &t2, &t3);
    ks[9] = t1;
    ks[10] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x40);
    aes_192_assist(&t1, &t2, &t3);
    ks[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ks[10]), _mm_castsi128_pd(t1), 0));
    ks[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(t1), _mm_castsi128_pd(t3), 1));

    t2 = _mm_aeskeygenassist_si128(t3, 0x80);
    aes_192_assist(&t1, &t2, &t3);
    ks[12] = t1;

    cx->inf.l = 0;
    cx->inf.b[0] = 12 * 16;
    return EXIT_SUCCESS;
}
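
/* The AES-256 schedule alternates two steps: aes_256_assist1 uses the
   keygenassist result with a round constant to derive the even-index
   round keys (shuffle 0xff selects SubWord(RotWord(w)) ^ rcon), while
   aes_256_assist2 derives the odd-index keys from SubWord alone
   (keygenassist with rcon 0, shuffle 0xaa selecting the SubWord lane).
*/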
INLINE void aes_256_assist1(__m128i *t1, __m128i *t2)
{
    __m128i t4;
    *t2 = _mm_shuffle_epi32(*t2, 0xff);
    t4 = _mm_slli_si128(*t1, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t1 = _mm_xor_si128(*t1, t4);
    *t1 = _mm_xor_si128(*t1, *t2);
}

INLINE void aes_256_assist2(__m128i *t1, __m128i *t3)
{
    __m128i t2, t4;
    t4 = _mm_aeskeygenassist_si128(*t1, 0x0);
    t2 = _mm_shuffle_epi32(t4, 0xaa);
    t4 = _mm_slli_si128(*t3, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    t4 = _mm_slli_si128(t4, 0x4);
    *t3 = _mm_xor_si128(*t3, t4);
    *t3 = _mm_xor_si128(*t3, t2);
}
AES_RETURN aes_ni(encrypt_key256)(const unsigned char *key, aes_encrypt_ctx cx[1])
{
    __m128i t1, t2, t3;
    __m128i *ks = (__m128i*)cx->ks;

    if(!has_aes_ni())
    {
        return aes_xi(encrypt_key256)(key, cx);
    }

    t1 = _mm_loadu_si128((__m128i*)key);
    t3 = _mm_loadu_si128((__m128i*)(key + 16));

    ks[0] = t1;
    ks[1] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x01);
    aes_256_assist1(&t1, &t2);
    ks[2] = t1;
    aes_256_assist2(&t1, &t3);
    ks[3] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x02);
    aes_256_assist1(&t1, &t2);
    ks[4] = t1;
    aes_256_assist2(&t1, &t3);
    ks[5] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x04);
    aes_256_assist1(&t1, &t2);
    ks[6] = t1;
    aes_256_assist2(&t1, &t3);
    ks[7] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x08);
    aes_256_assist1(&t1, &t2);
    ks[8] = t1;
    aes_256_assist2(&t1, &t3);
    ks[9] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x10);
    aes_256_assist1(&t1, &t2);
    ks[10] = t1;
    aes_256_assist2(&t1, &t3);
    ks[11] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x20);
    aes_256_assist1(&t1, &t2);
    ks[12] = t1;
    aes_256_assist2(&t1, &t3);
    ks[13] = t3;

    t2 = _mm_aeskeygenassist_si128(t3, 0x40);
    aes_256_assist1(&t1, &t2);
    ks[14] = t1;

    cx->inf.l = 0;
    cx->inf.b[0] = 14 * 16;
    return EXIT_SUCCESS;
}
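
/* Convert an encryption key schedule for use with aesdec/aesdeclast,
   which implement AES's Equivalent Inverse Cipher: every round key
   except the first and the last must be passed through InvMixColumns,
   exactly the transform that _mm_aesimc_si128 performs.
*/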
INLINE void enc_to_dec(aes_decrypt_ctx cx[1])
{
    __m128i *ks = (__m128i*)cx->ks;
    int j;

    for( j = 1 ; j < (cx->inf.b[0] >> 4) ; ++j )
        ks[j] = _mm_aesimc_si128(ks[j]);
}
AES_RETURN aes_ni(decrypt_key128)(const unsigned char *key, aes_decrypt_ctx cx[1])
{
    if(!has_aes_ni())
    {
        return aes_xi(decrypt_key128)(key, cx);
    }

    if(aes_ni(encrypt_key128)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
    {
        enc_to_dec(cx);
        return EXIT_SUCCESS;
    }
    else
        return EXIT_FAILURE;
}

AES_RETURN aes_ni(decrypt_key192)(const unsigned char *key, aes_decrypt_ctx cx[1])
{
    if(!has_aes_ni())
    {
        return aes_xi(decrypt_key192)(key, cx);
    }

    if(aes_ni(encrypt_key192)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
    {
        enc_to_dec(cx);
        return EXIT_SUCCESS;
    }
    else
        return EXIT_FAILURE;
}

AES_RETURN aes_ni(decrypt_key256)(const unsigned char *key, aes_decrypt_ctx cx[1])
{
    if(!has_aes_ni())
    {
        return aes_xi(decrypt_key256)(key, cx);
    }

    if(aes_ni(encrypt_key256)(key, (aes_encrypt_ctx*)cx) == EXIT_SUCCESS)
    {
        enc_to_dec(cx);
        return EXIT_SUCCESS;
    }
    else
        return EXIT_FAILURE;
}
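
/* Single-block encryption. The switch below relies on deliberate
   fall-through: execution enters at the case matching the key length
   and continues to the end, performing 13, 11 or 9 aesenc rounds plus
   the closing aesenclast.
*/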
AES_RETURN aes_ni(encrypt)(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1])
{
    __m128i *key = (__m128i*)cx->ks, t;

    if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
        return EXIT_FAILURE;

    if(!has_aes_ni())
    {
        return aes_xi(encrypt)(in, out, cx);
    }

    t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *key);

    switch(cx->inf.b[0])
    {
    case 14 * 16:
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        /* fall through */
    case 12 * 16:
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        /* fall through */
    case 10 * 16:
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenc_si128(t, *++key);
        t = _mm_aesenclast_si128(t, *++key);
    }

    _mm_storeu_si128((__m128i*)out, t);
    return EXIT_SUCCESS;
}
AES_RETURN aes_ni(decrypt)(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1])
{
    /* the decryption round keys are consumed from the end of the schedule backwards */
    __m128i *key = (__m128i*)cx->ks + (cx->inf.b[0] >> 4), t;

    if(cx->inf.b[0] != 10 * 16 && cx->inf.b[0] != 12 * 16 && cx->inf.b[0] != 14 * 16)
        return EXIT_FAILURE;

    if(!has_aes_ni())
    {
        return aes_xi(decrypt)(in, out, cx);
    }

    t = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), *key);

    switch(cx->inf.b[0])
    {
    case 14 * 16:
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        /* fall through */
    case 12 * 16:
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        /* fall through */
    case 10 * 16:
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdec_si128(t, *--key);
        t = _mm_aesdeclast_si128(t, *--key);
    }

    _mm_storeu_si128((__m128i*)out, t);
    return EXIT_SUCCESS;
}
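
/* Illustrative use of the single-block interface above (a sketch only;
   the concrete symbol names produced by the aes_ni() macro expansion
   are defined in aes_ni.h):

       unsigned char key[16] = { 0 }, pt[16] = { 0 }, ct[16];
       aes_encrypt_ctx ecx[1];

       aes_ni(encrypt_key128)(key, ecx);   -- expand the 128-bit key
       aes_ni(encrypt)(pt, ct, ecx);       -- encrypt one 16-byte block
*/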
#ifdef ADD_AESNI_MODE_CALLS
#ifdef USE_AES_CONTEXT

AES_RETURN aes_CBC_encrypt(const unsigned char *in,
    unsigned char *out,
    unsigned char ivec[16],
    unsigned long length,
    const aes_encrypt_ctx cx[1])
{
    __m128i feedback, data, *key = (__m128i*)cx->ks;
    int number_of_rounds = cx->inf.b[0] >> 4, j;
    unsigned long i;

    if(number_of_rounds != 10 && number_of_rounds != 12 && number_of_rounds != 14)
        return EXIT_FAILURE;

    if(!has_aes_ni())
    {
        return aes_cbc_encrypt(in, out, length, ivec, cx);
    }

    if(length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        data = _mm_loadu_si128(&((__m128i*)in)[i]);
        feedback = _mm_xor_si128(data, feedback);
        feedback = _mm_xor_si128(feedback, key[0]);
        for(j = 1; j < number_of_rounds; j++)
            feedback = _mm_aesenc_si128(feedback, key[j]);
        feedback = _mm_aesenclast_si128(feedback, key[j]);
        _mm_storeu_si128(&((__m128i*)out)[i], feedback);
    }
    return EXIT_SUCCESS;
}
AES_RETURN aes_CBC_decrypt(const unsigned char *in,
    unsigned char *out,
    unsigned char ivec[16],
    unsigned long length,
    const aes_decrypt_ctx cx[1])
{
    __m128i data, feedback, last_in, *key = (__m128i*)cx->ks;
    int number_of_rounds = cx->inf.b[0] >> 4, j;
    unsigned long i;

    if(number_of_rounds != 10 && number_of_rounds != 12 && number_of_rounds != 14)
        return EXIT_FAILURE;

    if(!has_aes_ni())
    {
        return aes_cbc_decrypt(in, out, length, ivec, cx);
    }

    if(length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        last_in = _mm_loadu_si128(&((__m128i*)in)[i]);
        data = _mm_xor_si128(last_in, key[number_of_rounds]);
        for(j = number_of_rounds - 1; j > 0; j--)
        {
            data = _mm_aesdec_si128(data, key[j]);
        }
        data = _mm_aesdeclast_si128(data, key[0]);
        data = _mm_xor_si128(data, feedback);
        _mm_storeu_si128(&((__m128i*)out)[i], data);
        feedback = last_in;
    }
    return EXIT_SUCCESS;
}
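
/* Counter increment callback for the aes_ctr_crypt fallback: the 32-bit
   word at offset 8 of the counter block is incremented, with any carry
   propagated into the word at offset 12.
*/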
static void ctr_inc(unsigned char *ctr_blk)
{
    uint32_t c;

    c = *(uint32_t*)(ctr_blk + 8);
    c++;
    *(uint32_t*)(ctr_blk + 8) = c;

    if(!c)
        *(uint32_t*)(ctr_blk + 12) = *(uint32_t*)(ctr_blk + 12) + 1;
}
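
/* CTR mode in the style of Intel's AES-NI sample code. The counter
   block is assembled as nonce | IV | 32-bit counter; BSWAP_EPI64
   byte-swaps each 64-bit half so that the big-endian counter can be
   stepped between blocks with a plain _mm_add_epi64. Note that a
   partial final block is rounded up to a full 16-byte load and store,
   so the caller's buffers must be sized to whole blocks.
*/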
AES_RETURN AES_CTR_encrypt(const unsigned char *in,
    unsigned char *out,
    const unsigned char ivec[8],
    const unsigned char nonce[4],
    unsigned long length,
    const aes_encrypt_ctx cx[1])
{
    __m128i ctr_block = { 0 }, *key = (__m128i*)cx->ks, tmp, ONE, BSWAP_EPI64;
    int number_of_rounds = cx->inf.b[0] >> 4, j;
    unsigned long i;

    if(number_of_rounds != 10 && number_of_rounds != 12 && number_of_rounds != 14)
        return EXIT_FAILURE;

    if(!has_aes_ni())
    {
        unsigned char ctr_blk[16];
        *(uint64_t*)ctr_blk = *(uint64_t*)ivec;
        *(uint32_t*)(ctr_blk + 8) = *(uint32_t*)nonce;
        *(uint32_t*)(ctr_blk + 12) = 0;   /* ctr_inc carries into this word, so it must start defined */
        return aes_ctr_crypt(in, out, length, ctr_blk, ctr_inc, cx);
    }

    if(length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    ONE = _mm_set_epi32(0, 1, 0, 0);
    BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);

#ifdef _MSC_VER
    ctr_block = _mm_insert_epi64(ctr_block, *(long long*)ivec, 1);
#else
    ctr_block = _mm_set_epi64(*(__m64*)ivec, *(__m64*)&ctr_block);
#endif
    ctr_block = _mm_insert_epi32(ctr_block, *(long*)nonce, 1);
    ctr_block = _mm_srli_si128(ctr_block, 4);
    ctr_block = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
    ctr_block = _mm_add_epi64(ctr_block, ONE);

    for(i = 0; i < length; i++)
    {
        tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
        ctr_block = _mm_add_epi64(ctr_block, ONE);
        tmp = _mm_xor_si128(tmp, key[0]);
        for(j = 1; j < number_of_rounds; j++)
        {
            tmp = _mm_aesenc_si128(tmp, key[j]);
        }
        tmp = _mm_aesenclast_si128(tmp, key[j]);
        tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((__m128i*)in)[i]));
        _mm_storeu_si128(&((__m128i*)out)[i], tmp);
    }
    return EXIT_SUCCESS;
}
#else
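
/* When USE_AES_CONTEXT is not defined, these variants take a raw,
   pre-expanded key schedule and an explicit round count, as in Intel's
   reference code. Note that aes_CBC_decrypt below indexes the schedule
   forward from key[0], so the caller must supply round keys already
   ordered for decryption.
*/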
void aes_CBC_encrypt(const unsigned char *in,
    unsigned char *out,
    unsigned char ivec[16],
    unsigned long length,
    unsigned char *key,
    int number_of_rounds)
{
    __m128i feedback, data;
    unsigned long i;
    int j;

    if(length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        data = _mm_loadu_si128(&((__m128i*)in)[i]);
        feedback = _mm_xor_si128(data, feedback);
        feedback = _mm_xor_si128(feedback, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
            feedback = _mm_aesenc_si128(feedback, ((__m128i*)key)[j]);
        feedback = _mm_aesenclast_si128(feedback, ((__m128i*)key)[j]);
        _mm_storeu_si128(&((__m128i*)out)[i], feedback);
    }
}
void aes_CBC_decrypt(const unsigned char *in,
    unsigned char *out,
    unsigned char ivec[16],
    unsigned long length,
    unsigned char *key,
    int number_of_rounds)
{
    __m128i data, feedback, last_in;
    unsigned long i;
    int j;

    if(length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    feedback = _mm_loadu_si128((__m128i*)ivec);
    for(i = 0; i < length; i++)
    {
        last_in = _mm_loadu_si128(&((__m128i*)in)[i]);
        data = _mm_xor_si128(last_in, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
        {
            data = _mm_aesdec_si128(data, ((__m128i*)key)[j]);
        }
        data = _mm_aesdeclast_si128(data, ((__m128i*)key)[j]);
        data = _mm_xor_si128(data, feedback);
        _mm_storeu_si128(&((__m128i*)out)[i], data);
        feedback = last_in;
    }
}
void AES_CTR_encrypt(const unsigned char *in,
    unsigned char *out,
    const unsigned char ivec[8],
    const unsigned char nonce[4],
    unsigned long length,
    const unsigned char *key,
    int number_of_rounds)
{
    __m128i ctr_block = { 0 }, tmp, ONE, BSWAP_EPI64;
    unsigned long i;
    int j;

    if(length % 16)
        length = length / 16 + 1;
    else
        length /= 16;

    ONE = _mm_set_epi32(0, 1, 0, 0);
    BSWAP_EPI64 = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);

#ifdef _MSC_VER
    ctr_block = _mm_insert_epi64(ctr_block, *(long long*)ivec, 1);
#else
    ctr_block = _mm_set_epi64(*(__m64*)ivec, *(__m64*)&ctr_block);
#endif
    ctr_block = _mm_insert_epi32(ctr_block, *(long*)nonce, 1);
    ctr_block = _mm_srli_si128(ctr_block, 4);
    ctr_block = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
    ctr_block = _mm_add_epi64(ctr_block, ONE);

    for(i = 0; i < length; i++)
    {
        tmp = _mm_shuffle_epi8(ctr_block, BSWAP_EPI64);
        ctr_block = _mm_add_epi64(ctr_block, ONE);
        tmp = _mm_xor_si128(tmp, ((__m128i*)key)[0]);
        for(j = 1; j < number_of_rounds; j++)
        {
            tmp = _mm_aesenc_si128(tmp, ((__m128i*)key)[j]);
        }
        tmp = _mm_aesenclast_si128(tmp, ((__m128i*)key)[j]);
        tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((__m128i*)in)[i]));
        _mm_storeu_si128(&((__m128i*)out)[i], tmp);
    }
}
#endif   /* USE_AES_CONTEXT */
#endif   /* ADD_AESNI_MODE_CALLS */
#endif   /* USE_INTEL_AES_IF_PRESENT */