aes_asm.s 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. /* aes_asm.s
  2. *
  3. * Copyright (C) 2006-2011 Sawtooth Consulting Ltd.
  4. *
  5. * This file is part of CyaSSL.
  6. *
  7. * CyaSSL is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * CyaSSL is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  20. */
  21. /* See Intel® Advanced Encryption Standard (AES) Instructions Set White Paper
  22. * by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
  23. */
  /*
   * void AES_CBC_encrypt(const unsigned char *in,
   *                      unsigned char *out,
   *                      unsigned char ivec[16],
   *                      unsigned long length,
   *                      const unsigned char *KS,
   *                      int nr);
   *
   * CBC-mode encryption with the AES-NI instructions (GAS, AT&T syntax).
   *
   *   %rdi  in      plaintext, any alignment
   *   %rsi  out     ciphertext, any alignment
   *   %rdx  ivec    16-byte IV, any alignment; read only, never written back
   *   %rcx  length  byte count; a trailing partial block is rounded UP to a
   *                 whole block, so both buffers must extend to the next
   *                 multiple of 16 bytes
   *   %r8   KS      round keys, 16 bytes each; must be 16-byte aligned
   *                 because they are used as SSE memory operands
   *   %r9d  nr      compared as unsigned: below 12 runs 10 rounds,
   *                 12 or 13 runs 12 rounds, anything else runs 14 rounds
   *
   * Returns at once, touching no memory, when length is 0.
   * Clobbers %rcx, %rsi, %rdi, %r10, %xmm1, %xmm2 and the flags.
   */
  .globl AES_CBC_encrypt
  AES_CBC_encrypt:
          movq    %rcx, %r10
          shrq    $4, %rcx                # rcx = whole 16-byte blocks
          testq   $15, %r10               # leftover bytes?
          je      .Lcbc_enc_counted
          addq    $1, %rcx                # the partial block counts as one more
  .Lcbc_enc_counted:
          testq   %rcx, %rcx
          je      .Lcbc_enc_done          # zero blocks: do not enter the loop
          movdqu  (%rdx), %xmm1           # xmm1 = chaining value, starts as the IV

  .Lcbc_enc_block:
          movdqu  (%rdi), %xmm2           # unaligned-safe plaintext load
          addq    $16, %rdi
          pxor    %xmm2, %xmm1            # plaintext xor previous ciphertext (or IV)
          pxor    (%r8), %xmm1            # initial AddRoundKey
          aesenc  16(%r8), %xmm1
          aesenc  32(%r8), %xmm1
          aesenc  48(%r8), %xmm1
          aesenc  64(%r8), %xmm1
          aesenc  80(%r8), %xmm1
          aesenc  96(%r8), %xmm1
          aesenc  112(%r8), %xmm1
          aesenc  128(%r8), %xmm1
          aesenc  144(%r8), %xmm1
          movdqa  160(%r8), %xmm2         # final key when 10 rounds are used
          cmpl    $12, %r9d
          jb      .Lcbc_enc_final
          aesenc  160(%r8), %xmm1
          aesenc  176(%r8), %xmm1
          movdqa  192(%r8), %xmm2         # final key when 12 rounds are used
          cmpl    $14, %r9d
          jb      .Lcbc_enc_final
          aesenc  192(%r8), %xmm1
          aesenc  208(%r8), %xmm1
          movdqa  224(%r8), %xmm2         # final key when 14 rounds are used
  .Lcbc_enc_final:
          aesenclast %xmm2, %xmm1         # xmm1 = ciphertext = next chaining value
          movdqu  %xmm1, (%rsi)
          addq    $16, %rsi
          decq    %rcx
          jne     .Lcbc_enc_block
  .Lcbc_enc_done:
          ret
  /*
   * void AES_CBC_decrypt(const unsigned char *in,
   *                      unsigned char *out,
   *                      unsigned char ivec[16],
   *                      unsigned long length,
   *                      const unsigned char *KS,
   *                      int nr);
   *
   * CBC-mode decryption with the AES-NI instructions (GAS, AT&T syntax).
   * Blocks are handled four at a time, then one at a time for the last
   * zero to three blocks.
   *
   *   %rdi  in      ciphertext, any alignment
   *   %rsi  out     plaintext, any alignment
   *   %rdx  ivec    16-byte IV, any alignment; read only, never written back
   *   %rcx  length  byte count; a trailing partial block is rounded UP to a
   *                 whole block; length 0 returns without touching memory
   *   %r8   KS      round keys in the order they are applied: KS+0 is xored
   *                 in first, the following keys feed aesdec, and the key at
   *                 offset 160, 192 or 224 feeds aesdeclast; must be
   *                 16-byte aligned
   *   %r9d  nr      compared as unsigned: below 12 runs 10 rounds,
   *                 12 or 13 runs 12 rounds, anything else runs 14 rounds
   *
   * Clobbers %rcx, %rsi, %rdi, %r10, %xmm1-%xmm9, %xmm11, %xmm15, flags.
   */

  /* One aesdec round on all four lanes, using the round key at KS+off. */
  .macro AES_CBC_DEC4_ROUND off
          movdqa  \off(%r8), %xmm9
          aesdec  %xmm9, %xmm1
          aesdec  %xmm9, %xmm2
          aesdec  %xmm9, %xmm3
          aesdec  %xmm9, %xmm4
  .endm

  .globl AES_CBC_decrypt
  AES_CBC_decrypt:
          movq    %rcx, %r10
          shrq    $4, %rcx                # rcx = whole 16-byte blocks
          testq   $15, %r10               # leftover bytes?
          je      .Lcbc_dec_counted
          addq    $1, %rcx                # the partial block counts as one more
  .Lcbc_dec_counted:
          movq    %rcx, %r10
          andq    $3, %r10                # r10 = blocks left for the single loop
          shrq    $2, %rcx                # rcx = groups of four; ZF set when none
          movdqu  (%rdx), %xmm5           # xmm5 = chaining value (flags untouched)
          je      .Lcbc_dec_tail

  .Lcbc_dec_quad:
          movdqu  (%rdi), %xmm1
          movdqu  16(%rdi), %xmm2
          movdqu  32(%rdi), %xmm3
          movdqu  48(%rdi), %xmm4
          movdqa  %xmm1, %xmm6            # keep the ciphertext: block i is the
          movdqa  %xmm2, %xmm7            # chaining value for block i+1
          movdqa  %xmm3, %xmm8
          movdqa  %xmm4, %xmm15           # becomes xmm5 for the next group
          movdqa  (%r8), %xmm9
          pxor    %xmm9, %xmm1
          pxor    %xmm9, %xmm2
          pxor    %xmm9, %xmm3
          pxor    %xmm9, %xmm4
          AES_CBC_DEC4_ROUND 16
          AES_CBC_DEC4_ROUND 32
          AES_CBC_DEC4_ROUND 48
          AES_CBC_DEC4_ROUND 64
          AES_CBC_DEC4_ROUND 80
          AES_CBC_DEC4_ROUND 96
          AES_CBC_DEC4_ROUND 112
          AES_CBC_DEC4_ROUND 128
          AES_CBC_DEC4_ROUND 144
          movdqa  160(%r8), %xmm11        # final key when 10 rounds are used
          cmpl    $12, %r9d
          jb      .Lcbc_dec_quad_final
          AES_CBC_DEC4_ROUND 160
          AES_CBC_DEC4_ROUND 176
          movdqa  192(%r8), %xmm11        # final key when 12 rounds are used
          cmpl    $14, %r9d
          jb      .Lcbc_dec_quad_final
          AES_CBC_DEC4_ROUND 192
          AES_CBC_DEC4_ROUND 208
          movdqa  224(%r8), %xmm11        # final key when 14 rounds are used
  .Lcbc_dec_quad_final:
          aesdeclast %xmm11, %xmm1
          aesdeclast %xmm11, %xmm2
          aesdeclast %xmm11, %xmm3
          aesdeclast %xmm11, %xmm4
          pxor    %xmm5, %xmm1            # undo the CBC chaining
          pxor    %xmm6, %xmm2
          pxor    %xmm7, %xmm3
          pxor    %xmm8, %xmm4
          movdqu  %xmm1, (%rsi)
          movdqu  %xmm2, 16(%rsi)
          movdqu  %xmm3, 32(%rsi)
          movdqu  %xmm4, 48(%rsi)
          movdqa  %xmm15, %xmm5
          addq    $64, %rdi
          addq    $64, %rsi
          decq    %rcx                    # must be the last flag writer before jne
          jne     .Lcbc_dec_quad

  .Lcbc_dec_tail:
          testq   %r10, %r10
          je      .Lcbc_dec_done
  .Lcbc_dec_single:
          movdqu  (%rdi), %xmm1
          addq    $16, %rdi
          movdqa  %xmm1, %xmm15           # ciphertext kept for the next block
          pxor    (%r8), %xmm1
  .irp off, 16, 32, 48, 64, 80, 96, 112, 128, 144
          aesdec  \off(%r8), %xmm1
  .endr
          movdqu  160(%r8), %xmm2         # final key when 10 rounds are used
          cmpl    $12, %r9d
          jb      .Lcbc_dec_single_final
          aesdec  160(%r8), %xmm1
          aesdec  176(%r8), %xmm1
          movdqu  192(%r8), %xmm2         # final key when 12 rounds are used
          cmpl    $14, %r9d
          jb      .Lcbc_dec_single_final
          aesdec  192(%r8), %xmm1
          aesdec  208(%r8), %xmm1
          movdqu  224(%r8), %xmm2         # final key when 14 rounds are used
  .Lcbc_dec_single_final:
          aesdeclast %xmm2, %xmm1
          pxor    %xmm5, %xmm1            # undo the CBC chaining
          movdqu  %xmm1, (%rsi)
          movdqa  %xmm15, %xmm5
          addq    $16, %rsi
          decq    %r10
          jne     .Lcbc_dec_single
  .Lcbc_dec_done:
          ret

  .purgem AES_CBC_DEC4_ROUND
  /*
   * void AES_128_Key_Expansion(const unsigned char *userkey,
   *                            unsigned char *key_schedule);
   *
   * Expands a 16-byte AES key into eleven 16-byte round keys.
   *
   *   %rdi  userkey       16 key bytes, any alignment
   *   %rsi  key_schedule  must be 16-byte aligned and at least 244 bytes:
   *                       round keys go to offsets 0..175 and the 32-bit
   *                       value 10 is stored at offset 240
   *
   * Clobbers %xmm1, %xmm2, %xmm3 and the flags.
   */

  /* Derive the next round key from xmm1 and store it at key_schedule+off. */
  .macro AES128_EXPAND_STEP rcon, off
          aeskeygenassist $\rcon, %xmm1, %xmm2
          call    PREPARE_ROUNDKEY_128
          movdqa  %xmm1, \off(%rsi)
  .endm

  .align 16,0x90
  .globl AES_128_Key_Expansion
  AES_128_Key_Expansion:
          movl    $10, 240(%rsi)          # value 10 recorded after the key area
          movdqu  (%rdi), %xmm1
          movdqa  %xmm1, (%rsi)           # round key 0 is the user key itself
          AES128_EXPAND_STEP 0x01, 16
          AES128_EXPAND_STEP 0x02, 32
          AES128_EXPAND_STEP 0x04, 48
          AES128_EXPAND_STEP 0x08, 64
          AES128_EXPAND_STEP 0x10, 80
          AES128_EXPAND_STEP 0x20, 96
          AES128_EXPAND_STEP 0x40, 112
          AES128_EXPAND_STEP 0x80, 128
          AES128_EXPAND_STEP 0x1b, 144
          AES128_EXPAND_STEP 0x36, 160
          ret

  /*
   * Private helper, register convention only:
   *   in   xmm1 = previous round key, xmm2 = aeskeygenassist result
   *   out  xmm1 = next round key
   *   scratch xmm2, xmm3
   */
  PREPARE_ROUNDKEY_128:
          pshufd  $0xff, %xmm2, %xmm2     # broadcast the top dword of the assist
          movdqa  %xmm1, %xmm3
          pslldq  $4, %xmm3
          pxor    %xmm3, %xmm1            # k ^ k<<32
          movdqa  %xmm1, %xmm3
          pslldq  $8, %xmm3
          pxor    %xmm3, %xmm1            # ... ^ k<<64 ^ k<<96 (running xor of dwords)
          pxor    %xmm2, %xmm1
          ret

  .purgem AES128_EXPAND_STEP
  /*
   * void AES_192_Key_Expansion(const unsigned char *userkey,
   *                            unsigned char *key);
   *
   * Expands a 24-byte AES key into thirteen 16-byte round keys.
   *
   *   %rdi  userkey  24 key bytes, any alignment; exactly 24 bytes are read
   *   %rsi  key      must be 16-byte aligned and at least 224 bytes: the
   *                  round keys occupy offsets 0..207 and the final store
   *                  also writes the 16 bytes at offset 208
   *
   * Clobbers %xmm1-%xmm6 and the flags.
   */

  /*
   * Two expansion steps produce 48 bytes of schedule at key+off.
   * xmm1 holds the current four key words and the low half of xmm3 the
   * following two; shufpd stitches the 64-bit halves into 16-byte keys.
   */
  .macro AES192_EXPAND_PAIR rcon_a, rcon_b, off
          movdqa  %xmm3, %xmm5            # keep the two words that precede the step
          aeskeygenassist $\rcon_a, %xmm3, %xmm2
          call    PREPARE_ROUNDKEY_192
          shufpd  $0, %xmm1, %xmm5        # old xmm3 low half | new xmm1 low half
          movdqa  %xmm5, \off(%rsi)
          movdqa  %xmm1, %xmm6
          shufpd  $1, %xmm3, %xmm6        # new xmm1 high half | new xmm3 low half
          movdqa  %xmm6, \off+16(%rsi)
          aeskeygenassist $\rcon_b, %xmm3, %xmm2
          call    PREPARE_ROUNDKEY_192
          movdqa  %xmm1, \off+32(%rsi)
  .endm

  .globl AES_192_Key_Expansion
  AES_192_Key_Expansion:
          movdqu  (%rdi), %xmm1           # key bytes 0..15
          movq    16(%rdi), %xmm3         # key bytes 16..23 only; upper half zeroed
                                          # (a 16-byte load here would read past the key)
          movdqa  %xmm1, (%rsi)
          AES192_EXPAND_PAIR 0x01, 0x02, 16
          AES192_EXPAND_PAIR 0x04, 0x08, 64
          AES192_EXPAND_PAIR 0x10, 0x20, 112
          AES192_EXPAND_PAIR 0x40, 0x80, 160
          movdqa  %xmm3, 208(%rsi)        # trailing words, beyond the 13 round keys
          ret

  /*
   * Private helper, register convention only:
   *   in   xmm1 = four key words, xmm3 low half = next two key words,
   *        xmm2 = aeskeygenassist result computed from xmm3
   *   out  xmm1 and xmm3 low half advanced by six key words
   *   scratch xmm2, xmm4; the high half of xmm3 is not meaningful
   */
  PREPARE_ROUNDKEY_192:
          pshufd  $0x55, %xmm2, %xmm2     # broadcast dword 1 of the assist
          movdqa  %xmm1, %xmm4
          pslldq  $4, %xmm4
          pxor    %xmm4, %xmm1
          pslldq  $4, %xmm4
          pxor    %xmm4, %xmm1
          pslldq  $4, %xmm4
          pxor    %xmm4, %xmm1            # running xor across the four dwords
          pxor    %xmm2, %xmm1
          pshufd  $0xff, %xmm1, %xmm2     # broadcast the newest word
          movdqa  %xmm3, %xmm4
          pslldq  $4, %xmm4
          pxor    %xmm4, %xmm3
          pxor    %xmm2, %xmm3
          ret

  .purgem AES192_EXPAND_PAIR
  /*
   * void AES_256_Key_Expansion(const unsigned char *userkey,
   *                            unsigned char *key);
   *
   * Expands a 32-byte AES key into fifteen 16-byte round keys.
   *
   *   %rdi  userkey  32 key bytes, any alignment
   *   %rsi  key      must be 16-byte aligned and at least 240 bytes;
   *                  round keys are written to offsets 0..239
   *
   * Clobbers %xmm1-%xmm4 and the flags.
   */

  /*
   * Produce two round keys: the even one (in xmm1) goes to key+off and
   * the odd one (in xmm3) to key+off+16.
   */
  .macro AES256_EXPAND_PAIR rcon, off
          aeskeygenassist $\rcon, %xmm3, %xmm2
          call    MAKE_RK256_a
          movdqa  %xmm1, \off(%rsi)
          aeskeygenassist $0x0, %xmm1, %xmm2
          call    MAKE_RK256_b
          movdqa  %xmm3, \off+16(%rsi)
  .endm

  .globl AES_256_Key_Expansion
  AES_256_Key_Expansion:
          movdqu  (%rdi), %xmm1           # key bytes 0..15
          movdqu  16(%rdi), %xmm3         # key bytes 16..31
          movdqa  %xmm1, (%rsi)
          movdqa  %xmm3, 16(%rsi)
          AES256_EXPAND_PAIR 0x01, 32
          AES256_EXPAND_PAIR 0x02, 64
          AES256_EXPAND_PAIR 0x04, 96
          AES256_EXPAND_PAIR 0x08, 128
          AES256_EXPAND_PAIR 0x10, 160
          AES256_EXPAND_PAIR 0x20, 192
          aeskeygenassist $0x40, %xmm3, %xmm2
          call    MAKE_RK256_a
          movdqa  %xmm1, 224(%rsi)        # last round key; it has no odd partner
          ret

  /*
   * Private helper, register convention only:
   *   in   xmm1 = round key two steps back, xmm2 = assist computed from xmm3
   *   out  xmm1 = next even round key
   *   scratch xmm2, xmm4
   */
  MAKE_RK256_a:
          pshufd  $0xff, %xmm2, %xmm2     # broadcast dword 3 of the assist
          movdqa  %xmm1, %xmm4
          pslldq  $4, %xmm4
          pxor    %xmm4, %xmm1            # k ^ k<<32
          movdqa  %xmm1, %xmm4
          pslldq  $8, %xmm4
          pxor    %xmm4, %xmm1            # ... ^ k<<64 ^ k<<96 (running xor of dwords)
          pxor    %xmm2, %xmm1
          ret

  /*
   * Private helper, register convention only:
   *   in   xmm3 = round key two steps back, xmm2 = assist computed from xmm1
   *   out  xmm3 = next odd round key
   *   scratch xmm2, xmm4
   */
  MAKE_RK256_b:
          pshufd  $0xaa, %xmm2, %xmm2     # broadcast dword 2 of the assist
          movdqa  %xmm3, %xmm4
          pslldq  $4, %xmm4
          pxor    %xmm4, %xmm3            # k ^ k<<32
          movdqa  %xmm3, %xmm4
          pslldq  $8, %xmm4
          pxor    %xmm4, %xmm3            # ... ^ k<<64 ^ k<<96 (running xor of dwords)
          pxor    %xmm2, %xmm3
          ret

  .purgem AES256_EXPAND_PAIR