aes_asm.asm 40 KB


  1. ; /* aes_asm.asm
  2. ; *
  3. ; * Copyright (C) 2006-2023 wolfSSL Inc.
  4. ; *
  5. ; * This file is part of wolfSSL.
  6. ; *
  7. ; * wolfSSL is free software; you can redistribute it and/or modify
  8. ; * it under the terms of the GNU General Public License as published by
  9. ; * the Free Software Foundation; either version 2 of the License, or
  10. ; * (at your option) any later version.
  11. ; *
  12. ; * wolfSSL is distributed in the hope that it will be useful,
  13. ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. ; * GNU General Public License for more details.
  16. ; *
  17. ; * You should have received a copy of the GNU General Public License
  18. ; * along with this program; if not, write to the Free Software
  19. ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
  20. ; */
  21. ;
  22. ;
  23. ; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper
  24. ; * by Israel, Intel Mobility Group Development Center, Israel Shay Gueron
  25. ; */
  26. ;
  27. ; /* This file is in intel asm syntax, see .s for at&t syntax */
  28. ;
  ; Work out the FIPS level this object is assembled for:
  ;   HAVE_FIPS undefined                      -> fips_version = 0
  ;   HAVE_FIPS defined, no HAVE_FIPS_VERSION  -> fips_version = 1
  ;   HAVE_FIPS and HAVE_FIPS_VERSION defined  -> fips_version = HAVE_FIPS_VERSION
  IFDEF HAVE_FIPS
  IFDEF HAVE_FIPS_VERSION
  fips_version = HAVE_FIPS_VERSION
  ELSE
  fips_version = 1
  ENDIF
  ELSE
  fips_version = 0
  ENDIF

  ; FIPS level 2 and above places the code in the ".fipsA$h" section;
  ; every other build uses the _text segment.
  IF fips_version GE 2
  fipsAh SEGMENT ALIAS(".fipsA$h") 'CODE'
  ELSE
  _text SEGMENT
  ENDIF
  ;-----------------------------------------------------------------------
  ; void AES_CBC_encrypt_AESNI(const unsigned char* in,
  ;                            unsigned char* out,
  ;                            unsigned char ivec[16],
  ;                            unsigned long length,
  ;                            const unsigned char* KS,
  ;                            int nr)
  ;
  ; CBC-encrypts ceil(length / 16) blocks from in to out, one block per
  ; loop pass. A length of 0 returns without touching any buffer.
  ;
  ; Entry (Microsoft x64): rcx = in, rdx = out, r8 = ivec, r9 = length,
  ;                        [rsp+40] = KS, [rsp+48] = nr
  ; Working registers (layout of the AT&T version of this file):
  ;   rdi = in, rsi = out, rdx = ivec, rcx = blocks left, r8 = KS, r9d = nr
  ;   xmm0 = plaintext block, xmm1 = chaining value, xmm2 = last round key
  ;
  ; nr < 12 runs 10 rounds, nr < 14 runs 12 rounds, anything else 14 rounds.
  ; KS is used as an aligned memory operand and must be 16-byte aligned;
  ; in, out and ivec may have any alignment. ivec is only read, never
  ; written back.
  ; Clobbers: rax, rcx, rdx, r8-r11, xmm0-xmm2, flags.
  ;-----------------------------------------------------------------------
  AES_CBC_encrypt_AESNI PROC
        ; rdi and rsi are non-volatile in this ABI: park them in rax / r11
        mov         rax, rdi
        mov         r11, rsi
        ; shuffle the arguments into the AT&T-version register layout
        mov         rdi, rcx                ; in
        mov         rsi, rdx                ; out
        mov         rdx, r8                 ; ivec
        mov         rcx, r9                 ; length in bytes
        mov         r8, [rsp+40]            ; KS
        mov         r9d, [rsp+48]           ; nr
        ; rcx = number of blocks, a trailing partial block counts as one
        mov         r10, rcx
        shr         rcx, 4
        shl         r10, 60                 ; ZF set when length % 16 == 0
        je          NO_PARTS
        add         rcx, 1
  NO_PARTS:
        test        rcx, rcx
        je          CBC_ENC_DONE            ; zero blocks: the loop below must not run
        sub         rsi, 16                 ; pre-bias, the loop advances rsi before storing
        movdqu      xmm1, [rdx]             ; chaining value starts as the IV
  LOOP_1:
        movdqu      xmm0, [rdi]             ; unaligned-safe plaintext load
        pxor        xmm1, xmm0              ; plaintext XOR previous ciphertext
        pxor        xmm1, [r8]              ; round 0 key
        add         rsi, 16
        add         rdi, 16
        cmp         r9d, 12                 ; flags are consumed by the jb after the rounds
        aesenc      xmm1, [r8+16]
        aesenc      xmm1, [r8+32]
        aesenc      xmm1, [r8+48]
        aesenc      xmm1, [r8+64]
        aesenc      xmm1, [r8+80]
        aesenc      xmm1, [r8+96]
        aesenc      xmm1, [r8+112]
        aesenc      xmm1, [r8+128]
        aesenc      xmm1, [r8+144]
        movdqa      xmm2, [r8+160]          ; last key for the 10-round case
        jb          LAST
        cmp         r9d, 14
        aesenc      xmm1, [r8+160]
        aesenc      xmm1, [r8+176]
        movdqa      xmm2, [r8+192]          ; last key for the 12-round case
        jb          LAST
        aesenc      xmm1, [r8+192]
        aesenc      xmm1, [r8+208]
        movdqa      xmm2, [r8+224]          ; last key for the 14-round case
  LAST:
        dec         rcx                     ; sets ZF for the jne below
        aesenclast  xmm1, xmm2
        movdqu      [rsi], xmm1
        jne         LOOP_1
  CBC_ENC_DONE:
        ; give the caller back its rdi / rsi
        mov         rdi, rax
        mov         rsi, r11
        ret
  AES_CBC_encrypt_AESNI ENDP
  ;-----------------------------------------------------------------------
  ; void AES_CBC_decrypt_AESNI_by4(const unsigned char* in,
  ;                                unsigned char* out,
  ;                                unsigned char ivec[16],
  ;                                unsigned long length,
  ;                                const unsigned char* KS,
  ;                                int nr)
  ;
  ; CBC decryption, four blocks per main-loop pass; the 0..3 blocks left
  ; over are decrypted one at a time.
  ;
  ; Entry (Microsoft x64): rcx = in, rdx = out, r8 = ivec, r9 = length,
  ;                        [rsp+40] = KS, [rsp+48] = nr
  ; Working registers (layout of the AT&T version of this file):
  ;   rdi = in, rsi = out, rdx = ivec, r8 = KS, r9d = nr
  ;   rcx = groups of four blocks left, r10 = single blocks left
  ;   xmm1-xmm4  = blocks being decrypted
  ;   xmm5       = previous ciphertext block (initially the IV)
  ;   xmm6-xmm8, xmm15 = copies of the current ciphertext blocks
  ;   xmm9-xmm12 = round keys
  ; nr < 12 runs 10 rounds, nr < 14 runs 12 rounds, anything else 14 rounds.
  ; ivec is only read, never written back.
  ;-----------------------------------------------------------------------
  AES_CBC_decrypt_AESNI_by4 PROC
        ; rdi and rsi are non-volatile in this ABI: park them in rax / r11
        mov         rax, rdi
        mov         r11, rsi
        ; shuffle the arguments into the AT&T-version register layout
        mov         rdi, rcx                ; in
        mov         rsi, rdx                ; out
        mov         rdx, r8                 ; ivec
        mov         rcx, r9                 ; length in bytes
        mov         r8, [rsp+40]            ; KS
        mov         r9d, [rsp+48]           ; nr
        ; xmm6-xmm15 are non-volatile on Windows: spill the ones used here.
        ; 8 bytes of padding re-align rsp to 16, then 8 slots of 16 bytes.
        sub         rsp, 8+8*16
        movdqa      [rsp], xmm6
        movdqa      [rsp+16], xmm7
        movdqa      [rsp+32], xmm8
        movdqa      [rsp+48], xmm9
        movdqa      [rsp+64], xmm10
        movdqa      [rsp+80], xmm11
        movdqa      [rsp+96], xmm12
        movdqa      [rsp+112], xmm15
        ; rcx = number of blocks, a trailing partial block counts as one
        mov         r10, rcx
        shr         rcx, 4
        shl         r10, 60                 ; ZF set when length % 16 == 0
        je          DNO_PARTS_4
        add         rcx, 1
  DNO_PARTS_4:
        ; r10 = blocks % 4, rcx = blocks / 4
        mov         r10, rcx
        shl         r10, 62
        shr         r10, 62
        shr         rcx, 2                  ; ZF from this shift feeds the je below
        movdqu      xmm5, [rdx]             ; chaining value starts as the IV
        je          DREMAINDER_4
        sub         rsi, 64                 ; pre-bias, the loop advances rsi before storing
  DLOOP_4:
        movdqu      xmm1, [rdi]
        movdqu      xmm2, [rdi+16]
        movdqu      xmm3, [rdi+32]
        movdqu      xmm4, [rdi+48]
        ; keep the ciphertext: it is the XOR mask for the following block
        movdqa      xmm6, xmm1
        movdqa      xmm7, xmm2
        movdqa      xmm8, xmm3
        movdqa      xmm15, xmm4
        movdqa      xmm9, [r8]
        movdqa      xmm10, [r8+16]
        movdqa      xmm11, [r8+32]
        movdqa      xmm12, [r8+48]
        pxor        xmm1, xmm9
        pxor        xmm2, xmm9
        pxor        xmm3, xmm9
        pxor        xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm1, xmm12
        aesdec      xmm2, xmm12
        aesdec      xmm3, xmm12
        aesdec      xmm4, xmm12
        movdqa      xmm9, [r8+64]
        movdqa      xmm10, [r8+80]
        movdqa      xmm11, [r8+96]
        movdqa      xmm12, [r8+112]
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm1, xmm12
        aesdec      xmm2, xmm12
        aesdec      xmm3, xmm12
        aesdec      xmm4, xmm12
        movdqa      xmm9, [r8+128]
        movdqa      xmm10, [r8+144]
        movdqa      xmm11, [r8+160]         ; last key for the 10-round case
        cmp         r9d, 12                 ; flags are consumed by the jb after the rounds
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        jb          DLAST_4
        movdqa      xmm9, [r8+160]
        movdqa      xmm10, [r8+176]
        movdqa      xmm11, [r8+192]         ; last key for the 12-round case
        cmp         r9d, 14
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        jb          DLAST_4
        movdqa      xmm9, [r8+192]
        movdqa      xmm10, [r8+208]
        movdqa      xmm11, [r8+224]         ; last key for the 14-round case
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
  DLAST_4:
        add         rdi, 64
        add         rsi, 64
        dec         rcx                     ; sets ZF for the jne at the end of the pass
        aesdeclast  xmm1, xmm11
        aesdeclast  xmm2, xmm11
        aesdeclast  xmm3, xmm11
        aesdeclast  xmm4, xmm11
        ; undo the chaining: each block XOR the ciphertext block before it
        pxor        xmm1, xmm5
        pxor        xmm2, xmm6
        pxor        xmm3, xmm7
        pxor        xmm4, xmm8
        movdqu      [rsi], xmm1
        movdqu      [rsi+16], xmm2
        movdqu      [rsi+32], xmm3
        movdqu      [rsi+48], xmm4
        movdqa      xmm5, xmm15             ; last ciphertext block chains into the next pass
        jne         DLOOP_4
        add         rsi, 64                 ; step past the group just written
  DREMAINDER_4:
        test        r10, r10
        je          DEND_4
  DLOOP_4_2:
        movdqu      xmm1, [rdi]
        movdqa      xmm15, xmm1             ; keep the ciphertext for the next block
        add         rdi, 16
        pxor        xmm1, [r8]
        movdqu      xmm2, [r8+160]          ; last key for the 10-round case
        cmp         r9d, 12
        aesdec      xmm1, [r8+16]
        aesdec      xmm1, [r8+32]
        aesdec      xmm1, [r8+48]
        aesdec      xmm1, [r8+64]
        aesdec      xmm1, [r8+80]
        aesdec      xmm1, [r8+96]
        aesdec      xmm1, [r8+112]
        aesdec      xmm1, [r8+128]
        aesdec      xmm1, [r8+144]
        jb          DLAST_4_2
        movdqu      xmm2, [r8+192]          ; last key for the 12-round case
        cmp         r9d, 14
        aesdec      xmm1, [r8+160]
        aesdec      xmm1, [r8+176]
        jb          DLAST_4_2
        movdqu      xmm2, [r8+224]          ; last key for the 14-round case
        aesdec      xmm1, [r8+192]
        aesdec      xmm1, [r8+208]
  DLAST_4_2:
        aesdeclast  xmm1, xmm2
        pxor        xmm1, xmm5
        movdqa      xmm5, xmm15
        movdqu      [rsi], xmm1
        add         rsi, 16
        dec         r10
        jne         DLOOP_4_2
  DEND_4:
        ; give the caller back its rdi / rsi
        mov         rdi, rax
        mov         rsi, r11
        ; reload the spilled non-volatile xmm registers and drop the frame
        movdqa      xmm6, [rsp]
        movdqa      xmm7, [rsp+16]
        movdqa      xmm8, [rsp+32]
        movdqa      xmm9, [rsp+48]
        movdqa      xmm10, [rsp+64]
        movdqa      xmm11, [rsp+80]
        movdqa      xmm12, [rsp+96]
        movdqa      xmm15, [rsp+112]
        add         rsp, 8+8*16
        ret
  AES_CBC_decrypt_AESNI_by4 ENDP
  ;-----------------------------------------------------------------------
  ; void AES_CBC_decrypt_AESNI_by6(const unsigned char *in,
  ;                                unsigned char *out,
  ;                                unsigned char ivec[16],
  ;                                unsigned long length,
  ;                                const unsigned char *KS,
  ;                                int nr)
  ;
  ; CBC decryption, six blocks per main-loop pass; the 0..5 blocks left
  ; over are decrypted one at a time.
  ;
  ; Entry (Microsoft x64): rcx = in, rdx = out, r8 = ivec, r9 = length,
  ;                        [rsp+40] = KS, [rsp+48] = nr
  ; Working registers (layout of the AT&T version of this file):
  ;   rdi = in, rsi = out, r8 = KS, r9d = nr
  ;   rdx = ivec until the IV has been loaded, then scratch for div
  ;   rcx = groups of six blocks left, r10 = single blocks left
  ;   xmm1-xmm6   = blocks being decrypted
  ;   xmm7        = previous ciphertext block (initially the IV)
  ;   xmm8-xmm11  = round keys; xmm8-xmm13 = ciphertext re-read for the XOR
  ; nr < 12 runs 10 rounds, nr < 14 runs 12 rounds, anything else 14 rounds.
  ; ivec is only read, never written back.
  ; Only volatile general-purpose registers are modified; rdi, rsi and
  ; xmm6-xmm14 are restored before returning.
  ;-----------------------------------------------------------------------
  AES_CBC_decrypt_AESNI_by6 PROC
        ; rdi and rsi are non-volatile in this ABI: park them in rax / r11
        mov         rax, rdi
        mov         r11, rsi
        ; shuffle the arguments into the AT&T-version register layout
        mov         rdi, rcx                ; in
        mov         rsi, rdx                ; out
        mov         rdx, r8                 ; ivec
        mov         rcx, r9                 ; length in bytes
        mov         r8, [rsp+40]            ; KS
        mov         r9d, [rsp+48]           ; nr
        ; xmm6-xmm15 are non-volatile on Windows: spill xmm6-xmm14.
        ; 8 bytes of padding re-align rsp to 16, then 9 slots of 16 bytes.
        ; (xmm14 is spilled and restored although the code never writes it.)
        sub         rsp, 8+9*16
        movdqa      [rsp+0], xmm6
        movdqa      [rsp+16], xmm7
        movdqa      [rsp+32], xmm8
        movdqa      [rsp+48], xmm9
        movdqa      [rsp+64], xmm10
        movdqa      [rsp+80], xmm11
        movdqa      [rsp+96], xmm12
        movdqa      [rsp+112], xmm13
        movdqa      [rsp+128], xmm14
        ; rcx = number of blocks, a trailing partial block counts as one
        mov         r10, rcx
        shr         rcx, 4
        shl         r10, 60                 ; ZF set when length % 16 == 0
        je          DNO_PARTS_6
        add         rcx, 1
  DNO_PARTS_6:
        ; Fetch the IV first: rdx is not needed as a pointer afterwards and
        ; div overwrites it with the remainder.
        movdqu      xmm7, [rdx]
        ; rcx = blocks / 6, r10 = blocks % 6. div needs rdx:rax, so the
        ; caller's rdi held in rax is kept on the stack for the duration.
        ; r10 serves as the divisor so that no non-volatile register
        ; (rbx, r12-r15) is touched.
        push        rax
        xor         edx, edx
        mov         rax, rcx
        mov         r10d, 6
        div         r10
        mov         rcx, rax
        mov         r10, rdx
        pop         rax                     ; pop leaves the flags alone
        test        rcx, rcx
        je          DREMAINDER_6
        sub         rsi, 96                 ; pre-bias, the loop advances rsi before storing
  DLOOP_6:
        movdqu      xmm1, [rdi]
        movdqu      xmm2, [rdi+16]
        movdqu      xmm3, [rdi+32]
        movdqu      xmm4, [rdi+48]
        movdqu      xmm5, [rdi+64]
        movdqu      xmm6, [rdi+80]
        movdqa      xmm8, [r8]
        movdqa      xmm9, [r8+16]
        movdqa      xmm10, [r8+32]
        movdqa      xmm11, [r8+48]
        pxor        xmm1, xmm8
        pxor        xmm2, xmm8
        pxor        xmm3, xmm8
        pxor        xmm4, xmm8
        pxor        xmm5, xmm8
        pxor        xmm6, xmm8
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm5, xmm9
        aesdec      xmm6, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm5, xmm10
        aesdec      xmm6, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm5, xmm11
        aesdec      xmm6, xmm11
        movdqa      xmm8, [r8+64]
        movdqa      xmm9, [r8+80]
        movdqa      xmm10, [r8+96]
        movdqa      xmm11, [r8+112]
        aesdec      xmm1, xmm8
        aesdec      xmm2, xmm8
        aesdec      xmm3, xmm8
        aesdec      xmm4, xmm8
        aesdec      xmm5, xmm8
        aesdec      xmm6, xmm8
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm5, xmm9
        aesdec      xmm6, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm5, xmm10
        aesdec      xmm6, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm5, xmm11
        aesdec      xmm6, xmm11
        movdqa      xmm8, [r8+128]
        movdqa      xmm9, [r8+144]
        movdqa      xmm10, [r8+160]         ; last key for the 10-round case
        cmp         r9d, 12                 ; flags are consumed by the jb after the rounds
        aesdec      xmm1, xmm8
        aesdec      xmm2, xmm8
        aesdec      xmm3, xmm8
        aesdec      xmm4, xmm8
        aesdec      xmm5, xmm8
        aesdec      xmm6, xmm8
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm5, xmm9
        aesdec      xmm6, xmm9
        jb          DLAST_6
        movdqa      xmm8, [r8+160]
        movdqa      xmm9, [r8+176]
        movdqa      xmm10, [r8+192]         ; last key for the 12-round case
        cmp         r9d, 14
        aesdec      xmm1, xmm8
        aesdec      xmm2, xmm8
        aesdec      xmm3, xmm8
        aesdec      xmm4, xmm8
        aesdec      xmm5, xmm8
        aesdec      xmm6, xmm8
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm5, xmm9
        aesdec      xmm6, xmm9
        jb          DLAST_6
        movdqa      xmm8, [r8+192]
        movdqa      xmm9, [r8+208]
        movdqa      xmm10, [r8+224]         ; last key for the 14-round case
        aesdec      xmm1, xmm8
        aesdec      xmm2, xmm8
        aesdec      xmm3, xmm8
        aesdec      xmm4, xmm8
        aesdec      xmm5, xmm8
        aesdec      xmm6, xmm8
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm5, xmm9
        aesdec      xmm6, xmm9
  DLAST_6:
        add         rsi, 96
        aesdeclast  xmm1, xmm10
        aesdeclast  xmm2, xmm10
        aesdeclast  xmm3, xmm10
        aesdeclast  xmm4, xmm10
        aesdeclast  xmm5, xmm10
        aesdeclast  xmm6, xmm10
        ; re-read the ciphertext (before anything is stored) for the XOR
        movdqu      xmm8, [rdi]
        movdqu      xmm9, [rdi+16]
        movdqu      xmm10, [rdi+32]
        movdqu      xmm11, [rdi+48]
        movdqu      xmm12, [rdi+64]
        movdqu      xmm13, [rdi+80]
        ; undo the chaining: each block XOR the ciphertext block before it
        pxor        xmm1, xmm7
        pxor        xmm2, xmm8
        pxor        xmm3, xmm9
        pxor        xmm4, xmm10
        pxor        xmm5, xmm11
        pxor        xmm6, xmm12
        movdqu      xmm7, xmm13             ; last ciphertext block chains into the next pass
        movdqu      [rsi], xmm1
        movdqu      [rsi+16], xmm2
        movdqu      [rsi+32], xmm3
        movdqu      [rsi+48], xmm4
        movdqu      [rsi+64], xmm5
        movdqu      [rsi+80], xmm6
        add         rdi, 96
        dec         rcx
        jne         DLOOP_6
        add         rsi, 96                 ; step past the group just written
  DREMAINDER_6:
        test        r10, r10
        je          DEND_6
  DLOOP_6_2:
        movdqu      xmm1, [rdi]
        movdqa      xmm10, xmm1             ; keep the ciphertext for the next block
        add         rdi, 16
        pxor        xmm1, [r8]
        movdqu      xmm2, [r8+160]          ; last key for the 10-round case
        cmp         r9d, 12
        aesdec      xmm1, [r8+16]
        aesdec      xmm1, [r8+32]
        aesdec      xmm1, [r8+48]
        aesdec      xmm1, [r8+64]
        aesdec      xmm1, [r8+80]
        aesdec      xmm1, [r8+96]
        aesdec      xmm1, [r8+112]
        aesdec      xmm1, [r8+128]
        aesdec      xmm1, [r8+144]
        jb          DLAST_6_2
        movdqu      xmm2, [r8+192]          ; last key for the 12-round case
        cmp         r9d, 14
        aesdec      xmm1, [r8+160]
        aesdec      xmm1, [r8+176]
        jb          DLAST_6_2
        movdqu      xmm2, [r8+224]          ; last key for the 14-round case
        aesdec      xmm1, [r8+192]
        aesdec      xmm1, [r8+208]
  DLAST_6_2:
        aesdeclast  xmm1, xmm2
        pxor        xmm1, xmm7
        movdqa      xmm7, xmm10
        movdqu      [rsi], xmm1
        add         rsi, 16
        dec         r10
        jne         DLOOP_6_2
  DEND_6:
        ; give the caller back its rdi / rsi
        mov         rdi, rax
        mov         rsi, r11
        ; reload the spilled non-volatile xmm registers and drop the frame
        movdqa      xmm6, [rsp+0]
        movdqa      xmm7, [rsp+16]
        movdqa      xmm8, [rsp+32]
        movdqa      xmm9, [rsp+48]
        movdqa      xmm10, [rsp+64]
        movdqa      xmm11, [rsp+80]
        movdqa      xmm12, [rsp+96]
        movdqa      xmm13, [rsp+112]
        movdqa      xmm14, [rsp+128]
        add         rsp, 8+9*16
        ret
  AES_CBC_decrypt_AESNI_by6 ENDP
  ;-----------------------------------------------------------------------
  ; void AES_CBC_decrypt_AESNI_by8(const unsigned char *in,
  ;                                unsigned char *out,
  ;                                unsigned char ivec[16],
  ;                                unsigned long length,
  ;                                const unsigned char *KS,
  ;                                int nr)
  ;
  ; CBC decryption, eight blocks per main-loop pass; the 0..7 blocks left
  ; over are decrypted one at a time.
  ;
  ; Entry (Microsoft x64): rcx = in, rdx = out, r8 = ivec, r9 = length,
  ;                        [rsp+40] = KS, [rsp+48] = nr
  ; Working registers (layout of the AT&T version of this file):
  ;   rdi = in, rsi = out, rdx = ivec, r8 = KS, r9d = nr
  ;   rcx = groups of eight blocks left, r10 = single blocks left
  ;   xmm1-xmm8   = blocks being decrypted
  ;   xmm9        = previous ciphertext block (initially the IV)
  ;   xmm10-xmm13 = round keys, later ciphertext re-read for the XOR
  ; nr < 12 runs 10 rounds, nr < 14 runs 12 rounds, anything else 14 rounds.
  ; ivec is only read, never written back.
  ;-----------------------------------------------------------------------
  AES_CBC_decrypt_AESNI_by8 PROC
        ; rdi and rsi are non-volatile in this ABI: park them in rax / r11
        mov         rax, rdi
        mov         r11, rsi
        ; shuffle the arguments into the AT&T-version register layout
        mov         rdi, rcx                ; in
        mov         rsi, rdx                ; out
        mov         rdx, r8                 ; ivec
        mov         rcx, r9                 ; length in bytes
        mov         r8, [rsp+40]            ; KS
        mov         r9d, [rsp+48]           ; nr
        ; xmm6-xmm15 are non-volatile on Windows: spill xmm6-xmm13.
        ; 8 bytes of padding re-align rsp to 16, then 8 slots of 16 bytes.
        sub         rsp, 8+8*16
        movdqa      [rsp], xmm6
        movdqa      [rsp+16], xmm7
        movdqa      [rsp+32], xmm8
        movdqa      [rsp+48], xmm9
        movdqa      [rsp+64], xmm10
        movdqa      [rsp+80], xmm11
        movdqa      [rsp+96], xmm12
        movdqa      [rsp+112], xmm13
        ; rcx = number of blocks, a trailing partial block counts as one
        mov         r10, rcx
        shr         rcx, 4
        shl         r10, 60                 ; ZF set when length % 16 == 0
        je          DNO_PARTS_8
        add         rcx, 1
  DNO_PARTS_8:
        ; r10 = blocks % 8, rcx = blocks / 8
        mov         r10, rcx
        shl         r10, 61
        shr         r10, 61
        shr         rcx, 3                  ; ZF from this shift feeds the je below
        movdqu      xmm9, [rdx]             ; chaining value starts as the IV
        je          DREMAINDER_8
        sub         rsi, 128                ; pre-bias, the loop advances rsi before storing
  DLOOP_8:
        movdqu      xmm1, [rdi]
        movdqu      xmm2, [rdi+16]
        movdqu      xmm3, [rdi+32]
        movdqu      xmm4, [rdi+48]
        movdqu      xmm5, [rdi+64]
        movdqu      xmm6, [rdi+80]
        movdqu      xmm7, [rdi+96]
        movdqu      xmm8, [rdi+112]
        movdqa      xmm10, [r8]
        movdqa      xmm11, [r8+16]
        movdqa      xmm12, [r8+32]
        movdqa      xmm13, [r8+48]
        pxor        xmm1, xmm10
        pxor        xmm2, xmm10
        pxor        xmm3, xmm10
        pxor        xmm4, xmm10
        pxor        xmm5, xmm10
        pxor        xmm6, xmm10
        pxor        xmm7, xmm10
        pxor        xmm8, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm5, xmm11
        aesdec      xmm6, xmm11
        aesdec      xmm7, xmm11
        aesdec      xmm8, xmm11
        aesdec      xmm1, xmm12
        aesdec      xmm2, xmm12
        aesdec      xmm3, xmm12
        aesdec      xmm4, xmm12
        aesdec      xmm5, xmm12
        aesdec      xmm6, xmm12
        aesdec      xmm7, xmm12
        aesdec      xmm8, xmm12
        aesdec      xmm1, xmm13
        aesdec      xmm2, xmm13
        aesdec      xmm3, xmm13
        aesdec      xmm4, xmm13
        aesdec      xmm5, xmm13
        aesdec      xmm6, xmm13
        aesdec      xmm7, xmm13
        aesdec      xmm8, xmm13
        movdqa      xmm10, [r8+64]
        movdqa      xmm11, [r8+80]
        movdqa      xmm12, [r8+96]
        movdqa      xmm13, [r8+112]
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm5, xmm10
        aesdec      xmm6, xmm10
        aesdec      xmm7, xmm10
        aesdec      xmm8, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm5, xmm11
        aesdec      xmm6, xmm11
        aesdec      xmm7, xmm11
        aesdec      xmm8, xmm11
        aesdec      xmm1, xmm12
        aesdec      xmm2, xmm12
        aesdec      xmm3, xmm12
        aesdec      xmm4, xmm12
        aesdec      xmm5, xmm12
        aesdec      xmm6, xmm12
        aesdec      xmm7, xmm12
        aesdec      xmm8, xmm12
        aesdec      xmm1, xmm13
        aesdec      xmm2, xmm13
        aesdec      xmm3, xmm13
        aesdec      xmm4, xmm13
        aesdec      xmm5, xmm13
        aesdec      xmm6, xmm13
        aesdec      xmm7, xmm13
        aesdec      xmm8, xmm13
        movdqa      xmm10, [r8+128]
        movdqa      xmm11, [r8+144]
        movdqa      xmm12, [r8+160]         ; last key for the 10-round case
        cmp         r9d, 12                 ; flags are consumed by the jb after the rounds
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm5, xmm10
        aesdec      xmm6, xmm10
        aesdec      xmm7, xmm10
        aesdec      xmm8, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm5, xmm11
        aesdec      xmm6, xmm11
        aesdec      xmm7, xmm11
        aesdec      xmm8, xmm11
        jb          DLAST_8
        movdqa      xmm10, [r8+160]
        movdqa      xmm11, [r8+176]
        movdqa      xmm12, [r8+192]         ; last key for the 12-round case
        cmp         r9d, 14
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm5, xmm10
        aesdec      xmm6, xmm10
        aesdec      xmm7, xmm10
        aesdec      xmm8, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm5, xmm11
        aesdec      xmm6, xmm11
        aesdec      xmm7, xmm11
        aesdec      xmm8, xmm11
        jb          DLAST_8
        movdqa      xmm10, [r8+192]
        movdqa      xmm11, [r8+208]
        movdqa      xmm12, [r8+224]         ; last key for the 14-round case
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm5, xmm10
        aesdec      xmm6, xmm10
        aesdec      xmm7, xmm10
        aesdec      xmm8, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm5, xmm11
        aesdec      xmm6, xmm11
        aesdec      xmm7, xmm11
        aesdec      xmm8, xmm11
  DLAST_8:
        add         rsi, 128
        aesdeclast  xmm1, xmm12
        aesdeclast  xmm2, xmm12
        aesdeclast  xmm3, xmm12
        aesdeclast  xmm4, xmm12
        aesdeclast  xmm5, xmm12
        aesdeclast  xmm6, xmm12
        aesdeclast  xmm7, xmm12
        aesdeclast  xmm8, xmm12
        ; re-read the first four ciphertext blocks (before anything is stored)
        movdqu      xmm10, [rdi]
        movdqu      xmm11, [rdi+16]
        movdqu      xmm12, [rdi+32]
        movdqu      xmm13, [rdi+48]
        ; undo the chaining: each block XOR the ciphertext block before it
        pxor        xmm1, xmm9
        pxor        xmm2, xmm10
        pxor        xmm3, xmm11
        pxor        xmm4, xmm12
        pxor        xmm5, xmm13
        ; remaining ciphertext blocks; the eighth becomes the next chaining value
        movdqu      xmm10, [rdi+64]
        movdqu      xmm11, [rdi+80]
        movdqu      xmm12, [rdi+96]
        movdqu      xmm9, [rdi+112]
        pxor        xmm6, xmm10
        pxor        xmm7, xmm11
        pxor        xmm8, xmm12
        movdqu      [rsi], xmm1
        movdqu      [rsi+16], xmm2
        movdqu      [rsi+32], xmm3
        movdqu      [rsi+48], xmm4
        movdqu      [rsi+64], xmm5
        movdqu      [rsi+80], xmm6
        movdqu      [rsi+96], xmm7
        movdqu      [rsi+112], xmm8
        add         rdi, 128
        dec         rcx
        jne         DLOOP_8
        add         rsi, 128                ; step past the group just written
  DREMAINDER_8:
        test        r10, r10
        je          DEND_8
  DLOOP_8_2:
        movdqu      xmm1, [rdi]
        movdqa      xmm10, xmm1             ; keep the ciphertext for the next block
        add         rdi, 16
        pxor        xmm1, [r8]
        movdqu      xmm2, [r8+160]          ; last key for the 10-round case
        cmp         r9d, 12
        aesdec      xmm1, [r8+16]
        aesdec      xmm1, [r8+32]
        aesdec      xmm1, [r8+48]
        aesdec      xmm1, [r8+64]
        aesdec      xmm1, [r8+80]
        aesdec      xmm1, [r8+96]
        aesdec      xmm1, [r8+112]
        aesdec      xmm1, [r8+128]
        aesdec      xmm1, [r8+144]
        jb          DLAST_8_2
        movdqu      xmm2, [r8+192]          ; last key for the 12-round case
        cmp         r9d, 14
        aesdec      xmm1, [r8+160]
        aesdec      xmm1, [r8+176]
        jb          DLAST_8_2
        movdqu      xmm2, [r8+224]          ; last key for the 14-round case
        aesdec      xmm1, [r8+192]
        aesdec      xmm1, [r8+208]
  DLAST_8_2:
        aesdeclast  xmm1, xmm2
        pxor        xmm1, xmm9
        movdqa      xmm9, xmm10
        movdqu      [rsi], xmm1
        add         rsi, 16
        dec         r10
        jne         DLOOP_8_2
  DEND_8:
        ; give the caller back its rdi / rsi
        mov         rdi, rax
        mov         rsi, r11
        ; reload the spilled non-volatile xmm registers and drop the frame
        movdqa      xmm6, [rsp]
        movdqa      xmm7, [rsp+16]
        movdqa      xmm8, [rsp+32]
        movdqa      xmm9, [rsp+48]
        movdqa      xmm10, [rsp+64]
        movdqa      xmm11, [rsp+80]
        movdqa      xmm12, [rsp+96]
        movdqa      xmm13, [rsp+112]
        add         rsp, 8+8*16
        ret
  AES_CBC_decrypt_AESNI_by8 ENDP
  ;-----------------------------------------------------------------------
  ; void AES_ECB_encrypt_AESNI(const unsigned char* in,
  ;                            unsigned char* out,
  ;                            unsigned long length,
  ;                            const unsigned char* KS,
  ;                            int nr)
  ;
  ; ECB encryption, four blocks per main-loop pass; the 0..3 blocks left
  ; over are encrypted one at a time.
  ;
  ; Entry (Microsoft x64): rcx = in, rdx = out, r8 = length, r9 = KS,
  ;                        [rsp+40] = nr
  ; Working registers (layout of the AT&T version of this file):
  ;   rdi = in, rsi = out, rcx = KS, r8d = nr
  ;   rdx = groups of four blocks left, r10 = single blocks left
  ;   xmm1-xmm4 = blocks being encrypted, xmm9-xmm12 = round keys
  ; nr < 12 runs 10 rounds, nr < 14 runs 12 rounds, anything else 14 rounds.
  ;-----------------------------------------------------------------------
  AES_ECB_encrypt_AESNI PROC
        ; rdi and rsi are non-volatile in this ABI: park them in rax / r11
        mov         rax, rdi
        mov         r11, rsi
        ; shuffle the arguments into the AT&T-version register layout
        mov         rdi, rcx                ; in
        mov         rsi, rdx                ; out
        mov         rdx, r8                 ; length in bytes
        mov         rcx, r9                 ; KS
        mov         r8d, [rsp+40]           ; nr
        ; xmm6-xmm15 are non-volatile on Windows: spill xmm9-xmm12.
        ; 8 bytes of padding re-align rsp to 16, then 4 slots of 16 bytes.
        sub         rsp, 8+4*16
        movdqa      [rsp], xmm9
        movdqa      [rsp+16], xmm10
        movdqa      [rsp+32], xmm11
        movdqa      [rsp+48], xmm12
        ; rdx = number of blocks, a trailing partial block counts as one
        mov         r10, rdx
        shr         rdx, 4
        shl         r10, 60                 ; ZF set when length % 16 == 0
        je          EECB_NO_PARTS_4
        add         rdx, 1
  EECB_NO_PARTS_4:
        ; r10 = blocks % 4, rdx = blocks / 4
        mov         r10, rdx
        shl         r10, 62
        shr         r10, 62
        shr         rdx, 2
        je          EECB_REMAINDER_4
        sub         rsi, 64                 ; pre-bias, the loop advances rsi before storing
  EECB_LOOP_4:
        movdqu      xmm1, [rdi]
        movdqu      xmm2, [rdi+16]
        movdqu      xmm3, [rdi+32]
        movdqu      xmm4, [rdi+48]
        movdqa      xmm9, [rcx]
        movdqa      xmm10, [rcx+16]
        movdqa      xmm11, [rcx+32]
        movdqa      xmm12, [rcx+48]
        pxor        xmm1, xmm9
        pxor        xmm2, xmm9
        pxor        xmm3, xmm9
        pxor        xmm4, xmm9
        aesenc      xmm1, xmm10
        aesenc      xmm2, xmm10
        aesenc      xmm3, xmm10
        aesenc      xmm4, xmm10
        aesenc      xmm1, xmm11
        aesenc      xmm2, xmm11
        aesenc      xmm3, xmm11
        aesenc      xmm4, xmm11
        aesenc      xmm1, xmm12
        aesenc      xmm2, xmm12
        aesenc      xmm3, xmm12
        aesenc      xmm4, xmm12
        movdqa      xmm9, [rcx+64]
        movdqa      xmm10, [rcx+80]
        movdqa      xmm11, [rcx+96]
        movdqa      xmm12, [rcx+112]
        aesenc      xmm1, xmm9
        aesenc      xmm2, xmm9
        aesenc      xmm3, xmm9
        aesenc      xmm4, xmm9
        aesenc      xmm1, xmm10
        aesenc      xmm2, xmm10
        aesenc      xmm3, xmm10
        aesenc      xmm4, xmm10
        aesenc      xmm1, xmm11
        aesenc      xmm2, xmm11
        aesenc      xmm3, xmm11
        aesenc      xmm4, xmm11
        aesenc      xmm1, xmm12
        aesenc      xmm2, xmm12
        aesenc      xmm3, xmm12
        aesenc      xmm4, xmm12
        movdqa      xmm9, [rcx+128]
        movdqa      xmm10, [rcx+144]
        movdqa      xmm11, [rcx+160]        ; last key for the 10-round case
        cmp         r8d, 12                 ; flags are consumed by the jb after the rounds
        aesenc      xmm1, xmm9
        aesenc      xmm2, xmm9
        aesenc      xmm3, xmm9
        aesenc      xmm4, xmm9
        aesenc      xmm1, xmm10
        aesenc      xmm2, xmm10
        aesenc      xmm3, xmm10
        aesenc      xmm4, xmm10
        jb          EECB_LAST_4
        movdqa      xmm9, [rcx+160]
        movdqa      xmm10, [rcx+176]
        movdqa      xmm11, [rcx+192]        ; last key for the 12-round case
        cmp         r8d, 14
        aesenc      xmm1, xmm9
        aesenc      xmm2, xmm9
        aesenc      xmm3, xmm9
        aesenc      xmm4, xmm9
        aesenc      xmm1, xmm10
        aesenc      xmm2, xmm10
        aesenc      xmm3, xmm10
        aesenc      xmm4, xmm10
        jb          EECB_LAST_4
        movdqa      xmm9, [rcx+192]
        movdqa      xmm10, [rcx+208]
        movdqa      xmm11, [rcx+224]        ; last key for the 14-round case
        aesenc      xmm1, xmm9
        aesenc      xmm2, xmm9
        aesenc      xmm3, xmm9
        aesenc      xmm4, xmm9
        aesenc      xmm1, xmm10
        aesenc      xmm2, xmm10
        aesenc      xmm3, xmm10
        aesenc      xmm4, xmm10
  EECB_LAST_4:
        add         rdi, 64
        add         rsi, 64
        dec         rdx                     ; sets ZF for the jne at the end of the pass
        aesenclast  xmm1, xmm11
        aesenclast  xmm2, xmm11
        aesenclast  xmm3, xmm11
        aesenclast  xmm4, xmm11
        movdqu      [rsi], xmm1
        movdqu      [rsi+16], xmm2
        movdqu      [rsi+32], xmm3
        movdqu      [rsi+48], xmm4
        jne         EECB_LOOP_4
        add         rsi, 64                 ; step past the group just written
  EECB_REMAINDER_4:
        test        r10, r10
        je          EECB_END_4
  EECB_LOOP_4_2:
        movdqu      xmm1, [rdi]
        add         rdi, 16
        pxor        xmm1, [rcx]
        movdqu      xmm2, [rcx+160]         ; last key for the 10-round case
        aesenc      xmm1, [rcx+16]
        aesenc      xmm1, [rcx+32]
        aesenc      xmm1, [rcx+48]
        aesenc      xmm1, [rcx+64]
        aesenc      xmm1, [rcx+80]
        aesenc      xmm1, [rcx+96]
        aesenc      xmm1, [rcx+112]
        aesenc      xmm1, [rcx+128]
        aesenc      xmm1, [rcx+144]
        cmp         r8d, 12
        jb          EECB_LAST_4_2
        movdqu      xmm2, [rcx+192]         ; last key for the 12-round case
        aesenc      xmm1, [rcx+160]
        aesenc      xmm1, [rcx+176]
        cmp         r8d, 14
        jb          EECB_LAST_4_2
        movdqu      xmm2, [rcx+224]         ; last key for the 14-round case
        aesenc      xmm1, [rcx+192]
        aesenc      xmm1, [rcx+208]
  EECB_LAST_4_2:
        aesenclast  xmm1, xmm2
        movdqu      [rsi], xmm1
        add         rsi, 16
        dec         r10
        jne         EECB_LOOP_4_2
  EECB_END_4:
        ; give the caller back its rdi / rsi
        mov         rdi, rax
        mov         rsi, r11
        ; reload the spilled non-volatile xmm registers and drop the frame
        movdqa      xmm9, [rsp]
        movdqa      xmm10, [rsp+16]
        movdqa      xmm11, [rsp+32]
        movdqa      xmm12, [rsp+48]
        add         rsp, 8+4*16
        ret
  AES_ECB_encrypt_AESNI ENDP
  ;-----------------------------------------------------------------------
  ; void AES_ECB_decrypt_AESNI(const unsigned char* in,
  ;                            unsigned char* out,
  ;                            unsigned long length,
  ;                            const unsigned char* KS,
  ;                            int nr)
  ;
  ; ECB decryption, four blocks per main-loop pass; the 0..3 blocks left
  ; over are decrypted one at a time.
  ;
  ; Entry (Microsoft x64): rcx = in, rdx = out, r8 = length, r9 = KS,
  ;                        [rsp+40] = nr
  ; Working registers (layout of the AT&T version of this file):
  ;   rdi = in, rsi = out, rcx = KS, r8d = nr
  ;   rdx = groups of four blocks left, r10 = single blocks left
  ;   xmm1-xmm4 = blocks being decrypted, xmm9-xmm12 = round keys
  ; nr < 12 runs 10 rounds, nr < 14 runs 12 rounds, anything else 14 rounds.
  ;-----------------------------------------------------------------------
  AES_ECB_decrypt_AESNI PROC
        ; rdi and rsi are non-volatile in this ABI: park them in rax / r11
        mov         rax, rdi
        mov         r11, rsi
        ; shuffle the arguments into the AT&T-version register layout
        mov         rdi, rcx                ; in
        mov         rsi, rdx                ; out
        mov         rdx, r8                 ; length in bytes
        mov         rcx, r9                 ; KS
        mov         r8d, [rsp+40]           ; nr
        ; xmm6-xmm15 are non-volatile on Windows: spill xmm9-xmm12.
        ; 8 bytes of padding re-align rsp to 16, then 4 slots of 16 bytes.
        sub         rsp, 8+4*16
        movdqa      [rsp], xmm9
        movdqa      [rsp+16], xmm10
        movdqa      [rsp+32], xmm11
        movdqa      [rsp+48], xmm12
        ; rdx = number of blocks, a trailing partial block counts as one
        mov         r10, rdx
        shr         rdx, 4
        shl         r10, 60                 ; ZF set when length % 16 == 0
        je          DECB_NO_PARTS_4
        add         rdx, 1
  DECB_NO_PARTS_4:
        ; r10 = blocks % 4, rdx = blocks / 4
        mov         r10, rdx
        shl         r10, 62
        shr         r10, 62
        shr         rdx, 2
        je          DECB_REMAINDER_4
        sub         rsi, 64                 ; pre-bias, the loop advances rsi before storing
  DECB_LOOP_4:
        movdqu      xmm1, [rdi]
        movdqu      xmm2, [rdi+16]
        movdqu      xmm3, [rdi+32]
        movdqu      xmm4, [rdi+48]
        movdqa      xmm9, [rcx]
        movdqa      xmm10, [rcx+16]
        movdqa      xmm11, [rcx+32]
        movdqa      xmm12, [rcx+48]
        pxor        xmm1, xmm9
        pxor        xmm2, xmm9
        pxor        xmm3, xmm9
        pxor        xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm1, xmm12
        aesdec      xmm2, xmm12
        aesdec      xmm3, xmm12
        aesdec      xmm4, xmm12
        movdqa      xmm9, [rcx+64]
        movdqa      xmm10, [rcx+80]
        movdqa      xmm11, [rcx+96]
        movdqa      xmm12, [rcx+112]
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        aesdec      xmm1, xmm11
        aesdec      xmm2, xmm11
        aesdec      xmm3, xmm11
        aesdec      xmm4, xmm11
        aesdec      xmm1, xmm12
        aesdec      xmm2, xmm12
        aesdec      xmm3, xmm12
        aesdec      xmm4, xmm12
        movdqa      xmm9, [rcx+128]
        movdqa      xmm10, [rcx+144]
        movdqa      xmm11, [rcx+160]        ; last key for the 10-round case
        cmp         r8d, 12                 ; flags are consumed by the jb after the rounds
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        jb          DECB_LAST_4
        movdqa      xmm9, [rcx+160]
        movdqa      xmm10, [rcx+176]
        movdqa      xmm11, [rcx+192]        ; last key for the 12-round case
        cmp         r8d, 14
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
        jb          DECB_LAST_4
        movdqa      xmm9, [rcx+192]
        movdqa      xmm10, [rcx+208]
        movdqa      xmm11, [rcx+224]        ; last key for the 14-round case
        aesdec      xmm1, xmm9
        aesdec      xmm2, xmm9
        aesdec      xmm3, xmm9
        aesdec      xmm4, xmm9
        aesdec      xmm1, xmm10
        aesdec      xmm2, xmm10
        aesdec      xmm3, xmm10
        aesdec      xmm4, xmm10
  DECB_LAST_4:
        add         rdi, 64
        add         rsi, 64
        dec         rdx                     ; sets ZF for the jne at the end of the pass
        aesdeclast  xmm1, xmm11
        aesdeclast  xmm2, xmm11
        aesdeclast  xmm3, xmm11
        aesdeclast  xmm4, xmm11
        movdqu      [rsi], xmm1
        movdqu      [rsi+16], xmm2
        movdqu      [rsi+32], xmm3
        movdqu      [rsi+48], xmm4
        jne         DECB_LOOP_4
        add         rsi, 64                 ; step past the group just written
  DECB_REMAINDER_4:
        test        r10, r10
        je          DECB_END_4
  DECB_LOOP_4_2:
        movdqu      xmm1, [rdi]
        add         rdi, 16
        pxor        xmm1, [rcx]
        movdqu      xmm2, [rcx+160]         ; last key for the 10-round case
        cmp         r8d, 12
        aesdec      xmm1, [rcx+16]
        aesdec      xmm1, [rcx+32]
        aesdec      xmm1, [rcx+48]
        aesdec      xmm1, [rcx+64]
        aesdec      xmm1, [rcx+80]
        aesdec      xmm1, [rcx+96]
        aesdec      xmm1, [rcx+112]
        aesdec      xmm1, [rcx+128]
        aesdec      xmm1, [rcx+144]
        jb          DECB_LAST_4_2
        cmp         r8d, 14
        movdqu      xmm2, [rcx+192]         ; last key for the 12-round case
        aesdec      xmm1, [rcx+160]
        aesdec      xmm1, [rcx+176]
        jb          DECB_LAST_4_2
        movdqu      xmm2, [rcx+224]         ; last key for the 14-round case
        aesdec      xmm1, [rcx+192]
        aesdec      xmm1, [rcx+208]
  DECB_LAST_4_2:
        aesdeclast  xmm1, xmm2
        movdqu      [rsi], xmm1
        add         rsi, 16
        dec         r10
        jne         DECB_LOOP_4_2
  DECB_END_4:
        ; give the caller back its rdi / rsi
        mov         rdi, rax
        mov         rsi, r11
        ; reload the spilled non-volatile xmm registers and drop the frame
        movdqa      xmm9, [rsp]
        movdqa      xmm10, [rsp+16]
        movdqa      xmm11, [rsp+32]
        movdqa      xmm12, [rsp+48]
        add         rsp, 8+4*16
        ret
  AES_ECB_decrypt_AESNI ENDP
  ;-----------------------------------------------------------------------
  ; void AES_128_Key_Expansion_AESNI(const unsigned char* userkey,
  ;                                  unsigned char* key_schedule)
  ;
  ; Reads 16 bytes of user key and writes 11 round keys (176 bytes) to
  ; key_schedule at offsets 0, 16, ..., 160. Also stores the dword 10 at
  ; key_schedule+240 (presumably the round-count field read by the cipher
  ; routines - confirm against the C-side key structure).
  ;
  ; In:    rcx = userkey       (read with movdqu, any alignment)
  ;        rdx = key_schedule  (written with movdqa, must be 16-byte aligned)
  ; Clobb: rax, r11, xmm1, xmm2, xmm3
  ;        rdi and rsi are used internally but restored before returning.
  ;-----------------------------------------------------------------------

  ; One schedule step: derive the next round key from the current one in
  ; xmm1 using round constant `rcon`, then store it at key_schedule+rk_off.
  EXPAND_RK128 MACRO rcon, rk_off
          aeskeygenassist xmm2, xmm1, rcon
          call    PREPARE_ROUNDKEY_128
          movdqa  [rsi + rk_off], xmm1
  ENDM

  AES_128_Key_Expansion_AESNI PROC
          ; rdi/rsi must be preserved for the caller; park them in the
          ; volatile registers rax and r11 for the duration of the routine.
          mov     rax, rdi
          mov     r11, rsi

          ; Move the incoming arguments into the registers the body uses.
          mov     rdi, rcx                    ; rdi = userkey
          mov     rsi, rdx                    ; rsi = key_schedule

          mov     dword ptr [rsi + 240], 10
          movdqu  xmm1, [rdi]
          movdqa  [rsi], xmm1                 ; round key 0 is the user key

          EXPAND_RK128 01h, 16
          EXPAND_RK128 02h, 32
          EXPAND_RK128 04h, 48
          EXPAND_RK128 08h, 64
          EXPAND_RK128 10h, 80
          EXPAND_RK128 20h, 96
          EXPAND_RK128 40h, 112
          EXPAND_RK128 80h, 128
          EXPAND_RK128 1bh, 144
          EXPAND_RK128 36h, 160

          ; Hand rdi/rsi back to the caller untouched.
          mov     rdi, rax
          mov     rsi, r11
          ret

  ; Local helper (reached only via call from the macro above).
  ; In:  xmm1 = current round key, xmm2 = aeskeygenassist output
  ; Out: xmm1 = next round key; xmm2 and xmm3 are overwritten.
  PREPARE_ROUNDKEY_128:
          pshufd  xmm2, xmm2, 0ffh            ; broadcast dword 3 of xmm2
          movdqa  xmm3, xmm1
          pslldq  xmm3, 4
          pxor    xmm1, xmm3                  ; xmm1 ^= key << 32
          pslldq  xmm3, 4
          pxor    xmm1, xmm3                  ; xmm1 ^= key << 64
          pslldq  xmm3, 4
          pxor    xmm1, xmm3                  ; xmm1 ^= key << 96
          pxor    xmm1, xmm2
          ret
  AES_128_Key_Expansion_AESNI ENDP
  ;-----------------------------------------------------------------------
  ; void AES_192_Key_Expansion_AESNI(const unsigned char* userkey,
  ;                                  unsigned char* key)
  ;
  ; Reads 24 bytes of user key (16 via movdqu + 8 via movq) and writes
  ; fourteen 16-byte blocks to key at offsets 0, 16, ..., 208, so the
  ; destination must provide at least 224 bytes. No round count is stored
  ; by this routine.
  ;
  ; In:    rcx = userkey  (any alignment)
  ;        rdx = key      (written with movdqa, must be 16-byte aligned)
  ; Clobb: rax, r11, xmm1, xmm2, xmm3, xmm4, xmm5
  ;        rdi, rsi and xmm6 are used internally but restored before ret.
  ; Stack: 24 bytes (16 for the xmm6 save slot + 8 so that slot is aligned).
  ;-----------------------------------------------------------------------

  ; Two consecutive schedule steps. The 192-bit state lives in xmm1 (128
  ; bits) and the low qword of xmm3, so the first step's output straddles
  ; two 16-byte slots and is stitched together with shufpd. Stores three
  ; blocks at key+rk_off, key+rk_off+16 and key+rk_off+32.
  EXPAND_RK192_PAIR MACRO rcon_a, rcon_b, rk_off
          movdqa  xmm5, xmm3                  ; keep pre-step xmm3
          aeskeygenassist xmm2, xmm3, rcon_a
          call    PREPARE_ROUNDKEY_192
          shufpd  xmm5, xmm1, 0               ; old xmm3.lo : new xmm1.lo
          movdqa  [rsi + rk_off], xmm5
          movdqa  xmm6, xmm1
          shufpd  xmm6, xmm3, 1               ; new xmm1.hi : new xmm3.lo
          movdqa  [rsi + rk_off + 16], xmm6
          aeskeygenassist xmm2, xmm3, rcon_b
          call    PREPARE_ROUNDKEY_192
          movdqa  [rsi + rk_off + 32], xmm1
  ENDM

  AES_192_Key_Expansion_AESNI PROC
          ; rdi/rsi must be preserved for the caller; park them in the
          ; volatile registers rax and r11 for the duration of the routine.
          mov     rax, rdi
          mov     r11, rsi

          ; Move the incoming arguments into the registers the body uses.
          mov     rdi, rcx                    ; rdi = userkey
          mov     rsi, rdx                    ; rsi = key

          ; xmm6 is non-volatile under the Microsoft convention: spill it.
          ; 16 bytes for the register plus 8 to keep [rsp] 16-byte aligned.
          sub     rsp, 16 + 8
          movdqa  [rsp], xmm6

          movdqu  xmm1, [rdi]                 ; key bytes 0..15
          movq    xmm3, qword ptr [rdi + 16]  ; key bytes 16..23
          movdqa  [rsi], xmm1

          EXPAND_RK192_PAIR 01h, 02h, 16
          EXPAND_RK192_PAIR 04h, 08h, 64
          EXPAND_RK192_PAIR 10h, 20h, 112
          EXPAND_RK192_PAIR 40h, 80h, 160

          movdqa  [rsi + 208], xmm3

          ; Hand rdi/rsi back to the caller untouched.
          mov     rdi, rax
          mov     rsi, r11

          ; Reload the caller's xmm6 and release the spill area.
          movdqa  xmm6, [rsp]
          add     rsp, 16 + 8
          ret

  ; Local helper (reached only via call from the macro above).
  ; In:  xmm1, xmm3 = current key state, xmm2 = aeskeygenassist output
  ; Out: xmm1, xmm3 = next key state; xmm2 and xmm4 are overwritten.
  PREPARE_ROUNDKEY_192:
          pshufd  xmm2, xmm2, 55h             ; broadcast dword 1 of xmm2
          movdqu  xmm4, xmm1
          pslldq  xmm4, 4
          pxor    xmm1, xmm4                  ; xmm1 ^= xmm1 << 32
          pslldq  xmm4, 4
          pxor    xmm1, xmm4                  ; xmm1 ^= xmm1 << 64
          pslldq  xmm4, 4
          pxor    xmm1, xmm4                  ; xmm1 ^= xmm1 << 96
          pxor    xmm1, xmm2
          pshufd  xmm2, xmm1, 0ffh            ; broadcast dword 3 of new xmm1
          movdqu  xmm4, xmm3
          pslldq  xmm4, 4
          pxor    xmm3, xmm4                  ; xmm3 ^= xmm3 << 32
          pxor    xmm3, xmm2
          ret
  AES_192_Key_Expansion_AESNI ENDP
  ;-----------------------------------------------------------------------
  ; void AES_256_Key_Expansion_AESNI(const unsigned char* userkey,
  ;                                  unsigned char* key)
  ;
  ; Reads 32 bytes of user key and writes 15 round keys (240 bytes) to key
  ; at offsets 0, 16, ..., 224. No round count is stored by this routine.
  ;
  ; In:    rcx = userkey  (read with movdqu, any alignment)
  ;        rdx = key      (written with movdqa, must be 16-byte aligned)
  ; Clobb: rax, r11, xmm1, xmm2, xmm3, xmm4
  ;        rdi and rsi are used internally but restored before returning.
  ;-----------------------------------------------------------------------

  ; Two consecutive schedule steps: an "a" step that updates xmm1 using
  ; round constant `rcon`, then a "b" step that updates xmm3 (constant 0).
  ; Stores the results at key+rk_off and key+rk_off+16.
  EXPAND_RK256_PAIR MACRO rcon, rk_off
          aeskeygenassist xmm2, xmm3, rcon
          call    MAKE_RK256_a
          movdqa  [rsi + rk_off], xmm1
          aeskeygenassist xmm2, xmm1, 00h
          call    MAKE_RK256_b
          movdqa  [rsi + rk_off + 16], xmm3
  ENDM

  AES_256_Key_Expansion_AESNI PROC
          ; rdi/rsi must be preserved for the caller; park them in the
          ; volatile registers rax and r11 for the duration of the routine.
          mov     rax, rdi
          mov     r11, rsi

          ; Move the incoming arguments into the registers the body uses.
          mov     rdi, rcx                    ; rdi = userkey
          mov     rsi, rdx                    ; rsi = key

          ; The first two round keys are the user key itself.
          movdqu  xmm1, [rdi]
          movdqu  xmm3, [rdi + 16]
          movdqa  [rsi], xmm1
          movdqa  [rsi + 16], xmm3

          EXPAND_RK256_PAIR 01h, 32
          EXPAND_RK256_PAIR 02h, 64
          EXPAND_RK256_PAIR 04h, 96
          EXPAND_RK256_PAIR 08h, 128
          EXPAND_RK256_PAIR 10h, 160
          EXPAND_RK256_PAIR 20h, 192

          ; Final round key only needs an "a" step.
          aeskeygenassist xmm2, xmm3, 40h
          call    MAKE_RK256_a
          movdqa  [rsi + 224], xmm1

          ; Hand rdi/rsi back to the caller untouched.
          mov     rdi, rax
          mov     rsi, r11
          ret
  AES_256_Key_Expansion_AESNI ENDP

  ; Helper for the "a" step (defined outside the PROC, at module scope).
  ; In:  xmm1 = key half to update, xmm2 = aeskeygenassist output
  ; Out: xmm1 = updated half; xmm2 and xmm4 are overwritten.
  MAKE_RK256_a:
          pshufd  xmm2, xmm2, 0ffh            ; broadcast dword 3 of xmm2
          movdqa  xmm4, xmm1
          pslldq  xmm4, 4
          pxor    xmm1, xmm4                  ; xmm1 ^= xmm1 << 32
          pslldq  xmm4, 4
          pxor    xmm1, xmm4                  ; xmm1 ^= xmm1 << 64
          pslldq  xmm4, 4
          pxor    xmm1, xmm4                  ; xmm1 ^= xmm1 << 96
          pxor    xmm1, xmm2
          ret

  ; Helper for the "b" step (defined outside the PROC, at module scope).
  ; In:  xmm3 = key half to update, xmm2 = aeskeygenassist output
  ; Out: xmm3 = updated half; xmm2 and xmm4 are overwritten.
  MAKE_RK256_b:
          pshufd  xmm2, xmm2, 0aah            ; broadcast dword 2 of xmm2
          movdqa  xmm4, xmm3
          pslldq  xmm4, 4
          pxor    xmm3, xmm4                  ; xmm3 ^= xmm3 << 32
          pslldq  xmm4, 4
          pxor    xmm3, xmm4                  ; xmm3 ^= xmm3 << 64
          pslldq  xmm4, 4
          pxor    xmm3, xmm4                  ; xmm3 ^= xmm3 << 96
          pxor    xmm3, xmm2
          ret
  1450. IF fips_version GE 2 ; same condition that selected the segment at the top of the file
  1451. fipsAh ENDS ; close the fipsAh code segment opened for fips_version >= 2
  1452. ELSE
  1453. _text ENDS ; otherwise close the default _text segment
  1454. ENDIF
  1455. END ; end of the assembly module