2
0

aesni-mb-x86_64.pl 37 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # Multi-buffer AES-NI procedures process several independent buffers
  15. # in parallel by interleaving independent instructions.
  16. #
  17. # Cycles per byte for interleave factor 4:
  18. #
  19. # asymptotic measured
  20. # ---------------------------
  21. # Westmere 5.00/4=1.25 5.13/4=1.28
  22. # Atom 15.0/4=3.75 ?15.7/4=3.93
  23. # Sandy Bridge 5.06/4=1.27 5.18/4=1.29
  24. # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
  25. # Haswell 4.44/4=1.11 4.44/4=1.11
  26. # Bulldozer 5.75/4=1.44 5.76/4=1.44
  27. #
  28. # Cycles per byte for interleave factor 8 (not implemented for
  29. # pre-AVX processors, where higher interleave factor incidentally
  30. # doesn't result in improvement):
  31. #
  32. # asymptotic measured
  33. # ---------------------------
  34. # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
  35. # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
  36. # Haswell 5.00/8=0.63 5.00/8=0.63
  37. # Bulldozer 5.75/8=0.72 5.77/8=0.72
  38. #
  39. # (*) Sandy/Ivy Bridge are known to handle high interleave factors
  40. # suboptimally.
  # Command line: an optional perlasm flavour followed by the output file name.
  # A first argument containing a dot is taken to be the output file itself,
  # in which case the flavour is left undefined.
  41. $flavour = shift;
  42. $output = shift;
  43. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  # Win64 code paths are selected by a nasm/masm/mingw64 flavour or a .asm output name.
  44. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # Locate x86_64-xlate.pl: first beside this script, then in ../../perlasm.
  45. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  46. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  47. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  48. die "can't locate x86_64-xlate.pl";
  # $avx becomes 0, 1 or 2 depending on the version reported by the first
  # tool that answers below; any non-zero value enables the AVX code paths
  # generated later in this file.
  49. $avx=0;
  # GNU assembler, probed through $ENV{CC}: 2.19 and 2.22 are the thresholds.
  50. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  51. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  52. $avx = ($1>=2.19) + ($1>=2.22);
  53. }
  # NASM (Win64 only): 2.09 and 2.10 are the thresholds.
  54. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  55. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  56. $avx = ($1>=2.09) + ($1>=2.10);
  57. }
  # ml64 (Win64 only): major versions 10 and 11 are the thresholds.
  58. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  59. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  60. $avx = ($1>=10) + ($1>=11);
  61. }
  # clang/LLVM, probed through $ENV{CC}: 3.0 gives 1, anything newer gives 2.
  62. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  63. $avx = ($2>=3.0) + ($2>3.0);
  64. }
  # Everything printed to STDOUT from here on is piped through the translator.
  # Abort with a diagnostic if the pipe cannot be set up, instead of carrying
  # on and silently producing no output.
  65. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
  66. *STDOUT=*OUT;
  67. # void aesni_multi_cbc_encrypt (
  68. # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
  69. # const AES_KEY *key,
  70. # int num); /* 1 or 2 */
  71. #
  # Symbolic names for the registers used by the 4-way (non-AVX) code.
  72. $inp = '%rdi'; # 1st arg
  73. $key = '%rsi'; # 2nd arg
  74. $num = '%edx'; # 3rd arg
  75. @inptr = map { "%r$_" } 8 .. 11; # one input pointer per stream
  76. @outptr = map { "%r$_" } 12 .. 15; # one output pointer per stream
  77. ($rndkey0, $rndkey1) = map { "%xmm$_" } 0, 1; # two alternating round-key registers
  78. @out = map { "%xmm$_" } 2 .. 5; # per-stream cipher state
  79. @inp = map { "%xmm$_" } 6 .. 9; # per-stream input block (decrypt: IV)
  80. ($counters, $mask, $zero) = map { "%xmm$_" } 10 .. 12;
  81. ($rounds, $one, $sink, $offset) = qw(%eax %ecx %rbp %rbx);
  # Start of the generated text: the global entry point aesni_multi_cbc_encrypt.
  82. $code.=<<___;
  83. .text
  84. .extern OPENSSL_ia32cap_P
  85. .globl aesni_multi_cbc_encrypt
  86. .type aesni_multi_cbc_encrypt,\@function,3
  87. .align 32
  88. aesni_multi_cbc_encrypt:
  89. .cfi_startproc
  90. ___
  # Emitted only when $avx is non-zero: a call with $num >= 2 jumps to
  # _avx_cbc_enc_shortcut if the AVX bit (bit 28 of the word at
  # OPENSSL_ia32cap_P+4) is set; every other call continues at .Lenc_non_avx.
  91. $code.=<<___ if ($avx);
  92. cmp \$2,$num
  93. jb .Lenc_non_avx
  94. mov OPENSSL_ia32cap_P+4(%rip),%ecx
  95. test \$`1<<28`,%ecx # AVX bit
  96. jnz _avx_cbc_enc_shortcut
  97. jmp .Lenc_non_avx
  98. .align 16
  99. .Lenc_non_avx:
  100. ___
  # Keep the incoming %rsp in %rax and push %rbx, %rbp and %r12-%r15, which
  # this routine uses as $offset, $sink and @outptr.
  101. $code.=<<___;
  102. mov %rsp,%rax
  103. .cfi_def_cfa_register %rax
  104. push %rbx
  105. .cfi_push %rbx
  106. push %rbp
  107. .cfi_push %rbp
  108. push %r12
  109. .cfi_push %r12
  110. push %r13
  111. .cfi_push %r13
  112. push %r14
  113. .cfi_push %r14
  114. push %r15
  115. .cfi_push %r15
  116. ___
  # Win64 builds additionally spill %xmm6-%xmm15 below the pushed registers.
  117. $code.=<<___ if ($win64);
  118. lea -0xa8(%rsp),%rsp
  119. movaps %xmm6,(%rsp)
  120. movaps %xmm7,0x10(%rsp)
  121. movaps %xmm8,0x20(%rsp)
  122. movaps %xmm9,0x30(%rsp)
  123. movaps %xmm10,0x40(%rsp)
  124. movaps %xmm11,0x50(%rsp)
  125. movaps %xmm12,0x60(%rsp)
  126. movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
  127. movaps %xmm14,-0x58(%rax)
  128. movaps %xmm15,-0x48(%rax)
  129. ___
  # Reserve and 64-byte-align the scratch frame, remember the original %rsp
  # at 16(%rsp), load the 0-round key into $zero, and bias $key by 0x78 and
  # $inp by 40*2 so that later operands are written relative to those biases.
  # .Lenc4x_loop_grande then saves $num at 24(%rsp) and clears it.
  130. $code.=<<___;
  131. # stack layout
  132. #
  133. # +0 output sink
  134. # +16 input sink [original %rsp and $num]
  135. # +32 counters
  136. sub \$48,%rsp
  137. and \$-64,%rsp
  138. mov %rax,16(%rsp) # original %rsp
  139. .cfi_cfa_expression %rsp+16,deref,+8
  140. .Lenc4x_body:
  141. movdqu ($key),$zero # 0-round key
  142. lea 0x78($key),$key # size optimization
  143. lea 40*2($inp),$inp
  144. .Lenc4x_loop_grande:
  145. mov $num,24(%rsp) # original $num
  146. xor $num,$num
  147. ___
  # For each of the four streams, read its 40-byte descriptor: block count
  # (offset 16), input pointer (offset 0), output pointer (offset 8) and IV
  # (offset 24, loaded into @out[$i]). $num accumulates the largest block
  # count, each count is stored at 32+4*$i(%rsp), and a stream whose count
  # is <= 0 has its input pointer replaced by %rsp.
  148. for($i=0;$i<4;$i++) {
  149. $code.=<<___;
  150. mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
  151. mov `40*$i+0-40*2`($inp),@inptr[$i]
  152. cmp $num,$one
  153. mov `40*$i+8-40*2`($inp),@outptr[$i]
  154. cmovg $one,$num # find maximum
  155. test $one,$one
  156. movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
  157. mov $one,`32+4*$i`(%rsp) # initialize counters
  158. cmovle %rsp,@inptr[$i] # cancel input
  159. ___
  160. }
  # Leave early when no stream has any blocks. Otherwise load the first two
  # round keys and the round count, fold the 0-round key and the first input
  # block of every stream into the IVs held in @out, and enter .Loop_enc4x.
  # The loop opens with one AES round interleaved with prefetches of the
  # next input block of each of the four streams (@inptr[0] to @inptr[3]).
  161. $code.=<<___;
  162. test $num,$num
  163. jz .Lenc4x_done
  164. movups 0x10-0x78($key),$rndkey1
  165. pxor $zero,@out[0]
  166. movups 0x20-0x78($key),$rndkey0
  167. pxor $zero,@out[1]
  168. mov 0xf0-0x78($key),$rounds
  169. pxor $zero,@out[2]
  170. movdqu (@inptr[0]),@inp[0] # load inputs
  171. pxor $zero,@out[3]
  172. movdqu (@inptr[1]),@inp[1]
  173. pxor @inp[0],@out[0]
  174. movdqu (@inptr[2]),@inp[2]
  175. pxor @inp[1],@out[1]
  176. movdqu (@inptr[3]),@inp[3]
  177. pxor @inp[2],@out[2]
  178. pxor @inp[3],@out[3]
  179. movdqa 32(%rsp),$counters # load counters
  180. xor $offset,$offset
  181. jmp .Loop_enc4x
  182. .align 32
  183. .Loop_enc4x:
  184. add \$16,$offset
  185. lea 16(%rsp),$sink # sink pointer
  186. mov \$1,$one # constant of 1
  187. sub $offset,$sink
  188. aesenc $rndkey1,@out[0]
  189. prefetcht0 31(@inptr[0],$offset) # prefetch input
  190. prefetcht0 31(@inptr[1],$offset)
  191. aesenc $rndkey1,@out[1]
  192. prefetcht0 31(@inptr[2],$offset)
  193. prefetcht0 31(@inptr[3],$offset)
  194. aesenc $rndkey1,@out[2]
  195. aesenc $rndkey1,@out[3]
  196. movups 0x30-0x78($key),$rndkey1
  197. ___
  # Four further AES rounds on all four streams. Iteration $i also compares
  # the constant 1 in $one with stream $i's counter at 32+4*$i(%rsp): the
  # input pointer is redirected to the sink when 1 >= counter, the output
  # pointer when 1 > counter.
  198. for($i=0;$i<4;$i++) {
  199. my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; # alternate between the two key registers
  200. $code.=<<___;
  201. cmp `32+4*$i`(%rsp),$one
  202. aesenc $rndkey,@out[0]
  203. aesenc $rndkey,@out[1]
  204. aesenc $rndkey,@out[2]
  205. cmovge $sink,@inptr[$i] # cancel input
  206. cmovg $sink,@outptr[$i] # sink output
  207. aesenc $rndkey,@out[3]
  208. movups `0x40+16*$i-0x78`($key),$rndkey
  209. ___
  210. }
  # Rest of the .Loop_enc4x body. The middle rounds are interleaved with
  # output prefetches and with the counter update: a mask derived from the
  # counters by pcmpgtd against zero is added back to them and the result
  # is stored at 32(%rsp). $rounds is compared with 11 to choose how many
  # extra rounds run before .Lenc4x_tail. The tail performs the final round,
  # loads the next input blocks (XORed with the 0-round key), stores the
  # finished blocks at -16(@outptr[i],$offset) and XORs the next inputs into
  # @out before looping while $num is non-zero. Afterwards the saved $num is
  # reloaded from 24(%rsp), $inp advances by 40*4, and the outer loop repeats
  # until that count reaches zero. The IV write-back is present only as
  # commented-out assembly.
  211. $code.=<<___;
  212. movdqa $counters,$mask
  213. aesenc $rndkey0,@out[0]
  214. prefetcht0 15(@outptr[0],$offset) # prefetch output
  215. prefetcht0 15(@outptr[1],$offset)
  216. aesenc $rndkey0,@out[1]
  217. prefetcht0 15(@outptr[2],$offset)
  218. prefetcht0 15(@outptr[3],$offset)
  219. aesenc $rndkey0,@out[2]
  220. aesenc $rndkey0,@out[3]
  221. movups 0x80-0x78($key),$rndkey0
  222. pxor $zero,$zero
  223. aesenc $rndkey1,@out[0]
  224. pcmpgtd $zero,$mask
  225. movdqu -0x78($key),$zero # reload 0-round key
  226. aesenc $rndkey1,@out[1]
  227. paddd $mask,$counters # decrement counters
  228. movdqa $counters,32(%rsp) # update counters
  229. aesenc $rndkey1,@out[2]
  230. aesenc $rndkey1,@out[3]
  231. movups 0x90-0x78($key),$rndkey1
  232. cmp \$11,$rounds
  233. aesenc $rndkey0,@out[0]
  234. aesenc $rndkey0,@out[1]
  235. aesenc $rndkey0,@out[2]
  236. aesenc $rndkey0,@out[3]
  237. movups 0xa0-0x78($key),$rndkey0
  238. jb .Lenc4x_tail
  239. aesenc $rndkey1,@out[0]
  240. aesenc $rndkey1,@out[1]
  241. aesenc $rndkey1,@out[2]
  242. aesenc $rndkey1,@out[3]
  243. movups 0xb0-0x78($key),$rndkey1
  244. aesenc $rndkey0,@out[0]
  245. aesenc $rndkey0,@out[1]
  246. aesenc $rndkey0,@out[2]
  247. aesenc $rndkey0,@out[3]
  248. movups 0xc0-0x78($key),$rndkey0
  249. je .Lenc4x_tail
  250. aesenc $rndkey1,@out[0]
  251. aesenc $rndkey1,@out[1]
  252. aesenc $rndkey1,@out[2]
  253. aesenc $rndkey1,@out[3]
  254. movups 0xd0-0x78($key),$rndkey1
  255. aesenc $rndkey0,@out[0]
  256. aesenc $rndkey0,@out[1]
  257. aesenc $rndkey0,@out[2]
  258. aesenc $rndkey0,@out[3]
  259. movups 0xe0-0x78($key),$rndkey0
  260. jmp .Lenc4x_tail
  261. .align 32
  262. .Lenc4x_tail:
  263. aesenc $rndkey1,@out[0]
  264. aesenc $rndkey1,@out[1]
  265. aesenc $rndkey1,@out[2]
  266. aesenc $rndkey1,@out[3]
  267. movdqu (@inptr[0],$offset),@inp[0]
  268. movdqu 0x10-0x78($key),$rndkey1
  269. aesenclast $rndkey0,@out[0]
  270. movdqu (@inptr[1],$offset),@inp[1]
  271. pxor $zero,@inp[0]
  272. aesenclast $rndkey0,@out[1]
  273. movdqu (@inptr[2],$offset),@inp[2]
  274. pxor $zero,@inp[1]
  275. aesenclast $rndkey0,@out[2]
  276. movdqu (@inptr[3],$offset),@inp[3]
  277. pxor $zero,@inp[2]
  278. aesenclast $rndkey0,@out[3]
  279. movdqu 0x20-0x78($key),$rndkey0
  280. pxor $zero,@inp[3]
  281. movups @out[0],-16(@outptr[0],$offset)
  282. pxor @inp[0],@out[0]
  283. movups @out[1],-16(@outptr[1],$offset)
  284. pxor @inp[1],@out[1]
  285. movups @out[2],-16(@outptr[2],$offset)
  286. pxor @inp[2],@out[2]
  287. movups @out[3],-16(@outptr[3],$offset)
  288. pxor @inp[3],@out[3]
  289. dec $num
  290. jnz .Loop_enc4x
  291. mov 16(%rsp),%rax # original %rsp
  292. .cfi_def_cfa %rax,8
  293. mov 24(%rsp),$num
  294. #pxor @inp[0],@out[0]
  295. #pxor @inp[1],@out[1]
  296. #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
  297. #pxor @inp[2],@out[2]
  298. #movdqu @out[1],`40*1+24-40*2`($inp)
  299. #pxor @inp[3],@out[3]
  300. #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
  301. #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
  302. lea `40*4`($inp),$inp
  303. dec $num
  304. jnz .Lenc4x_loop_grande
  305. .Lenc4x_done:
  306. ___
  # Win64 builds reload %xmm6-%xmm12 from the save area below the original
  # %rsp (held in %rax); the reloads of %xmm13-%xmm15 are commented out.
  307. $code.=<<___ if ($win64);
  308. movaps -0xd8(%rax),%xmm6
  309. movaps -0xc8(%rax),%xmm7
  310. movaps -0xb8(%rax),%xmm8
  311. movaps -0xa8(%rax),%xmm9
  312. movaps -0x98(%rax),%xmm10
  313. movaps -0x88(%rax),%xmm11
  314. movaps -0x78(%rax),%xmm12
  315. #movaps -0x68(%rax),%xmm13
  316. #movaps -0x58(%rax),%xmm14
  317. #movaps -0x48(%rax),%xmm15
  318. ___
  # Reload the six pushed registers relative to %rax, restore %rsp and
  # return. The same here-document then opens the global entry point
  # aesni_multi_cbc_decrypt.
  319. $code.=<<___;
  320. mov -48(%rax),%r15
  321. .cfi_restore %r15
  322. mov -40(%rax),%r14
  323. .cfi_restore %r14
  324. mov -32(%rax),%r13
  325. .cfi_restore %r13
  326. mov -24(%rax),%r12
  327. .cfi_restore %r12
  328. mov -16(%rax),%rbp
  329. .cfi_restore %rbp
  330. mov -8(%rax),%rbx
  331. .cfi_restore %rbx
  332. lea (%rax),%rsp
  333. .cfi_def_cfa_register %rsp
  334. .Lenc4x_epilogue:
  335. ret
  336. .cfi_endproc
  337. .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
  338. .globl aesni_multi_cbc_decrypt
  339. .type aesni_multi_cbc_decrypt,\@function,3
  340. .align 32
  341. aesni_multi_cbc_decrypt:
  342. .cfi_startproc
  343. ___
  # Emitted only when $avx is non-zero: a call with $num >= 2 jumps to
  # _avx_cbc_dec_shortcut if the AVX bit (bit 28 of the word at
  # OPENSSL_ia32cap_P+4) is set; every other call continues at .Ldec_non_avx.
  344. $code.=<<___ if ($avx);
  345. cmp \$2,$num
  346. jb .Ldec_non_avx
  347. mov OPENSSL_ia32cap_P+4(%rip),%ecx
  348. test \$`1<<28`,%ecx # AVX bit
  349. jnz _avx_cbc_dec_shortcut
  350. jmp .Ldec_non_avx
  351. .align 16
  352. .Ldec_non_avx:
  353. ___
  # Keep the incoming %rsp in %rax and push %rbx, %rbp and %r12-%r15, which
  # this routine uses as $offset, $sink and @outptr.
  354. $code.=<<___;
  355. mov %rsp,%rax
  356. .cfi_def_cfa_register %rax
  357. push %rbx
  358. .cfi_push %rbx
  359. push %rbp
  360. .cfi_push %rbp
  361. push %r12
  362. .cfi_push %r12
  363. push %r13
  364. .cfi_push %r13
  365. push %r14
  366. .cfi_push %r14
  367. push %r15
  368. .cfi_push %r15
  369. ___
  # Win64 builds additionally spill %xmm6-%xmm15 below the pushed registers.
  370. $code.=<<___ if ($win64);
  371. lea -0xa8(%rsp),%rsp
  372. movaps %xmm6,(%rsp)
  373. movaps %xmm7,0x10(%rsp)
  374. movaps %xmm8,0x20(%rsp)
  375. movaps %xmm9,0x30(%rsp)
  376. movaps %xmm10,0x40(%rsp)
  377. movaps %xmm11,0x50(%rsp)
  378. movaps %xmm12,0x60(%rsp)
  379. movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
  380. movaps %xmm14,-0x58(%rax)
  381. movaps %xmm15,-0x48(%rax)
  382. ___
  # Reserve and 64-byte-align the scratch frame, remember the original %rsp
  # at 16(%rsp), load the 0-round key into $zero, and bias $key by 0x78 and
  # $inp by 40*2. .Ldec4x_loop_grande then saves $num at 24(%rsp) and
  # clears it.
  383. $code.=<<___;
  384. # stack layout
  385. #
  386. # +0 output sink
  387. # +16 input sink [original %rsp and $num]
  388. # +32 counters
  389. sub \$48,%rsp
  390. and \$-64,%rsp
  391. mov %rax,16(%rsp) # original %rsp
  392. .cfi_cfa_expression %rsp+16,deref,+8
  393. .Ldec4x_body:
  394. movdqu ($key),$zero # 0-round key
  395. lea 0x78($key),$key # size optimization
  396. lea 40*2($inp),$inp
  397. .Ldec4x_loop_grande:
  398. mov $num,24(%rsp) # original $num
  399. xor $num,$num
  400. ___
  # For each of the four streams, read its 40-byte descriptor: block count
  # (offset 16), input pointer (offset 0), output pointer (offset 8) and IV
  # (offset 24). Here the IV is loaded into @inp[$i]. $num accumulates the
  # largest block count, each count is stored at 32+4*$i(%rsp), and a stream
  # whose count is <= 0 has its input pointer replaced by %rsp.
  401. for($i=0;$i<4;$i++) {
  402. $code.=<<___;
  403. mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
  404. mov `40*$i+0-40*2`($inp),@inptr[$i]
  405. cmp $num,$one
  406. mov `40*$i+8-40*2`($inp),@outptr[$i]
  407. cmovg $one,$num # find maximum
  408. test $one,$one
  409. movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
  410. mov $one,`32+4*$i`(%rsp) # initialize counters
  411. cmovle %rsp,@inptr[$i] # cancel input
  412. ___
  413. }
  # Leave early when no stream has any blocks. Otherwise load the first two
  # round keys and the round count, load the first input block of every
  # stream into @out and XOR it with the 0-round key, then enter .Loop_dec4x.
  # The loop opens with one AES decryption round interleaved with prefetches
  # of the next input block of each of the four streams.
  414. $code.=<<___;
  415. test $num,$num
  416. jz .Ldec4x_done
  417. movups 0x10-0x78($key),$rndkey1
  418. movups 0x20-0x78($key),$rndkey0
  419. mov 0xf0-0x78($key),$rounds
  420. movdqu (@inptr[0]),@out[0] # load inputs
  421. movdqu (@inptr[1]),@out[1]
  422. pxor $zero,@out[0]
  423. movdqu (@inptr[2]),@out[2]
  424. pxor $zero,@out[1]
  425. movdqu (@inptr[3]),@out[3]
  426. pxor $zero,@out[2]
  427. pxor $zero,@out[3]
  428. movdqa 32(%rsp),$counters # load counters
  429. xor $offset,$offset
  430. jmp .Loop_dec4x
  431. .align 32
  432. .Loop_dec4x:
  433. add \$16,$offset
  434. lea 16(%rsp),$sink # sink pointer
  435. mov \$1,$one # constant of 1
  436. sub $offset,$sink
  437. aesdec $rndkey1,@out[0]
  438. prefetcht0 31(@inptr[0],$offset) # prefetch input
  439. prefetcht0 31(@inptr[1],$offset)
  440. aesdec $rndkey1,@out[1]
  441. prefetcht0 31(@inptr[2],$offset)
  442. prefetcht0 31(@inptr[3],$offset)
  443. aesdec $rndkey1,@out[2]
  444. aesdec $rndkey1,@out[3]
  445. movups 0x30-0x78($key),$rndkey1
  446. ___
  # Four further AES decryption rounds on all four streams. Iteration $i
  # also compares the constant 1 in $one with stream $i's counter at
  # 32+4*$i(%rsp): the input pointer is redirected to the sink when
  # 1 >= counter, the output pointer when 1 > counter.
  447. for($i=0;$i<4;$i++) {
  448. my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; # alternate between the two key registers
  449. $code.=<<___;
  450. cmp `32+4*$i`(%rsp),$one
  451. aesdec $rndkey,@out[0]
  452. aesdec $rndkey,@out[1]
  453. aesdec $rndkey,@out[2]
  454. cmovge $sink,@inptr[$i] # cancel input
  455. cmovg $sink,@outptr[$i] # sink output
  456. aesdec $rndkey,@out[3]
  457. movups `0x40+16*$i-0x78`($key),$rndkey
  458. ___
  459. }
  # Rest of the .Loop_dec4x body. The middle rounds are interleaved with
  # output prefetches and with the counter update: a mask derived from the
  # counters by pcmpgtd against zero is added back to them and the result
  # is stored at 32(%rsp). $rounds is compared with 11 to choose how many
  # extra rounds run before .Ldec4x_tail. In the tail the last round key in
  # $rndkey0 is XORed into @inp (the IV or the previous input block) and the
  # result is used as the aesdeclast operand. The block at
  # -16(@inptr[i],$offset) is then loaded as the next IV, the finished blocks
  # are stored at -16(@outptr[i],$offset), and the next inputs are loaded
  # into @out and XORed with the 0-round key. The inner loop repeats while
  # $num is non-zero; afterwards the saved $num is reloaded from 24(%rsp),
  # $inp advances by 40*4, and the outer loop repeats until that count
  # reaches zero.
  460. $code.=<<___;
  461. movdqa $counters,$mask
  462. aesdec $rndkey0,@out[0]
  463. prefetcht0 15(@outptr[0],$offset) # prefetch output
  464. prefetcht0 15(@outptr[1],$offset)
  465. aesdec $rndkey0,@out[1]
  466. prefetcht0 15(@outptr[2],$offset)
  467. prefetcht0 15(@outptr[3],$offset)
  468. aesdec $rndkey0,@out[2]
  469. aesdec $rndkey0,@out[3]
  470. movups 0x80-0x78($key),$rndkey0
  471. pxor $zero,$zero
  472. aesdec $rndkey1,@out[0]
  473. pcmpgtd $zero,$mask
  474. movdqu -0x78($key),$zero # reload 0-round key
  475. aesdec $rndkey1,@out[1]
  476. paddd $mask,$counters # decrement counters
  477. movdqa $counters,32(%rsp) # update counters
  478. aesdec $rndkey1,@out[2]
  479. aesdec $rndkey1,@out[3]
  480. movups 0x90-0x78($key),$rndkey1
  481. cmp \$11,$rounds
  482. aesdec $rndkey0,@out[0]
  483. aesdec $rndkey0,@out[1]
  484. aesdec $rndkey0,@out[2]
  485. aesdec $rndkey0,@out[3]
  486. movups 0xa0-0x78($key),$rndkey0
  487. jb .Ldec4x_tail
  488. aesdec $rndkey1,@out[0]
  489. aesdec $rndkey1,@out[1]
  490. aesdec $rndkey1,@out[2]
  491. aesdec $rndkey1,@out[3]
  492. movups 0xb0-0x78($key),$rndkey1
  493. aesdec $rndkey0,@out[0]
  494. aesdec $rndkey0,@out[1]
  495. aesdec $rndkey0,@out[2]
  496. aesdec $rndkey0,@out[3]
  497. movups 0xc0-0x78($key),$rndkey0
  498. je .Ldec4x_tail
  499. aesdec $rndkey1,@out[0]
  500. aesdec $rndkey1,@out[1]
  501. aesdec $rndkey1,@out[2]
  502. aesdec $rndkey1,@out[3]
  503. movups 0xd0-0x78($key),$rndkey1
  504. aesdec $rndkey0,@out[0]
  505. aesdec $rndkey0,@out[1]
  506. aesdec $rndkey0,@out[2]
  507. aesdec $rndkey0,@out[3]
  508. movups 0xe0-0x78($key),$rndkey0
  509. jmp .Ldec4x_tail
  510. .align 32
  511. .Ldec4x_tail:
  512. aesdec $rndkey1,@out[0]
  513. aesdec $rndkey1,@out[1]
  514. aesdec $rndkey1,@out[2]
  515. pxor $rndkey0,@inp[0]
  516. pxor $rndkey0,@inp[1]
  517. aesdec $rndkey1,@out[3]
  518. movdqu 0x10-0x78($key),$rndkey1
  519. pxor $rndkey0,@inp[2]
  520. pxor $rndkey0,@inp[3]
  521. movdqu 0x20-0x78($key),$rndkey0
  522. aesdeclast @inp[0],@out[0]
  523. aesdeclast @inp[1],@out[1]
  524. movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
  525. movdqu -16(@inptr[1],$offset),@inp[1]
  526. aesdeclast @inp[2],@out[2]
  527. aesdeclast @inp[3],@out[3]
  528. movdqu -16(@inptr[2],$offset),@inp[2]
  529. movdqu -16(@inptr[3],$offset),@inp[3]
  530. movups @out[0],-16(@outptr[0],$offset)
  531. movdqu (@inptr[0],$offset),@out[0]
  532. movups @out[1],-16(@outptr[1],$offset)
  533. movdqu (@inptr[1],$offset),@out[1]
  534. pxor $zero,@out[0]
  535. movups @out[2],-16(@outptr[2],$offset)
  536. movdqu (@inptr[2],$offset),@out[2]
  537. pxor $zero,@out[1]
  538. movups @out[3],-16(@outptr[3],$offset)
  539. movdqu (@inptr[3],$offset),@out[3]
  540. pxor $zero,@out[2]
  541. pxor $zero,@out[3]
  542. dec $num
  543. jnz .Loop_dec4x
  544. mov 16(%rsp),%rax # original %rsp
  545. .cfi_def_cfa %rax,8
  546. mov 24(%rsp),$num
  547. lea `40*4`($inp),$inp
  548. dec $num
  549. jnz .Ldec4x_loop_grande
  550. .Ldec4x_done:
  551. ___
  # Win64 builds reload %xmm6-%xmm12 from the save area below the original
  # %rsp (held in %rax); the reloads of %xmm13-%xmm15 are commented out.
  552. $code.=<<___ if ($win64);
  553. movaps -0xd8(%rax),%xmm6
  554. movaps -0xc8(%rax),%xmm7
  555. movaps -0xb8(%rax),%xmm8
  556. movaps -0xa8(%rax),%xmm9
  557. movaps -0x98(%rax),%xmm10
  558. movaps -0x88(%rax),%xmm11
  559. movaps -0x78(%rax),%xmm12
  560. #movaps -0x68(%rax),%xmm13
  561. #movaps -0x58(%rax),%xmm14
  562. #movaps -0x48(%rax),%xmm15
  563. ___
  # Reload the six pushed registers relative to %rax, restore %rsp, return,
  # and close aesni_multi_cbc_decrypt with its .size directive.
  564. $code.=<<___;
  565. mov -48(%rax),%r15
  566. .cfi_restore %r15
  567. mov -40(%rax),%r14
  568. .cfi_restore %r14
  569. mov -32(%rax),%r13
  570. .cfi_restore %r13
  571. mov -24(%rax),%r12
  572. .cfi_restore %r12
  573. mov -16(%rax),%rbp
  574. .cfi_restore %rbp
  575. mov -8(%rax),%rbx
  576. .cfi_restore %rbx
  577. lea (%rax),%rsp
  578. .cfi_def_cfa_register %rsp
  579. .Ldec4x_epilogue:
  580. ret
  581. .cfi_endproc
  582. .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
  583. ___
  584. if ($avx) {{{
  585. my @ptr=map("%r$_",(8..15));
  586. my $offload=$sink;
  587. my @out=map("%xmm$_",(2..9));
  588. my @inp=map("%xmm$_",(10..13));
  589. my ($counters,$zero)=("%xmm14","%xmm15");
  590. $code.=<<___;
  591. .type aesni_multi_cbc_encrypt_avx,\@function,3
  592. .align 32
  593. aesni_multi_cbc_encrypt_avx:
  594. .cfi_startproc
  595. _avx_cbc_enc_shortcut:
  596. mov %rsp,%rax
  597. .cfi_def_cfa_register %rax
  598. push %rbx
  599. .cfi_push %rbx
  600. push %rbp
  601. .cfi_push %rbp
  602. push %r12
  603. .cfi_push %r12
  604. push %r13
  605. .cfi_push %r13
  606. push %r14
  607. .cfi_push %r14
  608. push %r15
  609. .cfi_push %r15
  610. ___
  611. $code.=<<___ if ($win64);
  612. lea -0xa8(%rsp),%rsp
  613. movaps %xmm6,(%rsp)
  614. movaps %xmm7,0x10(%rsp)
  615. movaps %xmm8,0x20(%rsp)
  616. movaps %xmm9,0x30(%rsp)
  617. movaps %xmm10,0x40(%rsp)
  618. movaps %xmm11,0x50(%rsp)
  619. movaps %xmm12,-0x78(%rax)
  620. movaps %xmm13,-0x68(%rax)
  621. movaps %xmm14,-0x58(%rax)
  622. movaps %xmm15,-0x48(%rax)
  623. ___
  624. $code.=<<___;
  625. # stack layout
  626. #
  627. # +0 output sink
  628. # +16 input sink [original %rsp and $num]
  629. # +32 counters
  630. # +64 distances between inputs and outputs
  631. # +128 off-load area for @inp[0..3]
  632. sub \$192,%rsp
  633. and \$-128,%rsp
  634. mov %rax,16(%rsp) # original %rsp
  635. .cfi_cfa_expression %rsp+16,deref,+8
  636. .Lenc8x_body:
  637. vzeroupper
  638. vmovdqu ($key),$zero # 0-round key
  639. lea 0x78($key),$key # size optimization
  640. lea 40*4($inp),$inp
  641. shr \$1,$num
  642. .Lenc8x_loop_grande:
  643. #mov $num,24(%rsp) # original $num
  644. xor $num,$num
  645. ___
  646. for($i=0;$i<8;$i++) {
  647. my $temp = $i ? $offload : $offset;
  648. $code.=<<___;
  649. mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
  650. mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
  651. cmp $num,$one
  652. mov `40*$i+8-40*4`($inp),$temp # output pointer
  653. cmovg $one,$num # find maximum
  654. test $one,$one
  655. vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
  656. mov $one,`32+4*$i`(%rsp) # initialize counters
  657. cmovle %rsp,@ptr[$i] # cancel input
  658. sub @ptr[$i],$temp # distance between input and output
  659. mov $temp,`64+8*$i`(%rsp) # initialize distances
  660. ___
  661. }
  662. $code.=<<___;
  663. test $num,$num
  664. jz .Lenc8x_done
  665. vmovups 0x10-0x78($key),$rndkey1
  666. vmovups 0x20-0x78($key),$rndkey0
  667. mov 0xf0-0x78($key),$rounds
  668. vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
  669. lea 128(%rsp),$offload # offload area
  670. vpxor (@ptr[1]),$zero,@inp[1]
  671. vpxor (@ptr[2]),$zero,@inp[2]
  672. vpxor (@ptr[3]),$zero,@inp[3]
  673. vpxor @inp[0],@out[0],@out[0]
  674. vpxor (@ptr[4]),$zero,@inp[0]
  675. vpxor @inp[1],@out[1],@out[1]
  676. vpxor (@ptr[5]),$zero,@inp[1]
  677. vpxor @inp[2],@out[2],@out[2]
  678. vpxor (@ptr[6]),$zero,@inp[2]
  679. vpxor @inp[3],@out[3],@out[3]
  680. vpxor (@ptr[7]),$zero,@inp[3]
  681. vpxor @inp[0],@out[4],@out[4]
  682. mov \$1,$one # constant of 1
  683. vpxor @inp[1],@out[5],@out[5]
  684. vpxor @inp[2],@out[6],@out[6]
  685. vpxor @inp[3],@out[7],@out[7]
  686. jmp .Loop_enc8x
  687. .align 32
  688. .Loop_enc8x:
  689. ___
  690. for($i=0;$i<8;$i++) {
  691. my $rndkey=($i&1)?$rndkey0:$rndkey1;
  692. $code.=<<___;
  693. vaesenc $rndkey,@out[0],@out[0]
  694. cmp 32+4*$i(%rsp),$one
  695. ___
  696. $code.=<<___ if ($i);
  697. mov 64+8*$i(%rsp),$offset
  698. ___
  699. $code.=<<___;
  700. vaesenc $rndkey,@out[1],@out[1]
  701. prefetcht0 31(@ptr[$i]) # prefetch input
  702. vaesenc $rndkey,@out[2],@out[2]
  703. ___
  704. $code.=<<___ if ($i>1);
  705. prefetcht0 15(@ptr[$i-2]) # prefetch output
  706. ___
  707. $code.=<<___;
  708. vaesenc $rndkey,@out[3],@out[3]
  709. lea (@ptr[$i],$offset),$offset
  710. cmovge %rsp,@ptr[$i] # cancel input
  711. vaesenc $rndkey,@out[4],@out[4]
  712. cmovg %rsp,$offset # sink output
  713. vaesenc $rndkey,@out[5],@out[5]
  714. sub @ptr[$i],$offset
  715. vaesenc $rndkey,@out[6],@out[6]
  716. vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
  717. mov $offset,64+8*$i(%rsp)
  718. vaesenc $rndkey,@out[7],@out[7]
  719. vmovups `16*(3+$i)-0x78`($key),$rndkey
  720. lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
  721. ___
  722. $code.=<<___ if ($i<4)
  723. vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
  724. ___
  725. }
  726. $code.=<<___;
  727. vmovdqu 32(%rsp),$counters
  728. prefetcht0 15(@ptr[$i-2]) # prefetch output
  729. prefetcht0 15(@ptr[$i-1])
  730. cmp \$11,$rounds
  731. jb .Lenc8x_tail
  732. vaesenc $rndkey1,@out[0],@out[0]
  733. vaesenc $rndkey1,@out[1],@out[1]
  734. vaesenc $rndkey1,@out[2],@out[2]
  735. vaesenc $rndkey1,@out[3],@out[3]
  736. vaesenc $rndkey1,@out[4],@out[4]
  737. vaesenc $rndkey1,@out[5],@out[5]
  738. vaesenc $rndkey1,@out[6],@out[6]
  739. vaesenc $rndkey1,@out[7],@out[7]
  740. vmovups 0xb0-0x78($key),$rndkey1
  741. vaesenc $rndkey0,@out[0],@out[0]
  742. vaesenc $rndkey0,@out[1],@out[1]
  743. vaesenc $rndkey0,@out[2],@out[2]
  744. vaesenc $rndkey0,@out[3],@out[3]
  745. vaesenc $rndkey0,@out[4],@out[4]
  746. vaesenc $rndkey0,@out[5],@out[5]
  747. vaesenc $rndkey0,@out[6],@out[6]
  748. vaesenc $rndkey0,@out[7],@out[7]
  749. vmovups 0xc0-0x78($key),$rndkey0
  750. je .Lenc8x_tail
  751. vaesenc $rndkey1,@out[0],@out[0]
  752. vaesenc $rndkey1,@out[1],@out[1]
  753. vaesenc $rndkey1,@out[2],@out[2]
  754. vaesenc $rndkey1,@out[3],@out[3]
  755. vaesenc $rndkey1,@out[4],@out[4]
  756. vaesenc $rndkey1,@out[5],@out[5]
  757. vaesenc $rndkey1,@out[6],@out[6]
  758. vaesenc $rndkey1,@out[7],@out[7]
  759. vmovups 0xd0-0x78($key),$rndkey1
  760. vaesenc $rndkey0,@out[0],@out[0]
  761. vaesenc $rndkey0,@out[1],@out[1]
  762. vaesenc $rndkey0,@out[2],@out[2]
  763. vaesenc $rndkey0,@out[3],@out[3]
  764. vaesenc $rndkey0,@out[4],@out[4]
  765. vaesenc $rndkey0,@out[5],@out[5]
  766. vaesenc $rndkey0,@out[6],@out[6]
  767. vaesenc $rndkey0,@out[7],@out[7]
  768. vmovups 0xe0-0x78($key),$rndkey0
  769. .Lenc8x_tail:
  770. vaesenc $rndkey1,@out[0],@out[0]
  771. vpxor $zero,$zero,$zero
  772. vaesenc $rndkey1,@out[1],@out[1]
  773. vaesenc $rndkey1,@out[2],@out[2]
  774. vpcmpgtd $zero,$counters,$zero
  775. vaesenc $rndkey1,@out[3],@out[3]
  776. vaesenc $rndkey1,@out[4],@out[4]
  777. vpaddd $counters,$zero,$zero # decrement counters
  778. vmovdqu 48(%rsp),$counters
  779. vaesenc $rndkey1,@out[5],@out[5]
  780. mov 64(%rsp),$offset # pre-load 1st offset
  781. vaesenc $rndkey1,@out[6],@out[6]
  782. vaesenc $rndkey1,@out[7],@out[7]
  783. vmovups 0x10-0x78($key),$rndkey1
  784. vaesenclast $rndkey0,@out[0],@out[0]
  785. vmovdqa $zero,32(%rsp) # update counters
  786. vpxor $zero,$zero,$zero
  787. vaesenclast $rndkey0,@out[1],@out[1]
  788. vaesenclast $rndkey0,@out[2],@out[2]
  789. vpcmpgtd $zero,$counters,$zero
  790. vaesenclast $rndkey0,@out[3],@out[3]
  791. vaesenclast $rndkey0,@out[4],@out[4]
  792. vpaddd $zero,$counters,$counters # decrement counters
  793. vmovdqu -0x78($key),$zero # 0-round
  794. vaesenclast $rndkey0,@out[5],@out[5]
  795. vaesenclast $rndkey0,@out[6],@out[6]
  796. vmovdqa $counters,48(%rsp) # update counters
  797. vaesenclast $rndkey0,@out[7],@out[7]
  798. vmovups 0x20-0x78($key),$rndkey0
  799. vmovups @out[0],-16(@ptr[0]) # write output
  800. sub $offset,@ptr[0] # switch to input
  801. vpxor 0x00($offload),@out[0],@out[0]
  802. vmovups @out[1],-16(@ptr[1])
  803. sub `64+1*8`(%rsp),@ptr[1]
  804. vpxor 0x10($offload),@out[1],@out[1]
  805. vmovups @out[2],-16(@ptr[2])
  806. sub `64+2*8`(%rsp),@ptr[2]
  807. vpxor 0x20($offload),@out[2],@out[2]
  808. vmovups @out[3],-16(@ptr[3])
  809. sub `64+3*8`(%rsp),@ptr[3]
  810. vpxor 0x30($offload),@out[3],@out[3]
  811. vmovups @out[4],-16(@ptr[4])
  812. sub `64+4*8`(%rsp),@ptr[4]
  813. vpxor @inp[0],@out[4],@out[4]
  814. vmovups @out[5],-16(@ptr[5])
  815. sub `64+5*8`(%rsp),@ptr[5]
  816. vpxor @inp[1],@out[5],@out[5]
  817. vmovups @out[6],-16(@ptr[6])
  818. sub `64+6*8`(%rsp),@ptr[6]
  819. vpxor @inp[2],@out[6],@out[6]
  820. vmovups @out[7],-16(@ptr[7])
  821. sub `64+7*8`(%rsp),@ptr[7]
  822. vpxor @inp[3],@out[7],@out[7]
  823. dec $num
  824. jnz .Loop_enc8x
  825. mov 16(%rsp),%rax # original %rsp
  826. .cfi_def_cfa %rax,8
  827. #mov 24(%rsp),$num
  828. #lea `40*8`($inp),$inp
  829. #dec $num
  830. #jnz .Lenc8x_loop_grande
  831. .Lenc8x_done:
  832. vzeroupper
  833. ___
  834. $code.=<<___ if ($win64);
  835. movaps -0xd8(%rax),%xmm6
  836. movaps -0xc8(%rax),%xmm7
  837. movaps -0xb8(%rax),%xmm8
  838. movaps -0xa8(%rax),%xmm9
  839. movaps -0x98(%rax),%xmm10
  840. movaps -0x88(%rax),%xmm11
  841. movaps -0x78(%rax),%xmm12
  842. movaps -0x68(%rax),%xmm13
  843. movaps -0x58(%rax),%xmm14
  844. movaps -0x48(%rax),%xmm15
  845. ___
  846. $code.=<<___;
  847. mov -48(%rax),%r15
  848. .cfi_restore %r15
  849. mov -40(%rax),%r14
  850. .cfi_restore %r14
  851. mov -32(%rax),%r13
  852. .cfi_restore %r13
  853. mov -24(%rax),%r12
  854. .cfi_restore %r12
  855. mov -16(%rax),%rbp
  856. .cfi_restore %rbp
  857. mov -8(%rax),%rbx
  858. .cfi_restore %rbx
  859. lea (%rax),%rsp
  860. .cfi_def_cfa_register %rsp
  861. .Lenc8x_epilogue:
  862. ret
  863. .cfi_endproc
  864. .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
  865. .type aesni_multi_cbc_decrypt_avx,\@function,3
  866. .align 32
  867. aesni_multi_cbc_decrypt_avx:
  868. .cfi_startproc
  869. _avx_cbc_dec_shortcut:
  870. mov %rsp,%rax
  871. .cfi_def_cfa_register %rax
  872. push %rbx
  873. .cfi_push %rbx
  874. push %rbp
  875. .cfi_push %rbp
  876. push %r12
  877. .cfi_push %r12
  878. push %r13
  879. .cfi_push %r13
  880. push %r14
  881. .cfi_push %r14
  882. push %r15
  883. .cfi_push %r15
  884. ___
  885. $code.=<<___ if ($win64);
  886. lea -0xa8(%rsp),%rsp
  887. movaps %xmm6,(%rsp)
  888. movaps %xmm7,0x10(%rsp)
  889. movaps %xmm8,0x20(%rsp)
  890. movaps %xmm9,0x30(%rsp)
  891. movaps %xmm10,0x40(%rsp)
  892. movaps %xmm11,0x50(%rsp)
  893. movaps %xmm12,-0x78(%rax)
  894. movaps %xmm13,-0x68(%rax)
  895. movaps %xmm14,-0x58(%rax)
  896. movaps %xmm15,-0x48(%rax)
  897. ___
  898. $code.=<<___;
  899. # stack layout
  900. #
  901. # +0 output sink
  902. # +16 input sink [original %rsp and $num]
  903. # +32 counters
  904. # +64 distances between inputs and outputs
  905. # +128 off-load area for @inp[0..3]
  906. # +192 IV/input offload
  907. sub \$256,%rsp
  908. and \$-256,%rsp
  909. sub \$192,%rsp
  910. mov %rax,16(%rsp) # original %rsp
  911. .cfi_cfa_expression %rsp+16,deref,+8
  912. .Ldec8x_body:
  913. vzeroupper
  914. vmovdqu ($key),$zero # 0-round key
  915. lea 0x78($key),$key # size optimization
  916. lea 40*4($inp),$inp
  917. shr \$1,$num
  918. .Ldec8x_loop_grande:
  919. #mov $num,24(%rsp) # original $num
  920. xor $num,$num
  921. ___
  922. for($i=0;$i<8;$i++) {
  923. my $temp = $i ? $offload : $offset;
  924. $code.=<<___;
  925. mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
  926. mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
  927. cmp $num,$one
  928. mov `40*$i+8-40*4`($inp),$temp # output pointer
  929. cmovg $one,$num # find maximum
  930. test $one,$one
  931. vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
  932. mov $one,`32+4*$i`(%rsp) # initialize counters
  933. cmovle %rsp,@ptr[$i] # cancel input
  934. sub @ptr[$i],$temp # distance between input and output
  935. mov $temp,`64+8*$i`(%rsp) # initialize distances
  936. vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
  937. ___
  938. }
  939. $code.=<<___;
  940. test $num,$num
  941. jz .Ldec8x_done
  942. vmovups 0x10-0x78($key),$rndkey1
  943. vmovups 0x20-0x78($key),$rndkey0
  944. mov 0xf0-0x78($key),$rounds
  945. lea 192+128(%rsp),$offload # offload area
  946. vmovdqu (@ptr[0]),@out[0] # load inputs
  947. vmovdqu (@ptr[1]),@out[1]
  948. vmovdqu (@ptr[2]),@out[2]
  949. vmovdqu (@ptr[3]),@out[3]
  950. vmovdqu (@ptr[4]),@out[4]
  951. vmovdqu (@ptr[5]),@out[5]
  952. vmovdqu (@ptr[6]),@out[6]
  953. vmovdqu (@ptr[7]),@out[7]
  954. vmovdqu @out[0],0x00($offload) # offload inputs
  955. vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
  956. vmovdqu @out[1],0x10($offload)
  957. vpxor $zero,@out[1],@out[1]
  958. vmovdqu @out[2],0x20($offload)
  959. vpxor $zero,@out[2],@out[2]
  960. vmovdqu @out[3],0x30($offload)
  961. vpxor $zero,@out[3],@out[3]
  962. vmovdqu @out[4],0x40($offload)
  963. vpxor $zero,@out[4],@out[4]
  964. vmovdqu @out[5],0x50($offload)
  965. vpxor $zero,@out[5],@out[5]
  966. vmovdqu @out[6],0x60($offload)
  967. vpxor $zero,@out[6],@out[6]
  968. vmovdqu @out[7],0x70($offload)
  969. vpxor $zero,@out[7],@out[7]
  970. xor \$0x80,$offload
  971. mov \$1,$one # constant of 1
  972. jmp .Loop_dec8x
  973. .align 32
  974. .Loop_dec8x:
  975. ___
  976. for($i=0;$i<8;$i++) {
  977. my $rndkey=($i&1)?$rndkey0:$rndkey1;
  978. $code.=<<___;
  979. vaesdec $rndkey,@out[0],@out[0]
  980. cmp 32+4*$i(%rsp),$one
  981. ___
  982. $code.=<<___ if ($i);
  983. mov 64+8*$i(%rsp),$offset
  984. ___
  985. $code.=<<___;
  986. vaesdec $rndkey,@out[1],@out[1]
  987. prefetcht0 31(@ptr[$i]) # prefetch input
  988. vaesdec $rndkey,@out[2],@out[2]
  989. ___
  990. $code.=<<___ if ($i>1);
  991. prefetcht0 15(@ptr[$i-2]) # prefetch output
  992. ___
  993. $code.=<<___;
  994. vaesdec $rndkey,@out[3],@out[3]
  995. lea (@ptr[$i],$offset),$offset
  996. cmovge %rsp,@ptr[$i] # cancel input
  997. vaesdec $rndkey,@out[4],@out[4]
  998. cmovg %rsp,$offset # sink output
  999. vaesdec $rndkey,@out[5],@out[5]
  1000. sub @ptr[$i],$offset
  1001. vaesdec $rndkey,@out[6],@out[6]
  1002. vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
  1003. mov $offset,64+8*$i(%rsp)
  1004. vaesdec $rndkey,@out[7],@out[7]
  1005. vmovups `16*(3+$i)-0x78`($key),$rndkey
  1006. lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
  1007. ___
  1008. $code.=<<___ if ($i<4);
  1009. vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
  1010. ___
  1011. }
  1012. $code.=<<___;
  1013. vmovdqu 32(%rsp),$counters
  1014. prefetcht0 15(@ptr[$i-2]) # prefetch output
  1015. prefetcht0 15(@ptr[$i-1])
  1016. cmp \$11,$rounds
  1017. jb .Ldec8x_tail
  1018. vaesdec $rndkey1,@out[0],@out[0]
  1019. vaesdec $rndkey1,@out[1],@out[1]
  1020. vaesdec $rndkey1,@out[2],@out[2]
  1021. vaesdec $rndkey1,@out[3],@out[3]
  1022. vaesdec $rndkey1,@out[4],@out[4]
  1023. vaesdec $rndkey1,@out[5],@out[5]
  1024. vaesdec $rndkey1,@out[6],@out[6]
  1025. vaesdec $rndkey1,@out[7],@out[7]
  1026. vmovups 0xb0-0x78($key),$rndkey1
  1027. vaesdec $rndkey0,@out[0],@out[0]
  1028. vaesdec $rndkey0,@out[1],@out[1]
  1029. vaesdec $rndkey0,@out[2],@out[2]
  1030. vaesdec $rndkey0,@out[3],@out[3]
  1031. vaesdec $rndkey0,@out[4],@out[4]
  1032. vaesdec $rndkey0,@out[5],@out[5]
  1033. vaesdec $rndkey0,@out[6],@out[6]
  1034. vaesdec $rndkey0,@out[7],@out[7]
  1035. vmovups 0xc0-0x78($key),$rndkey0
  1036. je .Ldec8x_tail
  1037. vaesdec $rndkey1,@out[0],@out[0]
  1038. vaesdec $rndkey1,@out[1],@out[1]
  1039. vaesdec $rndkey1,@out[2],@out[2]
  1040. vaesdec $rndkey1,@out[3],@out[3]
  1041. vaesdec $rndkey1,@out[4],@out[4]
  1042. vaesdec $rndkey1,@out[5],@out[5]
  1043. vaesdec $rndkey1,@out[6],@out[6]
  1044. vaesdec $rndkey1,@out[7],@out[7]
  1045. vmovups 0xd0-0x78($key),$rndkey1
  1046. vaesdec $rndkey0,@out[0],@out[0]
  1047. vaesdec $rndkey0,@out[1],@out[1]
  1048. vaesdec $rndkey0,@out[2],@out[2]
  1049. vaesdec $rndkey0,@out[3],@out[3]
  1050. vaesdec $rndkey0,@out[4],@out[4]
  1051. vaesdec $rndkey0,@out[5],@out[5]
  1052. vaesdec $rndkey0,@out[6],@out[6]
  1053. vaesdec $rndkey0,@out[7],@out[7]
  1054. vmovups 0xe0-0x78($key),$rndkey0
  1055. .Ldec8x_tail:
  1056. vaesdec $rndkey1,@out[0],@out[0]
  1057. vpxor $zero,$zero,$zero
  1058. vaesdec $rndkey1,@out[1],@out[1]
  1059. vaesdec $rndkey1,@out[2],@out[2]
  1060. vpcmpgtd $zero,$counters,$zero
  1061. vaesdec $rndkey1,@out[3],@out[3]
  1062. vaesdec $rndkey1,@out[4],@out[4]
  1063. vpaddd $counters,$zero,$zero # decrement counters
  1064. vmovdqu 48(%rsp),$counters
  1065. vaesdec $rndkey1,@out[5],@out[5]
  1066. mov 64(%rsp),$offset # pre-load 1st offset
  1067. vaesdec $rndkey1,@out[6],@out[6]
  1068. vaesdec $rndkey1,@out[7],@out[7]
  1069. vmovups 0x10-0x78($key),$rndkey1
  1070. vaesdeclast $rndkey0,@out[0],@out[0]
  1071. vmovdqa $zero,32(%rsp) # update counters
  1072. vpxor $zero,$zero,$zero
  1073. vaesdeclast $rndkey0,@out[1],@out[1]
  1074. vpxor 0x00($offload),@out[0],@out[0] # xor with IV
  1075. vaesdeclast $rndkey0,@out[2],@out[2]
  1076. vpxor 0x10($offload),@out[1],@out[1]
  1077. vpcmpgtd $zero,$counters,$zero
  1078. vaesdeclast $rndkey0,@out[3],@out[3]
  1079. vpxor 0x20($offload),@out[2],@out[2]
  1080. vaesdeclast $rndkey0,@out[4],@out[4]
  1081. vpxor 0x30($offload),@out[3],@out[3]
  1082. vpaddd $zero,$counters,$counters # decrement counters
  1083. vmovdqu -0x78($key),$zero # 0-round
  1084. vaesdeclast $rndkey0,@out[5],@out[5]
  1085. vpxor 0x40($offload),@out[4],@out[4]
  1086. vaesdeclast $rndkey0,@out[6],@out[6]
  1087. vpxor 0x50($offload),@out[5],@out[5]
  1088. vmovdqa $counters,48(%rsp) # update counters
  1089. vaesdeclast $rndkey0,@out[7],@out[7]
  1090. vpxor 0x60($offload),@out[6],@out[6]
  1091. vmovups 0x20-0x78($key),$rndkey0
  1092. vmovups @out[0],-16(@ptr[0]) # write output
  1093. sub $offset,@ptr[0] # switch to input
  1094. vmovdqu 128+0(%rsp),@out[0]
  1095. vpxor 0x70($offload),@out[7],@out[7]
  1096. vmovups @out[1],-16(@ptr[1])
  1097. sub `64+1*8`(%rsp),@ptr[1]
  1098. vmovdqu @out[0],0x00($offload)
  1099. vpxor $zero,@out[0],@out[0]
  1100. vmovdqu 128+16(%rsp),@out[1]
  1101. vmovups @out[2],-16(@ptr[2])
  1102. sub `64+2*8`(%rsp),@ptr[2]
  1103. vmovdqu @out[1],0x10($offload)
  1104. vpxor $zero,@out[1],@out[1]
  1105. vmovdqu 128+32(%rsp),@out[2]
  1106. vmovups @out[3],-16(@ptr[3])
  1107. sub `64+3*8`(%rsp),@ptr[3]
  1108. vmovdqu @out[2],0x20($offload)
  1109. vpxor $zero,@out[2],@out[2]
  1110. vmovdqu 128+48(%rsp),@out[3]
  1111. vmovups @out[4],-16(@ptr[4])
  1112. sub `64+4*8`(%rsp),@ptr[4]
  1113. vmovdqu @out[3],0x30($offload)
  1114. vpxor $zero,@out[3],@out[3]
  1115. vmovdqu @inp[0],0x40($offload)
  1116. vpxor @inp[0],$zero,@out[4]
  1117. vmovups @out[5],-16(@ptr[5])
  1118. sub `64+5*8`(%rsp),@ptr[5]
  1119. vmovdqu @inp[1],0x50($offload)
  1120. vpxor @inp[1],$zero,@out[5]
  1121. vmovups @out[6],-16(@ptr[6])
  1122. sub `64+6*8`(%rsp),@ptr[6]
  1123. vmovdqu @inp[2],0x60($offload)
  1124. vpxor @inp[2],$zero,@out[6]
  1125. vmovups @out[7],-16(@ptr[7])
  1126. sub `64+7*8`(%rsp),@ptr[7]
  1127. vmovdqu @inp[3],0x70($offload)
  1128. vpxor @inp[3],$zero,@out[7]
  1129. xor \$128,$offload
  1130. dec $num
  1131. jnz .Loop_dec8x
  1132. mov 16(%rsp),%rax # original %rsp
  1133. .cfi_def_cfa %rax,8
  1134. #mov 24(%rsp),$num
  1135. #lea `40*8`($inp),$inp
  1136. #dec $num
  1137. #jnz .Ldec8x_loop_grande
  1138. .Ldec8x_done:
  1139. vzeroupper
  1140. ___
  1141. $code.=<<___ if ($win64);
  1142. movaps -0xd8(%rax),%xmm6
  1143. movaps -0xc8(%rax),%xmm7
  1144. movaps -0xb8(%rax),%xmm8
  1145. movaps -0xa8(%rax),%xmm9
  1146. movaps -0x98(%rax),%xmm10
  1147. movaps -0x88(%rax),%xmm11
  1148. movaps -0x78(%rax),%xmm12
  1149. movaps -0x68(%rax),%xmm13
  1150. movaps -0x58(%rax),%xmm14
  1151. movaps -0x48(%rax),%xmm15
  1152. ___
  1153. $code.=<<___;
  1154. mov -48(%rax),%r15
  1155. .cfi_restore %r15
  1156. mov -40(%rax),%r14
  1157. .cfi_restore %r14
  1158. mov -32(%rax),%r13
  1159. .cfi_restore %r13
  1160. mov -24(%rax),%r12
  1161. .cfi_restore %r12
  1162. mov -16(%rax),%rbp
  1163. .cfi_restore %rbp
  1164. mov -8(%rax),%rbx
  1165. .cfi_restore %rbx
  1166. lea (%rax),%rsp
  1167. .cfi_def_cfa_register %rsp
  1168. .Ldec8x_epilogue:
  1169. ret
  1170. .cfi_endproc
  1171. .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
  1172. ___
  1173. }}}
  # Win64-only: append to $code a structured-exception handler (se_handler)
  # together with the .pdata/.xdata records that attach it to each
  # multi-buffer routine.  Nothing here is emitted unless $win64 is true.
  1174. if ($win64) {
  1175. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1176. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  # Symbolic names for the registers holding the handler's four arguments;
  # only $context and $disp are interpolated into the assembly below.
  1177. $rec="%rcx";
  1178. $frame="%rdx";
  1179. $context="%r8";
  1180. $disp="%r9";
  # se_handler, as emitted by the heredoc that follows:
  #  - pushes rsi, rdi, rbx, rbp, r12-r15 and the flags, reserves 64 bytes;
  #  - loads context->Rax and context->Rip, disp->ImageBase and
  #    disp->HandlerData, and compares Rip against HandlerData[0] (body
  #    label) and HandlerData[1] (epilogue label);
  #  - only when Rip lies between the two labels does it fetch the saved
  #    stack pointer from 16(context->Rsp), write the saved rbx, rbp and
  #    r12-r15 back into the context and copy 20 quadwords starting at
  #    -56-10*16 from that pointer into context.Xmm6 onwards;
  #  - at .Lin_prologue it stores rax as context->Rsp, reloads Rsi/Rdi from
  #    16(%rax)/8(%rax), copies the context into disp->ContextRecord, calls
  #    RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
  # The same heredoc opens .pdata with begin/end/info RVA triples for the
  # non-AVX encrypt and decrypt routines.
  1181. $code.=<<___;
  1182. .extern __imp_RtlVirtualUnwind
  1183. .type se_handler,\@abi-omnipotent
  1184. .align 16
  1185. se_handler:
  1186. push %rsi
  1187. push %rdi
  1188. push %rbx
  1189. push %rbp
  1190. push %r12
  1191. push %r13
  1192. push %r14
  1193. push %r15
  1194. pushfq
  1195. sub \$64,%rsp
  1196. mov 120($context),%rax # pull context->Rax
  1197. mov 248($context),%rbx # pull context->Rip
  1198. mov 8($disp),%rsi # disp->ImageBase
  1199. mov 56($disp),%r11 # disp->HandlerData
  1200. mov 0(%r11),%r10d # HandlerData[0]
  1201. lea (%rsi,%r10),%r10 # prologue label
  1202. cmp %r10,%rbx # context->Rip<.Lprologue
  1203. jb .Lin_prologue
  1204. mov 152($context),%rax # pull context->Rsp
  1205. mov 4(%r11),%r10d # HandlerData[1]
  1206. lea (%rsi,%r10),%r10 # epilogue label
  1207. cmp %r10,%rbx # context->Rip>=.Lepilogue
  1208. jae .Lin_prologue
  1209. mov 16(%rax),%rax # pull saved stack pointer
  1210. mov -8(%rax),%rbx
  1211. mov -16(%rax),%rbp
  1212. mov -24(%rax),%r12
  1213. mov -32(%rax),%r13
  1214. mov -40(%rax),%r14
  1215. mov -48(%rax),%r15
  1216. mov %rbx,144($context) # restore context->Rbx
  1217. mov %rbp,160($context) # restore context->Rbp
  1218. mov %r12,216($context) # restore context->R12
  1219. mov %r13,224($context) # restore context->R13
  1220. mov %r14,232($context) # restore context->R14
  1221. mov %r15,240($context) # restore context->R15
  1222. lea -56-10*16(%rax),%rsi
  1223. lea 512($context),%rdi # &context.Xmm6
  1224. mov \$20,%ecx
  1225. .long 0xa548f3fc # cld; rep movsq
  1226. .Lin_prologue:
  1227. mov 8(%rax),%rdi
  1228. mov 16(%rax),%rsi
  1229. mov %rax,152($context) # restore context->Rsp
  1230. mov %rsi,168($context) # restore context->Rsi
  1231. mov %rdi,176($context) # restore context->Rdi
  1232. mov 40($disp),%rdi # disp->ContextRecord
  1233. mov $context,%rsi # context
  1234. mov \$154,%ecx # sizeof(CONTEXT)
  1235. .long 0xa548f3fc # cld; rep movsq
  1236. mov $disp,%rsi
  1237. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1238. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1239. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1240. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1241. mov 40(%rsi),%r10 # disp->ContextRecord
  1242. lea 56(%rsi),%r11 # &disp->HandlerData
  1243. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1244. mov %r10,32(%rsp) # arg5
  1245. mov %r11,40(%rsp) # arg6
  1246. mov %r12,48(%rsp) # arg7
  1247. mov %rcx,56(%rsp) # arg8, (NULL)
  1248. call *__imp_RtlVirtualUnwind(%rip)
  1249. mov \$1,%eax # ExceptionContinueSearch
  1250. add \$64,%rsp
  1251. popfq
  1252. pop %r15
  1253. pop %r14
  1254. pop %r13
  1255. pop %r12
  1256. pop %rbp
  1257. pop %rbx
  1258. pop %rdi
  1259. pop %rsi
  1260. ret
  1261. .size se_handler,.-se_handler
  1262. .section .pdata
  1263. .align 4
  1264. .rva .LSEH_begin_aesni_multi_cbc_encrypt
  1265. .rva .LSEH_end_aesni_multi_cbc_encrypt
  1266. .rva .LSEH_info_aesni_multi_cbc_encrypt
  1267. .rva .LSEH_begin_aesni_multi_cbc_decrypt
  1268. .rva .LSEH_end_aesni_multi_cbc_decrypt
  1269. .rva .LSEH_info_aesni_multi_cbc_decrypt
  1270. ___
  # Additional .pdata triples for the AVX routines, appended only when $avx
  # is set.
  1271. $code.=<<___ if ($avx);
  1272. .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
  1273. .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
  1274. .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
  1275. .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
  1276. .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
  1277. .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
  1278. ___
  # .xdata: one record per routine, consisting of a 4-byte header, the RVA
  # of se_handler and HandlerData[] = RVAs of the body and epilogue labels
  # that se_handler compares context->Rip against.
  # NOTE(review): the header bytes 9,0,0,0 presumably encode the unwind-info
  # version and handler flag -- confirm against the Win64 unwind data docs.
  1279. $code.=<<___;
  1280. .section .xdata
  1281. .align 8
  1282. .LSEH_info_aesni_multi_cbc_encrypt:
  1283. .byte 9,0,0,0
  1284. .rva se_handler
  1285. .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
  1286. .LSEH_info_aesni_multi_cbc_decrypt:
  1287. .byte 9,0,0,0
  1288. .rva se_handler
  1289. .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
  1290. ___
  # Matching .xdata records for the AVX routines, appended only when $avx
  # is set.
  1291. $code.=<<___ if ($avx);
  1292. .LSEH_info_aesni_multi_cbc_encrypt_avx:
  1293. .byte 9,0,0,0
  1294. .rva se_handler
  1295. .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
  1296. .LSEH_info_aesni_multi_cbc_decrypt_avx:
  1297. .byte 9,0,0,0
  1298. .rva se_handler
  1299. .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
  1300. ___
  1301. }
  1302. ####################################################################
  # rex(\@opcode, $dst, $src)
  #
  # Appends a REX prefix byte to the array referenced by the first argument
  # when at least one register number is 8 or above: bit 0x04 is set for
  # $dst >= 8, bit 0x01 for $src >= 8, and the result is OR-ed with 0x40.
  # When both register numbers are below 8 the array is left untouched.
  sub rex {
      my ($opcode_ref, $dst, $src) = @_;

      my $bits = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

      push @{$opcode_ref}, 0x40 | $bits if ($bits);
  }
  # aesni($line)
  #
  # Translates one textual AES-NI instruction into a ".byte" directive holding
  # its machine encoding.  Three operand forms are recognised:
  #
  #   aeskeygenassist $imm,%xmmS,%xmmD
  #   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
  #   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
  #
  # Returns the ".byte\t..." string on success.  Anything that cannot be
  # encoded here is returned unchanged so that the caller's s///e keeps the
  # instruction text instead of replacing it with an empty string:
  #   - lines matching none of the three forms,
  #   - aes* mnemonics missing from the opcode tables,
  #   - %rsp displacements that are not a number in 0..0x7f (the encoding
  #     below uses a single sign-extended displacement byte).
  sub aesni {
      my $line   = shift;
      my @opcode = (0x66);    # prefix byte shared by every encoding below

      if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
          my ($imm, $src, $dst) = ($2, $3, $4);

          rex(\@opcode, $dst, $src);
          push @opcode, 0x0f, 0x3a, 0xdf;
          push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
          # oct() handles both 0x-prefixed hex and leading-zero octal.
          push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
          return ".byte\t" . join(',', @opcode);
      }

      if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
          my ($mnemonic, $src, $dst) = ($1, $2, $3);
          my %opcodelet = (
              "aesimc" => 0xdb,
              "aesenc" => 0xdc, "aesenclast" => 0xdd,
              "aesdec" => 0xde, "aesdeclast" => 0xdf
          );

          # Unknown mnemonic: leave the instruction for the assembler.
          return $line if (!defined($opcodelet{$mnemonic}));

          rex(\@opcode, $dst, $src);
          push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
          push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
          return ".byte\t" . join(',', @opcode);
      }

      if ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
          my ($mnemonic, $off, $dst) = ($1, $2, $3);
          my %opcodelet = (
              "aesenc" => 0xdc, "aesenclast" => 0xdd,
              "aesdec" => 0xde, "aesdeclast" => 0xdf
          );

          # Unknown mnemonic: leave the instruction for the assembler.
          return $line if (!defined($opcodelet{$mnemonic}));

          # An absent displacement means 0(%rsp).
          my $disp = $off eq ''   ? 0
                   : $off =~ /^0/ ? oct($off)
                   :                $off;

          # The ModR/M+SIB form below carries an 8-bit signed displacement;
          # larger or non-numeric offsets would be encoded incorrectly.
          return $line if ($disp !~ /^[0-9]+$/ || $disp > 0x7f);

          push @opcode, 0x44 if ($dst >= 8);                      # REX
          push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
          push @opcode, 0x44 | (($dst & 7) << 3), 0x24;           # ModR/M, SIB
          push @opcode, $disp & 0xff;
          return ".byte\t" . join(',', @opcode);
      }

      return $line;
  }
  # Post-process the accumulated assembly text and write it out.
  #
  # 1. Every `...` span is replaced by the result of eval-ing its contents
  #    as Perl.
  $code =~ s/\`([^\`]*)\`/eval($1)/gem;

  # 2. Lines in which "aes" starts a word and an %xmm register follows are
  #    handed to aesni(); mnemonics with a leading letter such as vaesdec do
  #    not match because of the \b.  Text after the last %xmmN on the line
  #    is outside the capture and is therefore dropped.
  $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

  print $code;

  # A failed flush is only reported by close, so do not ignore its result.
  # NOTE(review): STDOUT may have been redirected earlier in the file (not
  # visible here); if it is a pipe, close also reports a failing child.
  close STDOUT or die "error closing STDOUT: $!";