aesni-mb-x86_64.pl 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504
  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # Multi-buffer AES-NI procedures process several independent buffers
  15. # in parallel by interleaving independent instructions.
  16. #
  17. # Cycles per byte for interleave factor 4:
  18. #
  19. # asymptotic measured
  20. # ---------------------------
  21. # Westmere 5.00/4=1.25 5.13/4=1.28
  22. # Atom 15.0/4=3.75 ?15.7/4=3.93
  23. # Sandy Bridge 5.06/4=1.27 5.18/4=1.29
  24. # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
  25. # Haswell 4.44/4=1.11 4.44/4=1.11
  26. # Bulldozer 5.75/4=1.44 5.76/4=1.44
  27. #
  28. # Cycles per byte for interleave factor 8 (not implemented for
  29. # pre-AVX processors, where higher interleave factor incidentally
  30. # doesn't result in improvement):
  31. #
  32. # asymptotic measured
  33. # ---------------------------
  34. # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
  35. # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
  36. # Haswell 5.00/8=0.63 5.00/8=0.63
  37. # Bulldozer 5.75/8=0.72 5.77/8=0.72
  38. #
  39. # (*) Sandy/Ivy Bridge are known to handle high interleave factors
  40. # suboptimally.
  41. # $output is the last argument if it looks like a file (it has an extension)
  42. # $flavour is the first argument if it doesn't look like a file
  # Generator setup: parse the command line, locate the perlasm translator,
  # probe the assembler for AVX support and pipe everything printed to STDOUT
  # through the translator.
  43. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  44. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
  # output file name ending in .asm.
  45. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # $dir is the directory part of this script's own path, separator included.
  46. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  # Look for the translator next to this script first, then in ../../perlasm.
  47. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  48. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  49. die "can't locate x86_64-xlate.pl";
  50. push(@INC,"${dir}","${dir}../../perlasm");
  51. require "x86_64-support.pl";
  # NOTE(review): pointer_size() is defined in x86_64-support.pl (not shown);
  # presumably it returns the pointer width in bytes for $flavour. The value
  # feeds the descriptor offset arithmetic below.
  52. $ptr_size=&pointer_size($flavour);
  # $avx stays 0 unless one of the probes below recognises the toolchain; each
  # probe yields 1 or 2 depending on which of its two version thresholds is met.
  # NOTE(review): within the code shown here only the truthiness of $avx is
  # tested (it gates emission of the AVX dispatch and the 8x code path).
  53. $avx=0;
  # GNU assembler, queried through the C compiler named by $ENV{CC}.
  54. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  55. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  56. $avx = ($1>=2.19) + ($1>=2.22);
  57. }
  # NASM, only consulted for Win64 builds when nothing matched so far.
  58. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  59. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  60. $avx = ($1>=2.09) + ($1>=2.10);
  61. }
  # Microsoft ml64, only consulted for Win64 builds when nothing matched so far.
  62. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  63. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  64. $avx = ($1>=10) + ($1>=11);
  65. }
  # clang / LLVM-based compilers; $2 is the captured major.minor version.
  66. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  67. $avx = ($2>=3.0) + ($2>3.0);
  68. }
  # Run the translator under the same perl binary ($^X) and alias STDOUT to the
  # pipe, so whatever this script prints is post-processed by x86_64-xlate.pl.
  69. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  70. or die "can't call $xlate: $!";
  71. *STDOUT=*OUT;
  72. # void aesni_multi_cbc_encrypt (
  73. # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
  74. # const AES_KEY *key,
  75. # int num); /* 1 or 2 */
  76. #
  # Register aliases shared by the 4x (non-AVX) encrypt and decrypt code below.
  77. $inp="%rdi"; # 1st arg
  78. $key="%rsi"; # 2nd arg
  79. $num="%edx"; # 3rd arg
  # Size in bytes of one descriptor: two pointers (inp, out), an 8-byte slot
  # holding the block count, and the 16-byte IV.
  80. $inp_elm_size=2*$ptr_size+8+16;
  # Per-lane input and output pointers live in %r8..%r11 and %r12..%r15.
  81. @inptr=map("%r$_",(8..11));
  82. @outptr=map("%r$_",(12..15));
  # Two round-key registers used alternately, four per-lane state registers
  # (@out) and four per-lane data registers (@inp).
  83. ($rndkey0,$rndkey1)=("%xmm0","%xmm1");
  84. @out=map("%xmm$_",(2..5));
  85. @inp=map("%xmm$_",(6..9));
  86. ($counters,$mask,$zero)=map("%xmm$_",(10..12));
  87. ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
  # Section directives and the entry label of aesni_multi_cbc_encrypt.
  88. $code.=<<___;
  89. .text
  90. .extern OPENSSL_ia32cap_P
  91. .globl aesni_multi_cbc_encrypt
  92. .type aesni_multi_cbc_encrypt,\@function,3
  93. .align 32
  94. aesni_multi_cbc_encrypt:
  95. .cfi_startproc
  96. ___
  # Prologue of aesni_multi_cbc_encrypt.
  #
  # Runtime dispatch, emitted only when the assembler can encode AVX ($avx is
  # non-zero): when $num is at least 2 and bit 28 of the dword at
  # OPENSSL_ia32cap_P+4 is set, control transfers to _avx_cbc_enc_shortcut;
  # otherwise execution continues with the 4x code at .Lenc_non_avx.
  97. $code.=<<___ if ($avx);
  98. cmp \$2,$num
  99. jb .Lenc_non_avx
  100. mov OPENSSL_ia32cap_P+4(%rip),%ecx
  101. test \$`1<<28`,%ecx # AVX bit
  102. jnz _avx_cbc_enc_shortcut
  103. jmp .Lenc_non_avx
  104. .align 16
  105. .Lenc_non_avx:
  106. ___
  # Keep the incoming %rsp in %rax and push the six general-purpose registers
  # that are restored in the epilogue.
  107. $code.=<<___;
  108. mov %rsp,%rax
  109. .cfi_def_cfa_register %rax
  110. push %rbx
  111. .cfi_push %rbx
  112. push %rbp
  113. .cfi_push %rbp
  114. push %r12
  115. .cfi_push %r12
  116. push %r13
  117. .cfi_push %r13
  118. push %r14
  119. .cfi_push %r14
  120. push %r15
  121. .cfi_push %r15
  122. ___
  # Win64 only: reserve 0xa8 bytes below the pushed registers and save
  # %xmm6..%xmm15 there (the last three are addressed relative to %rax).
  123. $code.=<<___ if ($win64);
  124. lea -0xa8(%rsp),%rsp
  125. movaps %xmm6,(%rsp)
  126. movaps %xmm7,0x10(%rsp)
  127. movaps %xmm8,0x20(%rsp)
  128. movaps %xmm9,0x30(%rsp)
  129. movaps %xmm10,0x40(%rsp)
  130. movaps %xmm11,0x50(%rsp)
  131. movaps %xmm12,0x60(%rsp)
  132. movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
  133. movaps %xmm14,-0x58(%rax)
  134. movaps %xmm15,-0x48(%rax)
  135. ___
  # Carve a 64-byte aligned scratch frame, remember the original %rsp at
  # 16(%rsp), load the round-0 key into $zero and bias $key by 0x78 and $inp by
  # two descriptors so that later accesses use offsets relative to those biased
  # bases. .Lenc4x_loop_grande then stores $num at 24(%rsp) and clears it so it
  # can be reused for the largest per-lane block count.
  136. $code.=<<___;
  137. # stack layout
  138. #
  139. # +0 output sink
  140. # +16 input sink [original %rsp and $num]
  141. # +32 counters
  142. sub \$48,%rsp
  143. and \$-64,%rsp
  144. mov %rax,16(%rsp) # original %rsp
  145. .cfi_cfa_expression %rsp+16,deref,+8
  146. .Lenc4x_body:
  147. movdqu ($key),$zero # 0-round key
  148. lea 0x78($key),$key # size optimization
  149. lea $inp_elm_size*2($inp),$inp
  150. .Lenc4x_loop_grande:
  151. mov $num,24(%rsp) # original $num
  152. xor $num,$num
  153. ___
  # Per-lane setup, unrolled four times at generation time. For lane $i the
  # emitted code loads the block count (into $one), the input and output
  # pointers and the IV (into @out[$i]) from descriptor $i, keeps the running
  # maximum of the block counts in $num, stores the count into the counter slot
  # at 32+4*$i(%rsp), and points the input pointer at %rsp when the count is
  # not positive so that later loads from that lane stay within the frame.
  154. for($i=0;$i<4;$i++) {
  # NOTE(review): pointer_register() comes from x86_64-support.pl (not shown);
  # presumably it yields the register name of pointer width for $flavour.
  155. $inptr_reg=&pointer_register($flavour,@inptr[$i]);
  156. $outptr_reg=&pointer_register($flavour,@outptr[$i]);
  157. $code.=<<___;
  158. # borrow $one for number of blocks
  159. mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
  160. mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
  161. cmp $num,$one
  162. mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
  163. cmovg $one,$num # find maximum
  164. test $one,$one
  165. # load IV
  166. movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
  167. mov $one,`32+4*$i`(%rsp) # initialize counters
  168. cmovle %rsp,@inptr[$i] # cancel input
  169. ___
  170. }
  # First-block setup and head of .Loop_enc4x.
  #
  # Leaves through .Lenc4x_done when the largest block count ($num) is zero.
  # Otherwise it loads the first two round keys and the round count, xors the
  # round-0 key ($zero) and the first input block of every lane into the IVs
  # held in @out[0..3], loads the lane counters and clears $offset.
  # Each loop iteration then advances $offset by 16, derives the sink pointer
  # (16(%rsp) minus $offset, so that indexing it with $offset lands inside the
  # frame), issues the first AES round and prefetches the next input of all
  # four lanes, @inptr[0] through @inptr[3], exactly as .Loop_dec4x does.
  171. $code.=<<___;
  172. test $num,$num
  173. jz .Lenc4x_done
  174. movups 0x10-0x78($key),$rndkey1
  175. pxor $zero,@out[0]
  176. movups 0x20-0x78($key),$rndkey0
  177. pxor $zero,@out[1]
  178. mov 0xf0-0x78($key),$rounds
  179. pxor $zero,@out[2]
  180. movdqu (@inptr[0]),@inp[0] # load inputs
  181. pxor $zero,@out[3]
  182. movdqu (@inptr[1]),@inp[1]
  183. pxor @inp[0],@out[0]
  184. movdqu (@inptr[2]),@inp[2]
  185. pxor @inp[1],@out[1]
  186. movdqu (@inptr[3]),@inp[3]
  187. pxor @inp[2],@out[2]
  188. pxor @inp[3],@out[3]
  189. movdqa 32(%rsp),$counters # load counters
  190. xor $offset,$offset
  191. jmp .Loop_enc4x
  192. .align 32
  193. .Loop_enc4x:
  194. add \$16,$offset
  195. lea 16(%rsp),$sink # sink pointer
  196. mov \$1,$one # constant of 1
  197. sub $offset,$sink
  198. aesenc $rndkey1,@out[0]
  199. prefetcht0 31(@inptr[0],$offset) # prefetch input
  200. prefetcht0 31(@inptr[1],$offset)
  201. aesenc $rndkey1,@out[1]
  202. prefetcht0 31(@inptr[2],$offset)
  203. prefetcht0 31(@inptr[3],$offset)
  204. aesenc $rndkey1,@out[2]
  205. aesenc $rndkey1,@out[3]
  206. movups 0x30-0x78($key),$rndkey1
  207. ___
  # Four further AES rounds, unrolled at generation time, alternating between
  # the two round-key registers. Iteration $i also compares the constant 1 in
  # $one with lane $i's counter slot: the input pointer is redirected to $sink
  # when 1 >= counter and the output pointer when 1 > counter, so lanes that
  # have run out of blocks read from and write to the stack frame instead of
  # caller memory. Each iteration ends by loading the next round key into the
  # register it has just used.
  208. for($i=0;$i<4;$i++) {
  209. my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
  210. $code.=<<___;
  211. cmp `32+4*$i`(%rsp),$one
  212. aesenc $rndkey,@out[0]
  213. aesenc $rndkey,@out[1]
  214. aesenc $rndkey,@out[2]
  215. cmovge $sink,@inptr[$i] # cancel input
  216. cmovg $sink,@outptr[$i] # sink output
  217. aesenc $rndkey,@out[3]
  218. movups `0x40+16*$i-0x78`($key),$rndkey
  219. ___
  220. }
  # Remainder of .Loop_enc4x and the outer "grande" loop.
  #
  # * Counter update: $mask becomes all-ones in every dword whose counter is
  #   greater than zero (pcmpgtd against a zeroed $zero) and is added to
  #   $counters, i.e. positive counters are decremented; $zero is reloaded with
  #   the round-0 key right afterwards.
  # * Key-length dispatch: one "cmp \$11,$rounds" feeds both the jb and the je
  #   that branch to .Lenc4x_tail; the SSE/AES instructions in between do not
  #   modify EFLAGS. Longer key schedules run two or four extra rounds first.
  # * .Lenc4x_tail: finishes the block with aesenclast, stores the ciphertext
  #   at -16(outptr,$offset), and xors the next input block together with the
  #   round-0 key into the ciphertext, which seeds the next block's state.
  # * After $num inner iterations the original %rsp and $num are reloaded, $inp
  #   advances by four descriptors and the outer loop repeats while the
  #   decremented $num is non-zero.
  # The commented-out "output iv" stores are kept exactly as found.
  221. $code.=<<___;
  222. movdqa $counters,$mask
  223. aesenc $rndkey0,@out[0]
  224. prefetcht0 15(@outptr[0],$offset) # prefetch output
  225. prefetcht0 15(@outptr[1],$offset)
  226. aesenc $rndkey0,@out[1]
  227. prefetcht0 15(@outptr[2],$offset)
  228. prefetcht0 15(@outptr[3],$offset)
  229. aesenc $rndkey0,@out[2]
  230. aesenc $rndkey0,@out[3]
  231. movups 0x80-0x78($key),$rndkey0
  232. pxor $zero,$zero
  233. aesenc $rndkey1,@out[0]
  234. pcmpgtd $zero,$mask
  235. movdqu -0x78($key),$zero # reload 0-round key
  236. aesenc $rndkey1,@out[1]
  237. paddd $mask,$counters # decrement counters
  238. movdqa $counters,32(%rsp) # update counters
  239. aesenc $rndkey1,@out[2]
  240. aesenc $rndkey1,@out[3]
  241. movups 0x90-0x78($key),$rndkey1
  242. cmp \$11,$rounds
  243. aesenc $rndkey0,@out[0]
  244. aesenc $rndkey0,@out[1]
  245. aesenc $rndkey0,@out[2]
  246. aesenc $rndkey0,@out[3]
  247. movups 0xa0-0x78($key),$rndkey0
  248. jb .Lenc4x_tail
  249. aesenc $rndkey1,@out[0]
  250. aesenc $rndkey1,@out[1]
  251. aesenc $rndkey1,@out[2]
  252. aesenc $rndkey1,@out[3]
  253. movups 0xb0-0x78($key),$rndkey1
  254. aesenc $rndkey0,@out[0]
  255. aesenc $rndkey0,@out[1]
  256. aesenc $rndkey0,@out[2]
  257. aesenc $rndkey0,@out[3]
  258. movups 0xc0-0x78($key),$rndkey0
  259. je .Lenc4x_tail
  260. aesenc $rndkey1,@out[0]
  261. aesenc $rndkey1,@out[1]
  262. aesenc $rndkey1,@out[2]
  263. aesenc $rndkey1,@out[3]
  264. movups 0xd0-0x78($key),$rndkey1
  265. aesenc $rndkey0,@out[0]
  266. aesenc $rndkey0,@out[1]
  267. aesenc $rndkey0,@out[2]
  268. aesenc $rndkey0,@out[3]
  269. movups 0xe0-0x78($key),$rndkey0
  270. jmp .Lenc4x_tail
  271. .align 32
  272. .Lenc4x_tail:
  273. aesenc $rndkey1,@out[0]
  274. aesenc $rndkey1,@out[1]
  275. aesenc $rndkey1,@out[2]
  276. aesenc $rndkey1,@out[3]
  277. movdqu (@inptr[0],$offset),@inp[0]
  278. movdqu 0x10-0x78($key),$rndkey1
  279. aesenclast $rndkey0,@out[0]
  280. movdqu (@inptr[1],$offset),@inp[1]
  281. pxor $zero,@inp[0]
  282. aesenclast $rndkey0,@out[1]
  283. movdqu (@inptr[2],$offset),@inp[2]
  284. pxor $zero,@inp[1]
  285. aesenclast $rndkey0,@out[2]
  286. movdqu (@inptr[3],$offset),@inp[3]
  287. pxor $zero,@inp[2]
  288. aesenclast $rndkey0,@out[3]
  289. movdqu 0x20-0x78($key),$rndkey0
  290. pxor $zero,@inp[3]
  291. movups @out[0],-16(@outptr[0],$offset)
  292. pxor @inp[0],@out[0]
  293. movups @out[1],-16(@outptr[1],$offset)
  294. pxor @inp[1],@out[1]
  295. movups @out[2],-16(@outptr[2],$offset)
  296. pxor @inp[2],@out[2]
  297. movups @out[3],-16(@outptr[3],$offset)
  298. pxor @inp[3],@out[3]
  299. dec $num
  300. jnz .Loop_enc4x
  301. mov 16(%rsp),%rax # original %rsp
  302. .cfi_def_cfa %rax,8
  303. mov 24(%rsp),$num
  304. #pxor @inp[0],@out[0]
  305. #pxor @inp[1],@out[1]
  306. # output iv FIX ME!
  307. #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
  308. #pxor @inp[2],@out[2]
  309. #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
  310. #pxor @inp[3],@out[3]
  311. #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp) # won't fix, let caller
  312. #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp) # figure this out...
  313. lea `$inp_elm_size*4`($inp),$inp
  314. dec $num
  315. jnz .Lenc4x_loop_grande
  316. .Lenc4x_done:
  317. ___
  # Epilogue of aesni_multi_cbc_encrypt; %rax holds the original %rsp here.
  #
  # Win64 only: reload %xmm6..%xmm12 from the save area, addressed relative to
  # the original %rsp. %xmm13..%xmm15 were saved in the prologue but are not
  # used by the 4x code, so their restores stay commented out.
  318. $code.=<<___ if ($win64);
  319. movaps -0xd8(%rax),%xmm6
  320. movaps -0xc8(%rax),%xmm7
  321. movaps -0xb8(%rax),%xmm8
  322. movaps -0xa8(%rax),%xmm9
  323. movaps -0x98(%rax),%xmm10
  324. movaps -0x88(%rax),%xmm11
  325. movaps -0x78(%rax),%xmm12
  326. #movaps -0x68(%rax),%xmm13
  327. #movaps -0x58(%rax),%xmm14
  328. #movaps -0x48(%rax),%xmm15
  329. ___
  # Reload the six pushed general-purpose registers from just below the
  # original %rsp, restore %rsp and return. The same heredoc then opens
  # aesni_multi_cbc_decrypt (directives and entry label).
  330. $code.=<<___;
  331. mov -48(%rax),%r15
  332. .cfi_restore %r15
  333. mov -40(%rax),%r14
  334. .cfi_restore %r14
  335. mov -32(%rax),%r13
  336. .cfi_restore %r13
  337. mov -24(%rax),%r12
  338. .cfi_restore %r12
  339. mov -16(%rax),%rbp
  340. .cfi_restore %rbp
  341. mov -8(%rax),%rbx
  342. .cfi_restore %rbx
  343. lea (%rax),%rsp
  344. .cfi_def_cfa_register %rsp
  345. .Lenc4x_epilogue:
  346. ret
  347. .cfi_endproc
  348. .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
  349. .globl aesni_multi_cbc_decrypt
  350. .type aesni_multi_cbc_decrypt,\@function,3
  351. .align 32
  352. aesni_multi_cbc_decrypt:
  353. .cfi_startproc
  354. ___
  # Prologue of aesni_multi_cbc_decrypt; same structure as the encrypt prologue.
  #
  # Runtime dispatch, emitted only when the assembler can encode AVX ($avx is
  # non-zero): when $num is at least 2 and bit 28 of the dword at
  # OPENSSL_ia32cap_P+4 is set, control transfers to _avx_cbc_dec_shortcut;
  # otherwise execution continues with the 4x code at .Ldec_non_avx.
  355. $code.=<<___ if ($avx);
  356. cmp \$2,$num
  357. jb .Ldec_non_avx
  358. mov OPENSSL_ia32cap_P+4(%rip),%ecx
  359. test \$`1<<28`,%ecx # AVX bit
  360. jnz _avx_cbc_dec_shortcut
  361. jmp .Ldec_non_avx
  362. .align 16
  363. .Ldec_non_avx:
  364. ___
  # Keep the incoming %rsp in %rax and push the six general-purpose registers
  # that are restored in the epilogue.
  365. $code.=<<___;
  366. mov %rsp,%rax
  367. .cfi_def_cfa_register %rax
  368. push %rbx
  369. .cfi_push %rbx
  370. push %rbp
  371. .cfi_push %rbp
  372. push %r12
  373. .cfi_push %r12
  374. push %r13
  375. .cfi_push %r13
  376. push %r14
  377. .cfi_push %r14
  378. push %r15
  379. .cfi_push %r15
  380. ___
  # Win64 only: reserve 0xa8 bytes below the pushed registers and save
  # %xmm6..%xmm15 there (the last three are addressed relative to %rax).
  381. $code.=<<___ if ($win64);
  382. lea -0xa8(%rsp),%rsp
  383. movaps %xmm6,(%rsp)
  384. movaps %xmm7,0x10(%rsp)
  385. movaps %xmm8,0x20(%rsp)
  386. movaps %xmm9,0x30(%rsp)
  387. movaps %xmm10,0x40(%rsp)
  388. movaps %xmm11,0x50(%rsp)
  389. movaps %xmm12,0x60(%rsp)
  390. movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
  391. movaps %xmm14,-0x58(%rax)
  392. movaps %xmm15,-0x48(%rax)
  393. ___
  # Carve a 64-byte aligned scratch frame, remember the original %rsp at
  # 16(%rsp), load the round-0 key into $zero and bias $key by 0x78 and $inp by
  # two descriptors. .Ldec4x_loop_grande then stores $num at 24(%rsp) and
  # clears it so it can be reused for the largest per-lane block count.
  394. $code.=<<___;
  395. # stack layout
  396. #
  397. # +0 output sink
  398. # +16 input sink [original %rsp and $num]
  399. # +32 counters
  400. sub \$48,%rsp
  401. and \$-64,%rsp
  402. mov %rax,16(%rsp) # original %rsp
  403. .cfi_cfa_expression %rsp+16,deref,+8
  404. .Ldec4x_body:
  405. movdqu ($key),$zero # 0-round key
  406. lea 0x78($key),$key # size optimization
  407. lea $inp_elm_size*2($inp),$inp
  408. .Ldec4x_loop_grande:
  409. mov $num,24(%rsp) # original $num
  410. xor $num,$num
  411. ___
  # Per-lane setup for decryption, unrolled four times at generation time.
  # Identical to the encrypt variant except that the IV of lane $i is loaded
  # into @inp[$i] rather than @out[$i]: the block count goes to $one and to the
  # counter slot at 32+4*$i(%rsp), the running maximum is kept in $num, and the
  # input pointer is pointed at %rsp when the count is not positive.
  412. for($i=0;$i<4;$i++) {
  # NOTE(review): pointer_register() comes from x86_64-support.pl (not shown);
  # presumably it yields the register name of pointer width for $flavour.
  413. $inptr_reg=&pointer_register($flavour,@inptr[$i]);
  414. $outptr_reg=&pointer_register($flavour,@outptr[$i]);
  415. $code.=<<___;
  416. # borrow $one for number of blocks
  417. mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
  418. mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
  419. cmp $num,$one
  420. mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
  421. cmovg $one,$num # find maximum
  422. test $one,$one
  423. # load IV
  424. movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
  425. mov $one,`32+4*$i`(%rsp) # initialize counters
  426. cmovle %rsp,@inptr[$i] # cancel input
  427. ___
  428. }
  # First-block setup and head of .Loop_dec4x.
  #
  # Leaves through .Ldec4x_done when the largest block count ($num) is zero.
  # Otherwise it loads the first two round keys and the round count, loads the
  # first input block of every lane into @out[0..3] and xors the round-0 key
  # ($zero) into it, loads the lane counters and clears $offset.
  # Each loop iteration advances $offset by 16, derives the sink pointer
  # (16(%rsp) minus $offset), issues the first AES round and prefetches the
  # next input of all four lanes.
  429. $code.=<<___;
  430. test $num,$num
  431. jz .Ldec4x_done
  432. movups 0x10-0x78($key),$rndkey1
  433. movups 0x20-0x78($key),$rndkey0
  434. mov 0xf0-0x78($key),$rounds
  435. movdqu (@inptr[0]),@out[0] # load inputs
  436. movdqu (@inptr[1]),@out[1]
  437. pxor $zero,@out[0]
  438. movdqu (@inptr[2]),@out[2]
  439. pxor $zero,@out[1]
  440. movdqu (@inptr[3]),@out[3]
  441. pxor $zero,@out[2]
  442. pxor $zero,@out[3]
  443. movdqa 32(%rsp),$counters # load counters
  444. xor $offset,$offset
  445. jmp .Loop_dec4x
  446. .align 32
  447. .Loop_dec4x:
  448. add \$16,$offset
  449. lea 16(%rsp),$sink # sink pointer
  450. mov \$1,$one # constant of 1
  451. sub $offset,$sink
  452. aesdec $rndkey1,@out[0]
  453. prefetcht0 31(@inptr[0],$offset) # prefetch input
  454. prefetcht0 31(@inptr[1],$offset)
  455. aesdec $rndkey1,@out[1]
  456. prefetcht0 31(@inptr[2],$offset)
  457. prefetcht0 31(@inptr[3],$offset)
  458. aesdec $rndkey1,@out[2]
  459. aesdec $rndkey1,@out[3]
  460. movups 0x30-0x78($key),$rndkey1
  461. ___
  # Four further AES decryption rounds, unrolled at generation time,
  # alternating between the two round-key registers. Iteration $i also compares
  # the constant 1 in $one with lane $i's counter slot: the input pointer is
  # redirected to $sink when 1 >= counter and the output pointer when
  # 1 > counter, so exhausted lanes touch only the stack frame. Each iteration
  # ends by loading the next round key into the register it has just used.
  462. for($i=0;$i<4;$i++) {
  463. my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
  464. $code.=<<___;
  465. cmp `32+4*$i`(%rsp),$one
  466. aesdec $rndkey,@out[0]
  467. aesdec $rndkey,@out[1]
  468. aesdec $rndkey,@out[2]
  469. cmovge $sink,@inptr[$i] # cancel input
  470. cmovg $sink,@outptr[$i] # sink output
  471. aesdec $rndkey,@out[3]
  472. movups `0x40+16*$i-0x78`($key),$rndkey
  473. ___
  474. }
  # Remainder of .Loop_dec4x and the outer "grande" loop.
  #
  # * Counter update: $mask becomes all-ones in every dword whose counter is
  #   greater than zero (pcmpgtd against a zeroed $zero) and is added to
  #   $counters, i.e. positive counters are decremented; $zero is reloaded with
  #   the round-0 key right afterwards.
  # * Key-length dispatch: one "cmp \$11,$rounds" feeds both the jb and the je
  #   that branch to .Ldec4x_tail; the SSE/AES instructions in between do not
  #   modify EFLAGS. Longer key schedules run two or four extra rounds first.
  # * .Ldec4x_tail: the last round key held in $rndkey0 is xored into @inp (the
  #   IV or previous ciphertext block) and the result is used as the aesdeclast
  #   operand, which applies the CBC chaining xor as part of the final round.
  #   The just-consumed ciphertext at -16(inptr,$offset) is then loaded as the
  #   next IV, the plaintext is stored at -16(outptr,$offset), and the next
  #   input block is loaded and xored with the round-0 key.
  # * After $num inner iterations the original %rsp and $num are reloaded, $inp
  #   advances by four descriptors and the outer loop repeats while the
  #   decremented $num is non-zero.
  475. $code.=<<___;
  476. movdqa $counters,$mask
  477. aesdec $rndkey0,@out[0]
  478. prefetcht0 15(@outptr[0],$offset) # prefetch output
  479. prefetcht0 15(@outptr[1],$offset)
  480. aesdec $rndkey0,@out[1]
  481. prefetcht0 15(@outptr[2],$offset)
  482. prefetcht0 15(@outptr[3],$offset)
  483. aesdec $rndkey0,@out[2]
  484. aesdec $rndkey0,@out[3]
  485. movups 0x80-0x78($key),$rndkey0
  486. pxor $zero,$zero
  487. aesdec $rndkey1,@out[0]
  488. pcmpgtd $zero,$mask
  489. movdqu -0x78($key),$zero # reload 0-round key
  490. aesdec $rndkey1,@out[1]
  491. paddd $mask,$counters # decrement counters
  492. movdqa $counters,32(%rsp) # update counters
  493. aesdec $rndkey1,@out[2]
  494. aesdec $rndkey1,@out[3]
  495. movups 0x90-0x78($key),$rndkey1
  496. cmp \$11,$rounds
  497. aesdec $rndkey0,@out[0]
  498. aesdec $rndkey0,@out[1]
  499. aesdec $rndkey0,@out[2]
  500. aesdec $rndkey0,@out[3]
  501. movups 0xa0-0x78($key),$rndkey0
  502. jb .Ldec4x_tail
  503. aesdec $rndkey1,@out[0]
  504. aesdec $rndkey1,@out[1]
  505. aesdec $rndkey1,@out[2]
  506. aesdec $rndkey1,@out[3]
  507. movups 0xb0-0x78($key),$rndkey1
  508. aesdec $rndkey0,@out[0]
  509. aesdec $rndkey0,@out[1]
  510. aesdec $rndkey0,@out[2]
  511. aesdec $rndkey0,@out[3]
  512. movups 0xc0-0x78($key),$rndkey0
  513. je .Ldec4x_tail
  514. aesdec $rndkey1,@out[0]
  515. aesdec $rndkey1,@out[1]
  516. aesdec $rndkey1,@out[2]
  517. aesdec $rndkey1,@out[3]
  518. movups 0xd0-0x78($key),$rndkey1
  519. aesdec $rndkey0,@out[0]
  520. aesdec $rndkey0,@out[1]
  521. aesdec $rndkey0,@out[2]
  522. aesdec $rndkey0,@out[3]
  523. movups 0xe0-0x78($key),$rndkey0
  524. jmp .Ldec4x_tail
  525. .align 32
  526. .Ldec4x_tail:
  527. aesdec $rndkey1,@out[0]
  528. aesdec $rndkey1,@out[1]
  529. aesdec $rndkey1,@out[2]
  530. pxor $rndkey0,@inp[0]
  531. pxor $rndkey0,@inp[1]
  532. aesdec $rndkey1,@out[3]
  533. movdqu 0x10-0x78($key),$rndkey1
  534. pxor $rndkey0,@inp[2]
  535. pxor $rndkey0,@inp[3]
  536. movdqu 0x20-0x78($key),$rndkey0
  537. aesdeclast @inp[0],@out[0]
  538. aesdeclast @inp[1],@out[1]
  539. movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
  540. movdqu -16(@inptr[1],$offset),@inp[1]
  541. aesdeclast @inp[2],@out[2]
  542. aesdeclast @inp[3],@out[3]
  543. movdqu -16(@inptr[2],$offset),@inp[2]
  544. movdqu -16(@inptr[3],$offset),@inp[3]
  545. movups @out[0],-16(@outptr[0],$offset)
  546. movdqu (@inptr[0],$offset),@out[0]
  547. movups @out[1],-16(@outptr[1],$offset)
  548. movdqu (@inptr[1],$offset),@out[1]
  549. pxor $zero,@out[0]
  550. movups @out[2],-16(@outptr[2],$offset)
  551. movdqu (@inptr[2],$offset),@out[2]
  552. pxor $zero,@out[1]
  553. movups @out[3],-16(@outptr[3],$offset)
  554. movdqu (@inptr[3],$offset),@out[3]
  555. pxor $zero,@out[2]
  556. pxor $zero,@out[3]
  557. dec $num
  558. jnz .Loop_dec4x
  559. mov 16(%rsp),%rax # original %rsp
  560. .cfi_def_cfa %rax,8
  561. mov 24(%rsp),$num
  562. lea `$inp_elm_size*4`($inp),$inp
  563. dec $num
  564. jnz .Ldec4x_loop_grande
  565. .Ldec4x_done:
  566. ___
  # Epilogue of aesni_multi_cbc_decrypt; %rax holds the original %rsp here.
  #
  # Win64 only: reload %xmm6..%xmm12 from the save area, addressed relative to
  # the original %rsp. %xmm13..%xmm15 were saved in the prologue but are not
  # used by the 4x code, so their restores stay commented out.
  567. $code.=<<___ if ($win64);
  568. movaps -0xd8(%rax),%xmm6
  569. movaps -0xc8(%rax),%xmm7
  570. movaps -0xb8(%rax),%xmm8
  571. movaps -0xa8(%rax),%xmm9
  572. movaps -0x98(%rax),%xmm10
  573. movaps -0x88(%rax),%xmm11
  574. movaps -0x78(%rax),%xmm12
  575. #movaps -0x68(%rax),%xmm13
  576. #movaps -0x58(%rax),%xmm14
  577. #movaps -0x48(%rax),%xmm15
  578. ___
  # Reload the six pushed general-purpose registers from just below the
  # original %rsp, restore %rsp and return.
  579. $code.=<<___;
  580. mov -48(%rax),%r15
  581. .cfi_restore %r15
  582. mov -40(%rax),%r14
  583. .cfi_restore %r14
  584. mov -32(%rax),%r13
  585. .cfi_restore %r13
  586. mov -24(%rax),%r12
  587. .cfi_restore %r12
  588. mov -16(%rax),%rbp
  589. .cfi_restore %rbp
  590. mov -8(%rax),%rbx
  591. .cfi_restore %rbx
  592. lea (%rax),%rsp
  593. .cfi_def_cfa_register %rsp
  594. .Ldec4x_epilogue:
  595. ret
  596. .cfi_endproc
  597. .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
  598. ___
  599. if ($avx) {{{
  600. my @ptr=map("%r$_",(8..15));
  601. my $offload=$sink;
  602. my @out=map("%xmm$_",(2..9));
  603. my @inp=map("%xmm$_",(10..13));
  604. my ($counters,$zero)=("%xmm14","%xmm15");
  605. $code.=<<___;
  606. .type aesni_multi_cbc_encrypt_avx,\@function,3
  607. .align 32
  608. aesni_multi_cbc_encrypt_avx:
  609. .cfi_startproc
  610. _avx_cbc_enc_shortcut:
  611. mov %rsp,%rax
  612. .cfi_def_cfa_register %rax
  613. push %rbx
  614. .cfi_push %rbx
  615. push %rbp
  616. .cfi_push %rbp
  617. push %r12
  618. .cfi_push %r12
  619. push %r13
  620. .cfi_push %r13
  621. push %r14
  622. .cfi_push %r14
  623. push %r15
  624. .cfi_push %r15
  625. ___
  626. $code.=<<___ if ($win64);
  627. lea -0xa8(%rsp),%rsp
  628. movaps %xmm6,(%rsp)
  629. movaps %xmm7,0x10(%rsp)
  630. movaps %xmm8,0x20(%rsp)
  631. movaps %xmm9,0x30(%rsp)
  632. movaps %xmm10,0x40(%rsp)
  633. movaps %xmm11,0x50(%rsp)
  634. movaps %xmm12,-0x78(%rax)
  635. movaps %xmm13,-0x68(%rax)
  636. movaps %xmm14,-0x58(%rax)
  637. movaps %xmm15,-0x48(%rax)
  638. ___
  639. $code.=<<___;
  640. # stack layout
  641. #
  642. # +0 output sink
  643. # +16 input sink [original %rsp and $num]
  644. # +32 counters
  645. # +64 distances between inputs and outputs
  646. # +128 off-load area for @inp[0..3]
  647. sub \$192,%rsp
  648. and \$-128,%rsp
  649. mov %rax,16(%rsp) # original %rsp
  650. .cfi_cfa_expression %rsp+16,deref,+8
  651. .Lenc8x_body:
  652. vzeroupper
  653. vmovdqu ($key),$zero # 0-round key
  654. lea 0x78($key),$key # size optimization
  655. lea `$inp_elm_size*4`($inp),$inp
  656. shr \$1,$num
  657. .Lenc8x_loop_grande:
  658. #mov $num,24(%rsp) # original $num
  659. xor $num,$num
  660. ___
  661. for($i=0;$i<8;$i++) {
  662. my $temp = $i ? $offload : $offset;
  663. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  664. $temp_reg=&pointer_register($flavour,$temp);
  665. $code.=<<___;
  666. # borrow $one for number of blocks
  667. mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
  668. # input pointer
  669. mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
  670. cmp $num,$one
  671. # output pointer
  672. mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
  673. cmovg $one,$num # find maximum
  674. test $one,$one
  675. # load IV
  676. vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
  677. mov $one,`32+4*$i`(%rsp) # initialize counters
  678. cmovle %rsp,@ptr[$i] # cancel input
  679. sub @ptr[$i],$temp # distance between input and output
  680. mov $temp,`64+8*$i`(%rsp) # initialize distances
  681. ___
  682. }
  683. $code.=<<___;
  684. test $num,$num
  685. jz .Lenc8x_done
  686. vmovups 0x10-0x78($key),$rndkey1
  687. vmovups 0x20-0x78($key),$rndkey0
  688. mov 0xf0-0x78($key),$rounds
  689. vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
  690. lea 128(%rsp),$offload # offload area
  691. vpxor (@ptr[1]),$zero,@inp[1]
  692. vpxor (@ptr[2]),$zero,@inp[2]
  693. vpxor (@ptr[3]),$zero,@inp[3]
  694. vpxor @inp[0],@out[0],@out[0]
  695. vpxor (@ptr[4]),$zero,@inp[0]
  696. vpxor @inp[1],@out[1],@out[1]
  697. vpxor (@ptr[5]),$zero,@inp[1]
  698. vpxor @inp[2],@out[2],@out[2]
  699. vpxor (@ptr[6]),$zero,@inp[2]
  700. vpxor @inp[3],@out[3],@out[3]
  701. vpxor (@ptr[7]),$zero,@inp[3]
  702. vpxor @inp[0],@out[4],@out[4]
  703. mov \$1,$one # constant of 1
  704. vpxor @inp[1],@out[5],@out[5]
  705. vpxor @inp[2],@out[6],@out[6]
  706. vpxor @inp[3],@out[7],@out[7]
  707. jmp .Loop_enc8x
  708. .align 32
  709. .Loop_enc8x:
  710. ___
  711. for($i=0;$i<8;$i++) {
  712. my $rndkey=($i&1)?$rndkey0:$rndkey1;
  713. $code.=<<___;
  714. vaesenc $rndkey,@out[0],@out[0]
  715. cmp 32+4*$i(%rsp),$one
  716. ___
  717. $code.=<<___ if ($i);
  718. mov 64+8*$i(%rsp),$offset
  719. ___
  720. $code.=<<___;
  721. vaesenc $rndkey,@out[1],@out[1]
  722. prefetcht0 31(@ptr[$i]) # prefetch input
  723. vaesenc $rndkey,@out[2],@out[2]
  724. ___
  725. $code.=<<___ if ($i>1);
  726. prefetcht0 15(@ptr[$i-2]) # prefetch output
  727. ___
  728. $code.=<<___;
  729. vaesenc $rndkey,@out[3],@out[3]
  730. lea (@ptr[$i],$offset),$offset
  731. cmovge %rsp,@ptr[$i] # cancel input
  732. vaesenc $rndkey,@out[4],@out[4]
  733. cmovg %rsp,$offset # sink output
  734. vaesenc $rndkey,@out[5],@out[5]
  735. sub @ptr[$i],$offset
  736. vaesenc $rndkey,@out[6],@out[6]
  737. vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
  738. mov $offset,64+8*$i(%rsp)
  739. vaesenc $rndkey,@out[7],@out[7]
  740. vmovups `16*(3+$i)-0x78`($key),$rndkey
  741. lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
  742. ___
  743. $code.=<<___ if ($i<4)
  744. vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
  745. ___
  746. }
  747. $code.=<<___;
  748. vmovdqu 32(%rsp),$counters
  749. prefetcht0 15(@ptr[$i-2]) # prefetch output
  750. prefetcht0 15(@ptr[$i-1])
  751. cmp \$11,$rounds
  752. jb .Lenc8x_tail
  753. vaesenc $rndkey1,@out[0],@out[0]
  754. vaesenc $rndkey1,@out[1],@out[1]
  755. vaesenc $rndkey1,@out[2],@out[2]
  756. vaesenc $rndkey1,@out[3],@out[3]
  757. vaesenc $rndkey1,@out[4],@out[4]
  758. vaesenc $rndkey1,@out[5],@out[5]
  759. vaesenc $rndkey1,@out[6],@out[6]
  760. vaesenc $rndkey1,@out[7],@out[7]
  761. vmovups 0xb0-0x78($key),$rndkey1
  762. vaesenc $rndkey0,@out[0],@out[0]
  763. vaesenc $rndkey0,@out[1],@out[1]
  764. vaesenc $rndkey0,@out[2],@out[2]
  765. vaesenc $rndkey0,@out[3],@out[3]
  766. vaesenc $rndkey0,@out[4],@out[4]
  767. vaesenc $rndkey0,@out[5],@out[5]
  768. vaesenc $rndkey0,@out[6],@out[6]
  769. vaesenc $rndkey0,@out[7],@out[7]
  770. vmovups 0xc0-0x78($key),$rndkey0
  771. je .Lenc8x_tail
  772. vaesenc $rndkey1,@out[0],@out[0]
  773. vaesenc $rndkey1,@out[1],@out[1]
  774. vaesenc $rndkey1,@out[2],@out[2]
  775. vaesenc $rndkey1,@out[3],@out[3]
  776. vaesenc $rndkey1,@out[4],@out[4]
  777. vaesenc $rndkey1,@out[5],@out[5]
  778. vaesenc $rndkey1,@out[6],@out[6]
  779. vaesenc $rndkey1,@out[7],@out[7]
  780. vmovups 0xd0-0x78($key),$rndkey1
  781. vaesenc $rndkey0,@out[0],@out[0]
  782. vaesenc $rndkey0,@out[1],@out[1]
  783. vaesenc $rndkey0,@out[2],@out[2]
  784. vaesenc $rndkey0,@out[3],@out[3]
  785. vaesenc $rndkey0,@out[4],@out[4]
  786. vaesenc $rndkey0,@out[5],@out[5]
  787. vaesenc $rndkey0,@out[6],@out[6]
  788. vaesenc $rndkey0,@out[7],@out[7]
  789. vmovups 0xe0-0x78($key),$rndkey0
  790. .Lenc8x_tail:
  791. vaesenc $rndkey1,@out[0],@out[0]
  792. vpxor $zero,$zero,$zero
  793. vaesenc $rndkey1,@out[1],@out[1]
  794. vaesenc $rndkey1,@out[2],@out[2]
  795. vpcmpgtd $zero,$counters,$zero
  796. vaesenc $rndkey1,@out[3],@out[3]
  797. vaesenc $rndkey1,@out[4],@out[4]
  798. vpaddd $counters,$zero,$zero # decrement counters
  799. vmovdqu 48(%rsp),$counters
  800. vaesenc $rndkey1,@out[5],@out[5]
  801. mov 64(%rsp),$offset # pre-load 1st offset
  802. vaesenc $rndkey1,@out[6],@out[6]
  803. vaesenc $rndkey1,@out[7],@out[7]
  804. vmovups 0x10-0x78($key),$rndkey1
  805. vaesenclast $rndkey0,@out[0],@out[0]
  806. vmovdqa $zero,32(%rsp) # update counters
  807. vpxor $zero,$zero,$zero
  808. vaesenclast $rndkey0,@out[1],@out[1]
  809. vaesenclast $rndkey0,@out[2],@out[2]
  810. vpcmpgtd $zero,$counters,$zero
  811. vaesenclast $rndkey0,@out[3],@out[3]
  812. vaesenclast $rndkey0,@out[4],@out[4]
  813. vpaddd $zero,$counters,$counters # decrement counters
  814. vmovdqu -0x78($key),$zero # 0-round
  815. vaesenclast $rndkey0,@out[5],@out[5]
  816. vaesenclast $rndkey0,@out[6],@out[6]
  817. vmovdqa $counters,48(%rsp) # update counters
  818. vaesenclast $rndkey0,@out[7],@out[7]
  819. vmovups 0x20-0x78($key),$rndkey0
  820. vmovups @out[0],-16(@ptr[0]) # write output
  821. sub $offset,@ptr[0] # switch to input
  822. vpxor 0x00($offload),@out[0],@out[0]
  823. vmovups @out[1],-16(@ptr[1])
  824. sub `64+1*8`(%rsp),@ptr[1]
  825. vpxor 0x10($offload),@out[1],@out[1]
  826. vmovups @out[2],-16(@ptr[2])
  827. sub `64+2*8`(%rsp),@ptr[2]
  828. vpxor 0x20($offload),@out[2],@out[2]
  829. vmovups @out[3],-16(@ptr[3])
  830. sub `64+3*8`(%rsp),@ptr[3]
  831. vpxor 0x30($offload),@out[3],@out[3]
  832. vmovups @out[4],-16(@ptr[4])
  833. sub `64+4*8`(%rsp),@ptr[4]
  834. vpxor @inp[0],@out[4],@out[4]
  835. vmovups @out[5],-16(@ptr[5])
  836. sub `64+5*8`(%rsp),@ptr[5]
  837. vpxor @inp[1],@out[5],@out[5]
  838. vmovups @out[6],-16(@ptr[6])
  839. sub `64+6*8`(%rsp),@ptr[6]
  840. vpxor @inp[2],@out[6],@out[6]
  841. vmovups @out[7],-16(@ptr[7])
  842. sub `64+7*8`(%rsp),@ptr[7]
  843. vpxor @inp[3],@out[7],@out[7]
  844. dec $num
  845. jnz .Loop_enc8x
  846. mov 16(%rsp),%rax # original %rsp
  847. .cfi_def_cfa %rax,8
  848. #mov 24(%rsp),$num
  849. #lea `$inp_elm_size*8`($inp),$inp
  850. #dec $num
  851. #jnz .Lenc8x_loop_grande
  852. .Lenc8x_done:
  853. vzeroupper
  854. ___
  855. $code.=<<___ if ($win64);
  856. movaps -0xd8(%rax),%xmm6
  857. movaps -0xc8(%rax),%xmm7
  858. movaps -0xb8(%rax),%xmm8
  859. movaps -0xa8(%rax),%xmm9
  860. movaps -0x98(%rax),%xmm10
  861. movaps -0x88(%rax),%xmm11
  862. movaps -0x78(%rax),%xmm12
  863. movaps -0x68(%rax),%xmm13
  864. movaps -0x58(%rax),%xmm14
  865. movaps -0x48(%rax),%xmm15
  866. ___
  867. $code.=<<___;
  868. mov -48(%rax),%r15
  869. .cfi_restore %r15
  870. mov -40(%rax),%r14
  871. .cfi_restore %r14
  872. mov -32(%rax),%r13
  873. .cfi_restore %r13
  874. mov -24(%rax),%r12
  875. .cfi_restore %r12
  876. mov -16(%rax),%rbp
  877. .cfi_restore %rbp
  878. mov -8(%rax),%rbx
  879. .cfi_restore %rbx
  880. lea (%rax),%rsp
  881. .cfi_def_cfa_register %rsp
  882. .Lenc8x_epilogue:
  883. ret
  884. .cfi_endproc
  885. .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
  # Eight-lane AVX CBC decrypt entry point. One 16-byte block of every lane is
  # handled per pass of .Loop_dec8x; the type line declares 3 arguments.
  886. .type aesni_multi_cbc_decrypt_avx,\@function,3
  887. .align 32
  888. aesni_multi_cbc_decrypt_avx:
  889. .cfi_startproc
  890. _avx_cbc_dec_shortcut:
  # keep the caller's stack pointer in %rax; the epilogue restores from it
  891. mov %rsp,%rax
  892. .cfi_def_cfa_register %rax
  893. push %rbx
  894. .cfi_push %rbx
  895. push %rbp
  896. .cfi_push %rbp
  897. push %r12
  898. .cfi_push %r12
  899. push %r13
  900. .cfi_push %r13
  901. push %r14
  902. .cfi_push %r14
  903. push %r15
  904. .cfi_push %r15
  905. ___
  # Win64 only: spill xmm6-xmm15 directly below the six pushed GPRs. The
  # matching reloads in the epilogue use -0xd8(%rax) .. -0x48(%rax).
  906. $code.=<<___ if ($win64);
  907. lea -0xa8(%rsp),%rsp
  908. movaps %xmm6,(%rsp)
  909. movaps %xmm7,0x10(%rsp)
  910. movaps %xmm8,0x20(%rsp)
  911. movaps %xmm9,0x30(%rsp)
  912. movaps %xmm10,0x40(%rsp)
  913. movaps %xmm11,0x50(%rsp)
  914. movaps %xmm12,-0x78(%rax)
  915. movaps %xmm13,-0x68(%rax)
  916. movaps %xmm14,-0x58(%rax)
  917. movaps %xmm15,-0x48(%rax)
  918. ___
  # Build the scratch frame described in the layout table below, store the
  # caller's %rsp at 16(%rsp), load the 0-round key into $zero and bias $key by
  # 0x78 (all later key offsets are written as N-0x78). The sub/and/sub
  # sequence leaves %rsp at 64 modulo 256, which the $offload toggling
  # further down depends on.
  919. $code.=<<___;
  920. # stack layout
  921. #
  922. # +0 output sink
  923. # +16 input sink [original %rsp and $num]
  924. # +32 counters
  925. # +64 distances between inputs and outputs
  926. # +128 off-load area for @inp[0..3]
  927. # +192 IV/input offload
  928. sub \$256,%rsp
  929. and \$-256,%rsp
  930. sub \$192,%rsp
  931. mov %rax,16(%rsp) # original %rsp
  932. .cfi_cfa_expression %rsp+16,deref,+8
  933. .Ldec8x_body:
  934. vzeroupper
  935. vmovdqu ($key),$zero # 0-round key
  936. lea 0x78($key),$key # size optimization
  937. lea `$inp_elm_size*4`($inp),$inp
  938. shr \$1,$num
  939. .Ldec8x_loop_grande:
  940. #mov $num,24(%rsp) # original $num
  941. xor $num,$num
  942. ___
  # Per-lane setup, emitted once for each of the 8 lanes. For lane $i it reads
  # the block count, input pointer, output pointer and IV from the records
  # addressed through $inp, keeps the largest count seen so far in $num, and
  # points the input of lanes whose count is <= 0 at %rsp. The count goes to
  # 32+4*$i(%rsp), the output-minus-input distance to 64+8*$i(%rsp) and the IV
  # to 192+16*$i(%rsp). Lane 0 computes its distance in $offset, all other
  # lanes borrow $offload as a temporary.
  # NOTE(review): $inp_elm_size, $ptr_size and pointer_register() are defined
  # outside this chunk; the record layout is assumed from the offsets used.
  943. for($i=0;$i<8;$i++) {
  944. my $temp = $i ? $offload : $offset;
  945. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  946. $temp_reg=&pointer_register($flavour,$temp);
  947. $code.=<<___;
  948. # borrow $one for number of blocks
  949. mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
  950. # input pointer
  951. mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
  952. cmp $num,$one
  953. # output pointer
  954. mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
  955. cmovg $one,$num # find maximum
  956. test $one,$one
  957. # load IV
  958. vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
  959. mov $one,`32+4*$i`(%rsp) # initialize counters
  960. cmovle %rsp,@ptr[$i] # cancel input
  961. sub @ptr[$i],$temp # distance between input and output
  962. mov $temp,`64+8*$i`(%rsp) # initialize distances
  963. vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
  964. ___
  965. }
  # Leave early when the largest lane count is zero. Otherwise pre-load round
  # keys 1 and 2, read $rounds from offset 0xf0 of the key schedule, fetch the
  # first block of every lane, keep the raw input at $offload (it becomes the
  # chaining value for the following block) and xor in the 0-round key.
  # $offload starts at %rsp+320; with %rsp at 64 modulo 256 the xor with 0x80
  # moves it to %rsp+192, where the IVs were parked by the setup loop.
  966. $code.=<<___;
  967. test $num,$num
  968. jz .Ldec8x_done
  969. vmovups 0x10-0x78($key),$rndkey1
  970. vmovups 0x20-0x78($key),$rndkey0
  971. mov 0xf0-0x78($key),$rounds
  972. lea 192+128(%rsp),$offload # offload area
  973. vmovdqu (@ptr[0]),@out[0] # load inputs
  974. vmovdqu (@ptr[1]),@out[1]
  975. vmovdqu (@ptr[2]),@out[2]
  976. vmovdqu (@ptr[3]),@out[3]
  977. vmovdqu (@ptr[4]),@out[4]
  978. vmovdqu (@ptr[5]),@out[5]
  979. vmovdqu (@ptr[6]),@out[6]
  980. vmovdqu (@ptr[7]),@out[7]
  981. vmovdqu @out[0],0x00($offload) # offload inputs
  982. vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
  983. vmovdqu @out[1],0x10($offload)
  984. vpxor $zero,@out[1],@out[1]
  985. vmovdqu @out[2],0x20($offload)
  986. vpxor $zero,@out[2],@out[2]
  987. vmovdqu @out[3],0x30($offload)
  988. vpxor $zero,@out[3],@out[3]
  989. vmovdqu @out[4],0x40($offload)
  990. vpxor $zero,@out[4],@out[4]
  991. vmovdqu @out[5],0x50($offload)
  992. vpxor $zero,@out[5],@out[5]
  993. vmovdqu @out[6],0x60($offload)
  994. vpxor $zero,@out[6],@out[6]
  995. vmovdqu @out[7],0x70($offload)
  996. vpxor $zero,@out[7],@out[7]
  997. xor \$0x80,$offload
  998. mov \$1,$one # constant of 1
  999. jmp .Loop_dec8x
  1000. .align 32
  1001. .Loop_dec8x:
  1002. ___
  # Eight AES rounds, one per generated iteration. Iteration $i applies one
  # round to all 8 lanes (even $i uses $rndkey1, odd $i uses $rndkey0) and
  # interleaves the scalar bookkeeping for lane $i only:
  #   - compare the lane counter with $one (1); the flags feed both cmov's
  #   - form the output pointer as input pointer + stored distance
  #   - cmovge: counter <= 1, redirect the input pointer to %rsp
  #   - cmovg:  counter <  1, redirect the output pointer to %rsp
  #   - store the recomputed distance back to 64+8*$i(%rsp)
  #   - load the lane's next input block from 16(@ptr[$i]) into @inp[$i%4];
  #     lanes 0-3 are spilled to 128+16*$i(%rsp) since lanes 4-7 reuse the
  #     same four registers
  #   - reload the round-key register just consumed with round key 3+$i
  #   - advance @ptr[$i] to output pointer + 16 (the tail stores at -16)
  # Lane 0 takes its distance from $offset as left by the setup code or by the
  # "pre-load 1st offset" load in the tail; other lanes load it here.
  1003. for($i=0;$i<8;$i++) {
  1004. my $rndkey=($i&1)?$rndkey0:$rndkey1;
  1005. $code.=<<___;
  1006. vaesdec $rndkey,@out[0],@out[0]
  1007. cmp 32+4*$i(%rsp),$one
  1008. ___
  1009. $code.=<<___ if ($i);
  1010. mov 64+8*$i(%rsp),$offset
  1011. ___
  1012. $code.=<<___;
  1013. vaesdec $rndkey,@out[1],@out[1]
  1014. prefetcht0 31(@ptr[$i]) # prefetch input
  1015. vaesdec $rndkey,@out[2],@out[2]
  1016. ___
  1017. $code.=<<___ if ($i>1);
  1018. prefetcht0 15(@ptr[$i-2]) # prefetch output
  1019. ___
  1020. $code.=<<___;
  1021. vaesdec $rndkey,@out[3],@out[3]
  1022. lea (@ptr[$i],$offset),$offset
  1023. cmovge %rsp,@ptr[$i] # cancel input
  1024. vaesdec $rndkey,@out[4],@out[4]
  1025. cmovg %rsp,$offset # sink output
  1026. vaesdec $rndkey,@out[5],@out[5]
  1027. sub @ptr[$i],$offset
  1028. vaesdec $rndkey,@out[6],@out[6]
  1029. vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
  1030. mov $offset,64+8*$i(%rsp)
  1031. vaesdec $rndkey,@out[7],@out[7]
  1032. vmovups `16*(3+$i)-0x78`($key),$rndkey
  1033. lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
  1034. ___
  1035. $code.=<<___ if ($i<4);
  1036. vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
  1037. ___
  1038. }
  # $i is 8 once the loop has finished, so the two prefetches at the top of
  # the next heredoc address lanes 6 and 7. After that, $rounds selects how
  # many extra rounds run before the tail: below 11 none, exactly 11 two,
  # above 11 four. The "je" reuses the flags of "cmp 11"; only AVX
  # instructions sit in between and those do not write EFLAGS.
  # NOTE(review): presumably $rounds is 9/11/13 for 128/192/256-bit keys;
  # confirm against the key-schedule code.
  1039. $code.=<<___;
  1040. vmovdqu 32(%rsp),$counters
  1041. prefetcht0 15(@ptr[$i-2]) # prefetch output
  1042. prefetcht0 15(@ptr[$i-1])
  1043. cmp \$11,$rounds
  1044. jb .Ldec8x_tail
  1045. vaesdec $rndkey1,@out[0],@out[0]
  1046. vaesdec $rndkey1,@out[1],@out[1]
  1047. vaesdec $rndkey1,@out[2],@out[2]
  1048. vaesdec $rndkey1,@out[3],@out[3]
  1049. vaesdec $rndkey1,@out[4],@out[4]
  1050. vaesdec $rndkey1,@out[5],@out[5]
  1051. vaesdec $rndkey1,@out[6],@out[6]
  1052. vaesdec $rndkey1,@out[7],@out[7]
  1053. vmovups 0xb0-0x78($key),$rndkey1
  1054. vaesdec $rndkey0,@out[0],@out[0]
  1055. vaesdec $rndkey0,@out[1],@out[1]
  1056. vaesdec $rndkey0,@out[2],@out[2]
  1057. vaesdec $rndkey0,@out[3],@out[3]
  1058. vaesdec $rndkey0,@out[4],@out[4]
  1059. vaesdec $rndkey0,@out[5],@out[5]
  1060. vaesdec $rndkey0,@out[6],@out[6]
  1061. vaesdec $rndkey0,@out[7],@out[7]
  1062. vmovups 0xc0-0x78($key),$rndkey0
  1063. je .Ldec8x_tail
  1064. vaesdec $rndkey1,@out[0],@out[0]
  1065. vaesdec $rndkey1,@out[1],@out[1]
  1066. vaesdec $rndkey1,@out[2],@out[2]
  1067. vaesdec $rndkey1,@out[3],@out[3]
  1068. vaesdec $rndkey1,@out[4],@out[4]
  1069. vaesdec $rndkey1,@out[5],@out[5]
  1070. vaesdec $rndkey1,@out[6],@out[6]
  1071. vaesdec $rndkey1,@out[7],@out[7]
  1072. vmovups 0xd0-0x78($key),$rndkey1
  1073. vaesdec $rndkey0,@out[0],@out[0]
  1074. vaesdec $rndkey0,@out[1],@out[1]
  1075. vaesdec $rndkey0,@out[2],@out[2]
  1076. vaesdec $rndkey0,@out[3],@out[3]
  1077. vaesdec $rndkey0,@out[4],@out[4]
  1078. vaesdec $rndkey0,@out[5],@out[5]
  1079. vaesdec $rndkey0,@out[6],@out[6]
  1080. vaesdec $rndkey0,@out[7],@out[7]
  1081. vmovups 0xe0-0x78($key),$rndkey0
  1082. .Ldec8x_tail:
  # Final two rounds, interleaved with the counter update. The compare
  # produces all-ones in every dword counter that is above zero, so adding
  # that mask subtracts 1 from exactly those counters; the two halves at
  # 32(%rsp) and 48(%rsp) are processed one after the other.
  1083. vaesdec $rndkey1,@out[0],@out[0]
  1084. vpxor $zero,$zero,$zero
  1085. vaesdec $rndkey1,@out[1],@out[1]
  1086. vaesdec $rndkey1,@out[2],@out[2]
  1087. vpcmpgtd $zero,$counters,$zero
  1088. vaesdec $rndkey1,@out[3],@out[3]
  1089. vaesdec $rndkey1,@out[4],@out[4]
  1090. vpaddd $counters,$zero,$zero # decrement counters
  1091. vmovdqu 48(%rsp),$counters
  1092. vaesdec $rndkey1,@out[5],@out[5]
  1093. mov 64(%rsp),$offset # pre-load 1st offset
  1094. vaesdec $rndkey1,@out[6],@out[6]
  1095. vaesdec $rndkey1,@out[7],@out[7]
  1096. vmovups 0x10-0x78($key),$rndkey1
  1097. vaesdeclast $rndkey0,@out[0],@out[0]
  1098. vmovdqa $zero,32(%rsp) # update counters
  1099. vpxor $zero,$zero,$zero
  1100. vaesdeclast $rndkey0,@out[1],@out[1]
  1101. vpxor 0x00($offload),@out[0],@out[0] # xor with IV
  1102. vaesdeclast $rndkey0,@out[2],@out[2]
  1103. vpxor 0x10($offload),@out[1],@out[1]
  1104. vpcmpgtd $zero,$counters,$zero
  1105. vaesdeclast $rndkey0,@out[3],@out[3]
  1106. vpxor 0x20($offload),@out[2],@out[2]
  1107. vaesdeclast $rndkey0,@out[4],@out[4]
  1108. vpxor 0x30($offload),@out[3],@out[3]
  1109. vpaddd $zero,$counters,$counters # decrement counters
  1110. vmovdqu -0x78($key),$zero # 0-round
  1111. vaesdeclast $rndkey0,@out[5],@out[5]
  1112. vpxor 0x40($offload),@out[4],@out[4]
  1113. vaesdeclast $rndkey0,@out[6],@out[6]
  1114. vpxor 0x50($offload),@out[5],@out[5]
  1115. vmovdqa $counters,48(%rsp) # update counters
  1116. vaesdeclast $rndkey0,@out[7],@out[7]
  1117. vpxor 0x60($offload),@out[6],@out[6]
  1118. vmovups 0x20-0x78($key),$rndkey0
  # Per lane: store the finished block, step the pointer back to the input
  # side, save the next input block as the new chaining value and pre-xor it
  # with the 0-round key. Lanes 0-3 reload that block from the spill area at
  # +128; lanes 4-7 still have it in a register.
  1119. vmovups @out[0],-16(@ptr[0]) # write output
  1120. sub $offset,@ptr[0] # switch to input
  1121. vmovdqu 128+0(%rsp),@out[0]
  1122. vpxor 0x70($offload),@out[7],@out[7]
  1123. vmovups @out[1],-16(@ptr[1])
  1124. sub `64+1*8`(%rsp),@ptr[1]
  1125. vmovdqu @out[0],0x00($offload)
  1126. vpxor $zero,@out[0],@out[0]
  1127. vmovdqu 128+16(%rsp),@out[1]
  1128. vmovups @out[2],-16(@ptr[2])
  1129. sub `64+2*8`(%rsp),@ptr[2]
  1130. vmovdqu @out[1],0x10($offload)
  1131. vpxor $zero,@out[1],@out[1]
  1132. vmovdqu 128+32(%rsp),@out[2]
  1133. vmovups @out[3],-16(@ptr[3])
  1134. sub `64+3*8`(%rsp),@ptr[3]
  1135. vmovdqu @out[2],0x20($offload)
  1136. vpxor $zero,@out[2],@out[2]
  1137. vmovdqu 128+48(%rsp),@out[3]
  1138. vmovups @out[4],-16(@ptr[4])
  1139. sub `64+4*8`(%rsp),@ptr[4]
  1140. vmovdqu @out[3],0x30($offload)
  1141. vpxor $zero,@out[3],@out[3]
  1142. vmovdqu @inp[0],0x40($offload)
  1143. vpxor @inp[0],$zero,@out[4]
  1144. vmovups @out[5],-16(@ptr[5])
  1145. sub `64+5*8`(%rsp),@ptr[5]
  1146. vmovdqu @inp[1],0x50($offload)
  1147. vpxor @inp[1],$zero,@out[5]
  1148. vmovups @out[6],-16(@ptr[6])
  1149. sub `64+6*8`(%rsp),@ptr[6]
  1150. vmovdqu @inp[2],0x60($offload)
  1151. vpxor @inp[2],$zero,@out[6]
  1152. vmovups @out[7],-16(@ptr[7])
  1153. sub `64+7*8`(%rsp),@ptr[7]
  1154. vmovdqu @inp[3],0x70($offload)
  1155. vpxor @inp[3],$zero,@out[7]
  # switch to the other chaining-value buffer for the next pass
  1156. xor \$128,$offload
  1157. dec $num
  1158. jnz .Loop_dec8x
  1159. mov 16(%rsp),%rax # original %rsp
  1160. .cfi_def_cfa %rax,8
  1161. #mov 24(%rsp),$num
  1162. #lea `$inp_elm_size*8`($inp),$inp
  1163. #dec $num
  1164. #jnz .Ldec8x_loop_grande
  1165. .Ldec8x_done:
  1166. vzeroupper
  1167. ___
  # Win64 only: reload the xmm registers spilled by the prologue.
  1168. $code.=<<___ if ($win64);
  1169. movaps -0xd8(%rax),%xmm6
  1170. movaps -0xc8(%rax),%xmm7
  1171. movaps -0xb8(%rax),%xmm8
  1172. movaps -0xa8(%rax),%xmm9
  1173. movaps -0x98(%rax),%xmm10
  1174. movaps -0x88(%rax),%xmm11
  1175. movaps -0x78(%rax),%xmm12
  1176. movaps -0x68(%rax),%xmm13
  1177. movaps -0x58(%rax),%xmm14
  1178. movaps -0x48(%rax),%xmm15
  1179. ___
  # Restore the six pushed GPRs relative to the caller's stack pointer held in
  # %rax, then make that value the stack pointer again and return.
  1180. $code.=<<___;
  1181. mov -48(%rax),%r15
  1182. .cfi_restore %r15
  1183. mov -40(%rax),%r14
  1184. .cfi_restore %r14
  1185. mov -32(%rax),%r13
  1186. .cfi_restore %r13
  1187. mov -24(%rax),%r12
  1188. .cfi_restore %r12
  1189. mov -16(%rax),%rbp
  1190. .cfi_restore %rbp
  1191. mov -8(%rax),%rbx
  1192. .cfi_restore %rbx
  1193. lea (%rax),%rsp
  1194. .cfi_def_cfa_register %rsp
  1195. .Ldec8x_epilogue:
  1196. ret
  1197. .cfi_endproc
  1198. .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
  1199. ___
  1200. }}}
  # Win64 structured-exception support, emitted only for Win64 targets.
  #
  # se_handler is the handler referenced by every .xdata record below. Each
  # record supplies two RVAs as HandlerData[]: the function's *_body label and
  # its *_epilogue label. The handler picks the frame address from them:
  #   - Rip below the body label: use context->Rax
  #   - Rip at or past the epilogue label: use context->Rsp as it is
  #   - otherwise: fetch the saved stack pointer from 16(context->Rsp), copy
  #     rbx/rbp/r12-r15 from below it into the context, and copy 20 quadwords
  #     starting at -56-10*16 from it into the context at offset 512
  # In all cases it then stores Rsp/Rsi/Rdi into the context, copies the
  # context into disp->ContextRecord, calls RtlVirtualUnwind and returns 1.
  # NOTE(review): the 8(%rax)/16(%rax) slots read at .Lin_prologue are
  # presumably where the perlasm Win64 prologue parks %rdi/%rsi; that code is
  # not visible in this chunk.
  1201. if ($win64) {
  1202. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1203. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1204. $rec="%rcx";
  1205. $frame="%rdx";
  1206. $context="%r8";
  1207. $disp="%r9";
  1208. $code.=<<___;
  1209. .extern __imp_RtlVirtualUnwind
  1210. .type se_handler,\@abi-omnipotent
  1211. .align 16
  1212. se_handler:
  1213. push %rsi
  1214. push %rdi
  1215. push %rbx
  1216. push %rbp
  1217. push %r12
  1218. push %r13
  1219. push %r14
  1220. push %r15
  1221. pushfq
  1222. sub \$64,%rsp
  1223. mov 120($context),%rax # pull context->Rax
  1224. mov 248($context),%rbx # pull context->Rip
  1225. mov 8($disp),%rsi # disp->ImageBase
  1226. mov 56($disp),%r11 # disp->HandlerData
  1227. mov 0(%r11),%r10d # HandlerData[0]
  1228. lea (%rsi,%r10),%r10 # prologue label
  1229. cmp %r10,%rbx # context->Rip<.Lprologue
  1230. jb .Lin_prologue
  1231. mov 152($context),%rax # pull context->Rsp
  1232. mov 4(%r11),%r10d # HandlerData[1]
  1233. lea (%rsi,%r10),%r10 # epilogue label
  1234. cmp %r10,%rbx # context->Rip>=.Lepilogue
  1235. jae .Lin_prologue
  1236. mov 16(%rax),%rax # pull saved stack pointer
  1237. mov -8(%rax),%rbx
  1238. mov -16(%rax),%rbp
  1239. mov -24(%rax),%r12
  1240. mov -32(%rax),%r13
  1241. mov -40(%rax),%r14
  1242. mov -48(%rax),%r15
  1243. mov %rbx,144($context) # restore context->Rbx
  1244. mov %rbp,160($context) # restore context->Rbp
  1245. mov %r12,216($context) # restore context->R12
  1246. mov %r13,224($context) # restore context->R13
  1247. mov %r14,232($context) # restore context->R14
  1248. mov %r15,240($context) # restore context->R15
  # copy the ten saved 16-byte registers (20 quadwords) into the context
  1249. lea -56-10*16(%rax),%rsi
  1250. lea 512($context),%rdi # &context.Xmm6
  1251. mov \$20,%ecx
  1252. .long 0xa548f3fc # cld; rep movsq
  1253. .Lin_prologue:
  # common exit: %rax is the frame address reported as context->Rsp
  1254. mov 8(%rax),%rdi
  1255. mov 16(%rax),%rsi
  1256. mov %rax,152($context) # restore context->Rsp
  1257. mov %rsi,168($context) # restore context->Rsi
  1258. mov %rdi,176($context) # restore context->Rdi
  1259. mov 40($disp),%rdi # disp->ContextRecord
  1260. mov $context,%rsi # context
  1261. mov \$154,%ecx # sizeof(CONTEXT)
  1262. .long 0xa548f3fc # cld; rep movsq
  1263. mov $disp,%rsi
  1264. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1265. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1266. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1267. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1268. mov 40(%rsi),%r10 # disp->ContextRecord
  1269. lea 56(%rsi),%r11 # &disp->HandlerData
  1270. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1271. mov %r10,32(%rsp) # arg5
  1272. mov %r11,40(%rsp) # arg6
  1273. mov %r12,48(%rsp) # arg7
  1274. mov %rcx,56(%rsp) # arg8, (NULL)
  1275. call *__imp_RtlVirtualUnwind(%rip)
  1276. mov \$1,%eax # ExceptionContinueSearch
  1277. add \$64,%rsp
  1278. popfq
  1279. pop %r15
  1280. pop %r14
  1281. pop %r13
  1282. pop %r12
  1283. pop %rbp
  1284. pop %rbx
  1285. pop %rdi
  1286. pop %rsi
  1287. ret
  1288. .size se_handler,.-se_handler
  1289. .section .pdata
  1290. .align 4
  1291. .rva .LSEH_begin_aesni_multi_cbc_encrypt
  1292. .rva .LSEH_end_aesni_multi_cbc_encrypt
  1293. .rva .LSEH_info_aesni_multi_cbc_encrypt
  1294. .rva .LSEH_begin_aesni_multi_cbc_decrypt
  1295. .rva .LSEH_end_aesni_multi_cbc_decrypt
  1296. .rva .LSEH_info_aesni_multi_cbc_decrypt
  1297. ___
  # .pdata entries for the AVX entry points are emitted only when $avx is set.
  1298. $code.=<<___ if ($avx);
  1299. .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
  1300. .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
  1301. .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
  1302. .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
  1303. .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
  1304. .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
  1305. ___
  # .xdata: one record per function, naming se_handler and the body/epilogue
  # label pair that the handler reads as HandlerData[0] and HandlerData[1].
  # NOTE(review): the leading byte 9 is presumably UNWIND_INFO version 1 with
  # the exception-handler flag set; confirm against the UNWIND_INFO layout.
  1306. $code.=<<___;
  1307. .section .xdata
  1308. .align 8
  1309. .LSEH_info_aesni_multi_cbc_encrypt:
  1310. .byte 9,0,0,0
  1311. .rva se_handler
  1312. .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
  1313. .LSEH_info_aesni_multi_cbc_decrypt:
  1314. .byte 9,0,0,0
  1315. .rva se_handler
  1316. .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
  1317. ___
  # Matching .xdata records for the AVX entry points, again only with $avx.
  1318. $code.=<<___ if ($avx);
  1319. .LSEH_info_aesni_multi_cbc_encrypt_avx:
  1320. .byte 9,0,0,0
  1321. .rva se_handler
  1322. .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
  1323. .LSEH_info_aesni_multi_cbc_decrypt_avx:
  1324. .byte 9,0,0,0
  1325. .rva se_handler
  1326. .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
  1327. ___
  1328. }
  1329. ####################################################################
  # rex(\@bytes, $reg, $rm)
  #
  # Appends a REX prefix byte to the array referenced by the first argument
  # when at least one of the two register numbers is 8 or higher. Bit 0x04 is
  # set for $reg >= 8 and bit 0x01 for $rm >= 8, both OR-ed into 0x40. When
  # both numbers are below 8 the array is left untouched.
  sub rex {
      my ($bytes, $reg, $rm) = @_;

      # Collect the extension bits first; zero means no prefix is required.
      my $ext = ($reg >= 8 ? 0x04 : 0) | ($rm >= 8 ? 0x01 : 0);

      push @$bytes, 0x40 | $ext if ($ext);
  }
  # aesni($line) -- hand-assemble legacy (non-VEX) AES-NI instructions.
  #
  # $line is one assembly statement as captured by the driver substitution at
  # the end of this file. Three operand shapes are recognised:
  #
  #   aeskeygenassist $imm,%xmmS,%xmmD
  #   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
  #   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
  #
  # For those, the return value is a ".byte" directive holding the encoded
  # instruction (decimal bytes, comma separated). Every other input is
  # returned unchanged, including an aes* mnemonic that is missing from the
  # table of the matched operand shape. Returning undef in that case would
  # make the caller's s///e replace the statement with an empty string and
  # drop the instruction from the output without any diagnostic; handing the
  # text back lets the assembler accept or reject it.
  sub aesni {
      my $line = shift;
      my @opcode = (0x66);    # every encoding below starts with the 0x66 prefix

      if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
          # Copy the captures before any further regex can overwrite them.
          my ($imm, $src, $dst) = ($2, $3, $4);

          rex(\@opcode, $dst, $src);
          push @opcode, 0x0f, 0x3a, 0xdf;
          push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
          # A leading 0 marks hex/octal notation, which oct() understands.
          push @opcode, ($imm =~ /^0/ ? oct($imm) : $imm);
          return ".byte\t" . join(',', @opcode);
      }
      elsif ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
          my ($mnemonic, $src, $dst) = ($1, $2, $3);
          my %opcodelet = (
              "aesimc" => 0xdb,
              "aesenc" => 0xdc, "aesenclast" => 0xdd,
              "aesdec" => 0xde, "aesdeclast" => 0xdf
          );

          # Unknown mnemonic: pass the statement through instead of erasing it.
          return $line if (!defined($opcodelet{$mnemonic}));

          rex(\@opcode, $dst, $src);
          push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
          push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
          return ".byte\t" . join(',', @opcode);
      }
      elsif ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
          my ($mnemonic, $off, $dst) = ($1, $2, $3);
          my %opcodelet = (
              "aesenc" => 0xdc, "aesenclast" => 0xdd,
              "aesdec" => 0xde, "aesdeclast" => 0xdf
          );

          # Unknown mnemonic: pass the statement through instead of erasing it.
          return $line if (!defined($opcodelet{$mnemonic}));

          # An omitted displacement is encoded as an explicit zero byte.
          my $disp = length($off) ? ($off =~ /^0/ ? oct($off) : $off) : 0;

          push @opcode, 0x44 if ($dst >= 8);    # REX.R for xmm8-xmm15
          push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
          push @opcode, 0x44 | (($dst & 7) << 3), 0x24;    # ModR/M + SIB: disp8(%rsp)
          push @opcode, $disp & 0xff;
          return ".byte\t" . join(',', @opcode);
      }

      return $line;
  }
  # Post-process the accumulated assembly text and write it out.
  #
  # 1. Every backtick-quoted fragment is evaluated as a Perl expression and
  #    replaced by its value; this folds offset arithmetic such as `64+1*8`.
  # 2. Each line with a word starting in "aes" followed by an %xmmN operand
  #    is passed to aesni(), which may return a ".byte" encoding. The match
  #    needs a word boundary before "aes", so VEX mnemonics such as vaesenc
  #    are not affected. Text after the last %xmmN on the line is discarded
  #    by the trailing ".*$".
  # 3. The result is printed, and the close is checked so that a failed write
  #    to STDOUT is reported instead of being lost.
  1376. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1377. $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
  1378. print $code;
  1379. close STDOUT or die "error closing STDOUT: $!";