  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # Multi-buffer SHA1 procedure processes n buffers in parallel by
  15. # placing buffer data to designated lane of SIMD register. n is
  16. # naturally limited to 4 on pre-AVX2 processors and to 8 on
  17. # AVX2-capable processors such as Haswell.
  18. #
  19. # this +aesni(i) sha1 aesni-sha1 gain(iv)
  20. # -------------------------------------------------------------------
  21. # Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68%
  22. # Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51%
  23. # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
  24. # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
  25. # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
  26. # Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145%
  27. # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64%
  28. #
  29. # (i) multi-block CBC encrypt with 128-bit key;
  30. # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
  31. # because of lower AES-NI instruction throughput;
  32. # (iii) "this" is for n=8, when we gather twice as much data, result
  33. # for n=4 is 8.00+4.44=12.4;
  34. # (iv) presented improvement coefficients are asymptotic limits and
  35. # in real-life application are somewhat lower, e.g. for 2KB
  36. # fragments they range from 30% to 100% (on Haswell);
  37. # $output is the last argument if it looks like a file (it has an extension)
  38. # $flavour is the first argument if it doesn't look like a file
  39. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  40. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  41. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  42. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  43. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  44. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  45. die "can't locate x86_64-xlate.pl";
  46. push(@INC,"${dir}","${dir}../../perlasm");
  47. require "x86_64-support.pl";
  48. $ptr_size=&pointer_size($flavour);
  49. $avx=0;
  50. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  51. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  52. $avx = ($1>=2.19) + ($1>=2.22);
  53. }
  54. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  55. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  56. $avx = ($1>=2.09) + ($1>=2.10);
  57. }
  58. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  59. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  60. $avx = ($1>=10) + ($1>=11);
  61. }
  62. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  63. $avx = ($2>=3.0) + ($2>3.0);
  64. }
  65. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  66. or die "can't call $xlate: $!";
  67. *STDOUT=*OUT;
  68. # void sha1_multi_block (
  69. # struct { unsigned int A[8];
  70. # unsigned int B[8];
  71. # unsigned int C[8];
  72. # unsigned int D[8];
  73. # unsigned int E[8]; } *ctx,
  74. # struct { void *ptr; int blocks; } inp[8],
  75. # int num); /* 1 or 2 */
  76. #
  77. $ctx="%rdi"; # 1st arg
  78. $inp="%rsi"; # 2nd arg
  79. $num="%edx";
  80. @ptr=map("%r$_",(8..11));
  81. $Tbl="%rbp";
  82. $inp_elm_size=2*$ptr_size;
  83. @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
  84. ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
  85. @Xi=map("%xmm$_",(10..14));
  86. $K="%xmm15";
  87. if (1) {
  88. # Atom-specific optimization aiming to eliminate pshufb with high
  89. # registers [and thus get rid of 48 cycles accumulated penalty]
  90. @Xi=map("%xmm$_",(0..4));
  91. ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
  92. @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
  93. }
  94. $REG_SZ=16;
  95. sub Xi_off {
  96. my $off = shift;
  97. $off %= 16; $off *= $REG_SZ;
  98. $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
  99. }
  100. sub BODY_00_19 {
  101. my ($i,$a,$b,$c,$d,$e)=@_;
  102. my $j=$i+1;
  103. my $k=$i+2;
  104. # Loads are performed 2+3/4 iterations in advance. 3/4 means that out
  105. # of 4 words you would expect to be loaded per given iteration one is
  106. # spilled to next iteration. In other words indices in four input
  107. # streams are distributed as following:
  108. #
  109. # $i==0: 0,0,0,0,1,1,1,1,2,2,2,
  110. # $i==1: 2,3,3,3,
  111. # $i==2: 3,4,4,4,
  112. # ...
  113. # $i==13: 14,15,15,15,
  114. # $i==14: 15
  115. #
  116. # Then at $i==15 Xupdate is applied one iteration in advance...
  117. $code.=<<___ if ($i==0);
  118. movd (@ptr[0]),@Xi[0]
  119. lea `16*4`(@ptr[0]),@ptr[0]
  120. movd (@ptr[1]),@Xi[2] # borrow @Xi[2]
  121. lea `16*4`(@ptr[1]),@ptr[1]
  122. movd (@ptr[2]),@Xi[3] # borrow @Xi[3]
  123. lea `16*4`(@ptr[2]),@ptr[2]
  124. movd (@ptr[3]),@Xi[4] # borrow @Xi[4]
  125. lea `16*4`(@ptr[3]),@ptr[3]
  126. punpckldq @Xi[3],@Xi[0]
  127. movd `4*$j-16*4`(@ptr[0]),@Xi[1]
  128. punpckldq @Xi[4],@Xi[2]
  129. movd `4*$j-16*4`(@ptr[1]),$t3
  130. punpckldq @Xi[2],@Xi[0]
  131. movd `4*$j-16*4`(@ptr[2]),$t2
  132. pshufb $tx,@Xi[0]
  133. ___
  134. $code.=<<___ if ($i<14); # just load input
  135. movd `4*$j-16*4`(@ptr[3]),$t1
  136. punpckldq $t2,@Xi[1]
  137. movdqa $a,$t2
  138. paddd $K,$e # e+=K_00_19
  139. punpckldq $t1,$t3
  140. movdqa $b,$t1
  141. movdqa $b,$t0
  142. pslld \$5,$t2
  143. pandn $d,$t1
  144. pand $c,$t0
  145. punpckldq $t3,@Xi[1]
  146. movdqa $a,$t3
  147. movdqa @Xi[0],`&Xi_off($i)`
  148. paddd @Xi[0],$e # e+=X[i]
  149. movd `4*$k-16*4`(@ptr[0]),@Xi[2]
  150. psrld \$27,$t3
  151. pxor $t1,$t0 # Ch(b,c,d)
  152. movdqa $b,$t1
  153. por $t3,$t2 # rol(a,5)
  154. movd `4*$k-16*4`(@ptr[1]),$t3
  155. pslld \$30,$t1
  156. paddd $t0,$e # e+=Ch(b,c,d)
  157. psrld \$2,$b
  158. paddd $t2,$e # e+=rol(a,5)
  159. pshufb $tx,@Xi[1]
  160. movd `4*$k-16*4`(@ptr[2]),$t2
  161. por $t1,$b # b=rol(b,30)
  162. ___
  163. $code.=<<___ if ($i==14); # just load input
  164. movd `4*$j-16*4`(@ptr[3]),$t1
  165. punpckldq $t2,@Xi[1]
  166. movdqa $a,$t2
  167. paddd $K,$e # e+=K_00_19
  168. punpckldq $t1,$t3
  169. movdqa $b,$t1
  170. movdqa $b,$t0
  171. pslld \$5,$t2
  172. prefetcht0 63(@ptr[0])
  173. pandn $d,$t1
  174. pand $c,$t0
  175. punpckldq $t3,@Xi[1]
  176. movdqa $a,$t3
  177. movdqa @Xi[0],`&Xi_off($i)`
  178. paddd @Xi[0],$e # e+=X[i]
  179. psrld \$27,$t3
  180. pxor $t1,$t0 # Ch(b,c,d)
  181. movdqa $b,$t1
  182. prefetcht0 63(@ptr[1])
  183. por $t3,$t2 # rol(a,5)
  184. pslld \$30,$t1
  185. paddd $t0,$e # e+=Ch(b,c,d)
  186. prefetcht0 63(@ptr[2])
  187. psrld \$2,$b
  188. paddd $t2,$e # e+=rol(a,5)
  189. pshufb $tx,@Xi[1]
  190. prefetcht0 63(@ptr[3])
  191. por $t1,$b # b=rol(b,30)
  192. ___
  193. $code.=<<___ if ($i>=13 && $i<15);
  194. movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
  195. ___
  196. $code.=<<___ if ($i>=15); # apply Xupdate
  197. pxor @Xi[-2],@Xi[1] # "X[13]"
  198. movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  199. movdqa $a,$t2
  200. pxor `&Xi_off($j+8)`,@Xi[1]
  201. paddd $K,$e # e+=K_00_19
  202. movdqa $b,$t1
  203. pslld \$5,$t2
  204. pxor @Xi[3],@Xi[1]
  205. movdqa $b,$t0
  206. pandn $d,$t1
  207. movdqa @Xi[1],$tx
  208. pand $c,$t0
  209. movdqa $a,$t3
  210. psrld \$31,$tx
  211. paddd @Xi[1],@Xi[1]
  212. movdqa @Xi[0],`&Xi_off($i)`
  213. paddd @Xi[0],$e # e+=X[i]
  214. psrld \$27,$t3
  215. pxor $t1,$t0 # Ch(b,c,d)
  216. movdqa $b,$t1
  217. por $t3,$t2 # rol(a,5)
  218. pslld \$30,$t1
  219. paddd $t0,$e # e+=Ch(b,c,d)
  220. psrld \$2,$b
  221. paddd $t2,$e # e+=rol(a,5)
  222. por $tx,@Xi[1] # rol \$1,@Xi[1]
  223. por $t1,$b # b=rol(b,30)
  224. ___
  225. push(@Xi,shift(@Xi));
  226. }
  227. sub BODY_20_39 {
  228. my ($i,$a,$b,$c,$d,$e)=@_;
  229. my $j=$i+1;
  230. $code.=<<___ if ($i<79);
  231. pxor @Xi[-2],@Xi[1] # "X[13]"
  232. movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  233. movdqa $a,$t2
  234. movdqa $d,$t0
  235. pxor `&Xi_off($j+8)`,@Xi[1]
  236. paddd $K,$e # e+=K_20_39
  237. pslld \$5,$t2
  238. pxor $b,$t0
  239. movdqa $a,$t3
  240. ___
  241. $code.=<<___ if ($i<72);
  242. movdqa @Xi[0],`&Xi_off($i)`
  243. ___
  244. $code.=<<___ if ($i<79);
  245. paddd @Xi[0],$e # e+=X[i]
  246. pxor @Xi[3],@Xi[1]
  247. psrld \$27,$t3
  248. pxor $c,$t0 # Parity(b,c,d)
  249. movdqa $b,$t1
  250. pslld \$30,$t1
  251. movdqa @Xi[1],$tx
  252. por $t3,$t2 # rol(a,5)
  253. psrld \$31,$tx
  254. paddd $t0,$e # e+=Parity(b,c,d)
  255. paddd @Xi[1],@Xi[1]
  256. psrld \$2,$b
  257. paddd $t2,$e # e+=rol(a,5)
  258. por $tx,@Xi[1] # rol(@Xi[1],1)
  259. por $t1,$b # b=rol(b,30)
  260. ___
  261. $code.=<<___ if ($i==79);
  262. movdqa $a,$t2
  263. paddd $K,$e # e+=K_20_39
  264. movdqa $d,$t0
  265. pslld \$5,$t2
  266. pxor $b,$t0
  267. movdqa $a,$t3
  268. paddd @Xi[0],$e # e+=X[i]
  269. psrld \$27,$t3
  270. movdqa $b,$t1
  271. pxor $c,$t0 # Parity(b,c,d)
  272. pslld \$30,$t1
  273. por $t3,$t2 # rol(a,5)
  274. paddd $t0,$e # e+=Parity(b,c,d)
  275. psrld \$2,$b
  276. paddd $t2,$e # e+=rol(a,5)
  277. por $t1,$b # b=rol(b,30)
  278. ___
  279. push(@Xi,shift(@Xi));
  280. }
  281. sub BODY_40_59 {
  282. my ($i,$a,$b,$c,$d,$e)=@_;
  283. my $j=$i+1;
  284. $code.=<<___;
  285. pxor @Xi[-2],@Xi[1] # "X[13]"
  286. movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  287. movdqa $a,$t2
  288. movdqa $d,$t1
  289. pxor `&Xi_off($j+8)`,@Xi[1]
  290. pxor @Xi[3],@Xi[1]
  291. paddd $K,$e # e+=K_40_59
  292. pslld \$5,$t2
  293. movdqa $a,$t3
  294. pand $c,$t1
  295. movdqa $d,$t0
  296. movdqa @Xi[1],$tx
  297. psrld \$27,$t3
  298. paddd $t1,$e
  299. pxor $c,$t0
  300. movdqa @Xi[0],`&Xi_off($i)`
  301. paddd @Xi[0],$e # e+=X[i]
  302. por $t3,$t2 # rol(a,5)
  303. psrld \$31,$tx
  304. pand $b,$t0
  305. movdqa $b,$t1
  306. pslld \$30,$t1
  307. paddd @Xi[1],@Xi[1]
  308. paddd $t0,$e # e+=Maj(b,d,c)
  309. psrld \$2,$b
  310. paddd $t2,$e # e+=rol(a,5)
  311. por $tx,@Xi[1] # rol(@X[1],1)
  312. por $t1,$b # b=rol(b,30)
  313. ___
  314. push(@Xi,shift(@Xi));
  315. }
  316. $code.=<<___;
  317. .text
  318. .extern OPENSSL_ia32cap_P
  319. .globl sha1_multi_block
  320. .type sha1_multi_block,\@function,3
  321. .align 32
  322. sha1_multi_block:
  323. .cfi_startproc
  324. mov OPENSSL_ia32cap_P+4(%rip),%rcx
  325. bt \$61,%rcx # check SHA bit
  326. jc _shaext_shortcut
  327. ___
  328. $code.=<<___ if ($avx);
  329. test \$`1<<28`,%ecx
  330. jnz _avx_shortcut
  331. ___
  332. $code.=<<___;
  333. mov %rsp,%rax
  334. .cfi_def_cfa_register %rax
  335. push %rbx
  336. .cfi_push %rbx
  337. push %rbp
  338. .cfi_push %rbx
  339. ___
  340. $code.=<<___ if ($win64);
  341. lea -0xa8(%rsp),%rsp
  342. movaps %xmm6,(%rsp)
  343. movaps %xmm7,0x10(%rsp)
  344. movaps %xmm8,0x20(%rsp)
  345. movaps %xmm9,0x30(%rsp)
  346. movaps %xmm10,-0x78(%rax)
  347. movaps %xmm11,-0x68(%rax)
  348. movaps %xmm12,-0x58(%rax)
  349. movaps %xmm13,-0x48(%rax)
  350. movaps %xmm14,-0x38(%rax)
  351. movaps %xmm15,-0x28(%rax)
  352. ___
  353. $code.=<<___;
  354. sub \$`$REG_SZ*18`,%rsp
  355. and \$-256,%rsp
  356. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  357. .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
  358. .Lbody:
  359. lea K_XX_XX(%rip),$Tbl
  360. lea `$REG_SZ*16`(%rsp),%rbx
  361. .Loop_grande:
  362. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  363. xor $num,$num
  364. ___
  365. for($i=0;$i<4;$i++) {
  366. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  367. $code.=<<___;
  368. # input pointer
  369. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  370. # number of blocks
  371. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  372. cmp $num,%ecx
  373. cmovg %ecx,$num # find maximum
  374. test %ecx,%ecx
  375. mov %ecx,`4*$i`(%rbx) # initialize counters
  376. cmovle $Tbl,@ptr[$i] # cancel input
  377. ___
  378. }
  379. $code.=<<___;
  380. test $num,$num
  381. jz .Ldone
  382. movdqu 0x00($ctx),$A # load context
  383. lea 128(%rsp),%rax
  384. movdqu 0x20($ctx),$B
  385. movdqu 0x40($ctx),$C
  386. movdqu 0x60($ctx),$D
  387. movdqu 0x80($ctx),$E
  388. movdqa 0x60($Tbl),$tx # pbswap_mask
  389. movdqa -0x20($Tbl),$K # K_00_19
  390. jmp .Loop
  391. .align 32
  392. .Loop:
  393. ___
  394. for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  395. $code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
  396. for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  397. $code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59
  398. for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  399. $code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79
  400. for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  401. $code.=<<___;
  402. movdqa (%rbx),@Xi[0] # pull counters
  403. mov \$1,%ecx
  404. cmp 4*0(%rbx),%ecx # examine counters
  405. pxor $t2,$t2
  406. cmovge $Tbl,@ptr[0] # cancel input
  407. cmp 4*1(%rbx),%ecx
  408. movdqa @Xi[0],@Xi[1]
  409. cmovge $Tbl,@ptr[1]
  410. cmp 4*2(%rbx),%ecx
  411. pcmpgtd $t2,@Xi[1] # mask value
  412. cmovge $Tbl,@ptr[2]
  413. cmp 4*3(%rbx),%ecx
  414. paddd @Xi[1],@Xi[0] # counters--
  415. cmovge $Tbl,@ptr[3]
  416. movdqu 0x00($ctx),$t0
  417. pand @Xi[1],$A
  418. movdqu 0x20($ctx),$t1
  419. pand @Xi[1],$B
  420. paddd $t0,$A
  421. movdqu 0x40($ctx),$t2
  422. pand @Xi[1],$C
  423. paddd $t1,$B
  424. movdqu 0x60($ctx),$t3
  425. pand @Xi[1],$D
  426. paddd $t2,$C
  427. movdqu 0x80($ctx),$tx
  428. pand @Xi[1],$E
  429. movdqu $A,0x00($ctx)
  430. paddd $t3,$D
  431. movdqu $B,0x20($ctx)
  432. paddd $tx,$E
  433. movdqu $C,0x40($ctx)
  434. movdqu $D,0x60($ctx)
  435. movdqu $E,0x80($ctx)
  436. movdqa @Xi[0],(%rbx) # save counters
  437. movdqa 0x60($Tbl),$tx # pbswap_mask
  438. movdqa -0x20($Tbl),$K # K_00_19
  439. dec $num
  440. jnz .Loop
  441. mov `$REG_SZ*17+8`(%rsp),$num
  442. lea $REG_SZ($ctx),$ctx
  443. lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
  444. dec $num
  445. jnz .Loop_grande
  446. .Ldone:
  447. mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  448. .cfi_def_cfa %rax,8
  449. ___
  450. $code.=<<___ if ($win64);
  451. movaps -0xb8(%rax),%xmm6
  452. movaps -0xa8(%rax),%xmm7
  453. movaps -0x98(%rax),%xmm8
  454. movaps -0x88(%rax),%xmm9
  455. movaps -0x78(%rax),%xmm10
  456. movaps -0x68(%rax),%xmm11
  457. movaps -0x58(%rax),%xmm12
  458. movaps -0x48(%rax),%xmm13
  459. movaps -0x38(%rax),%xmm14
  460. movaps -0x28(%rax),%xmm15
  461. ___
  462. $code.=<<___;
  463. mov -16(%rax),%rbp
  464. .cfi_restore %rbp
  465. mov -8(%rax),%rbx
  466. .cfi_restore %rbx
  467. lea (%rax),%rsp
  468. .cfi_def_cfa_register %rsp
  469. .Lepilogue:
  470. ret
  471. .cfi_endproc
  472. .size sha1_multi_block,.-sha1_multi_block
  473. ___
  474. {{{
  475. my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
  476. my @MSG0=map("%xmm$_",(4..7));
  477. my @MSG1=map("%xmm$_",(11..14));
  478. $code.=<<___;
  479. .type sha1_multi_block_shaext,\@function,3
  480. .align 32
  481. sha1_multi_block_shaext:
  482. .cfi_startproc
  483. _shaext_shortcut:
  484. mov %rsp,%rax
  485. .cfi_def_cfa_register %rax
  486. push %rbx
  487. .cfi_push %rbx
  488. push %rbp
  489. .cfi_push %rbp
  490. ___
  491. $code.=<<___ if ($win64);
  492. lea -0xa8(%rsp),%rsp
  493. movaps %xmm6,(%rsp)
  494. movaps %xmm7,0x10(%rsp)
  495. movaps %xmm8,0x20(%rsp)
  496. movaps %xmm9,0x30(%rsp)
  497. movaps %xmm10,-0x78(%rax)
  498. movaps %xmm11,-0x68(%rax)
  499. movaps %xmm12,-0x58(%rax)
  500. movaps %xmm13,-0x48(%rax)
  501. movaps %xmm14,-0x38(%rax)
  502. movaps %xmm15,-0x28(%rax)
  503. ___
  504. $code.=<<___;
  505. sub \$`$REG_SZ*18`,%rsp
  506. shl \$1,$num # we process pair at a time
  507. and \$-256,%rsp
  508. lea 0x40($ctx),$ctx # size optimization
  509. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  510. .Lbody_shaext:
  511. lea `$REG_SZ*16`(%rsp),%rbx
  512. movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap
  513. .Loop_grande_shaext:
  514. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  515. xor $num,$num
  516. ___
  517. for($i=0;$i<2;$i++) {
  518. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  519. $code.=<<___;
  520. # input pointer
  521. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  522. # number of blocks
  523. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  524. cmp $num,%ecx
  525. cmovg %ecx,$num # find maximum
  526. test %ecx,%ecx
  527. mov %ecx,`4*$i`(%rbx) # initialize counters
  528. cmovle %rsp,@ptr[$i] # cancel input
  529. ___
  530. }
  531. $code.=<<___;
  532. test $num,$num
  533. jz .Ldone_shaext
  534. movq 0x00-0x40($ctx),$ABCD0 # a1.a0
  535. movq 0x20-0x40($ctx),@MSG0[0]# b1.b0
  536. movq 0x40-0x40($ctx),@MSG0[1]# c1.c0
  537. movq 0x60-0x40($ctx),@MSG0[2]# d1.d0
  538. movq 0x80-0x40($ctx),@MSG0[3]# e1.e0
  539. punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0
  540. punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0
  541. movdqa $ABCD0,$ABCD1
  542. punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0
  543. punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1
  544. pshufd \$0b00111111,@MSG0[3],$E0
  545. pshufd \$0b01111111,@MSG0[3],$E1
  546. pshufd \$0b00011011,$ABCD0,$ABCD0
  547. pshufd \$0b00011011,$ABCD1,$ABCD1
  548. jmp .Loop_shaext
  549. .align 32
  550. .Loop_shaext:
  551. movdqu 0x00(@ptr[0]),@MSG0[0]
  552. movdqu 0x00(@ptr[1]),@MSG1[0]
  553. movdqu 0x10(@ptr[0]),@MSG0[1]
  554. movdqu 0x10(@ptr[1]),@MSG1[1]
  555. movdqu 0x20(@ptr[0]),@MSG0[2]
  556. pshufb $BSWAP,@MSG0[0]
  557. movdqu 0x20(@ptr[1]),@MSG1[2]
  558. pshufb $BSWAP,@MSG1[0]
  559. movdqu 0x30(@ptr[0]),@MSG0[3]
  560. lea 0x40(@ptr[0]),@ptr[0]
  561. pshufb $BSWAP,@MSG0[1]
  562. movdqu 0x30(@ptr[1]),@MSG1[3]
  563. lea 0x40(@ptr[1]),@ptr[1]
  564. pshufb $BSWAP,@MSG1[1]
  565. movdqa $E0,0x50(%rsp) # offload
  566. paddd @MSG0[0],$E0
  567. movdqa $E1,0x70(%rsp)
  568. paddd @MSG1[0],$E1
  569. movdqa $ABCD0,0x40(%rsp) # offload
  570. movdqa $ABCD0,$E0_
  571. movdqa $ABCD1,0x60(%rsp)
  572. movdqa $ABCD1,$E1_
  573. sha1rnds4 \$0,$E0,$ABCD0 # 0-3
  574. sha1nexte @MSG0[1],$E0_
  575. sha1rnds4 \$0,$E1,$ABCD1 # 0-3
  576. sha1nexte @MSG1[1],$E1_
  577. pshufb $BSWAP,@MSG0[2]
  578. prefetcht0 127(@ptr[0])
  579. sha1msg1 @MSG0[1],@MSG0[0]
  580. pshufb $BSWAP,@MSG1[2]
  581. prefetcht0 127(@ptr[1])
  582. sha1msg1 @MSG1[1],@MSG1[0]
  583. pshufb $BSWAP,@MSG0[3]
  584. movdqa $ABCD0,$E0
  585. pshufb $BSWAP,@MSG1[3]
  586. movdqa $ABCD1,$E1
  587. sha1rnds4 \$0,$E0_,$ABCD0 # 4-7
  588. sha1nexte @MSG0[2],$E0
  589. sha1rnds4 \$0,$E1_,$ABCD1 # 4-7
  590. sha1nexte @MSG1[2],$E1
  591. pxor @MSG0[2],@MSG0[0]
  592. sha1msg1 @MSG0[2],@MSG0[1]
  593. pxor @MSG1[2],@MSG1[0]
  594. sha1msg1 @MSG1[2],@MSG1[1]
  595. ___
  596. for($i=2;$i<20-4;$i++) {
  597. $code.=<<___;
  598. movdqa $ABCD0,$E0_
  599. movdqa $ABCD1,$E1_
  600. sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11
  601. sha1nexte @MSG0[3],$E0_
  602. sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11
  603. sha1nexte @MSG1[3],$E1_
  604. sha1msg2 @MSG0[3],@MSG0[0]
  605. sha1msg2 @MSG1[3],@MSG1[0]
  606. pxor @MSG0[3],@MSG0[1]
  607. sha1msg1 @MSG0[3],@MSG0[2]
  608. pxor @MSG1[3],@MSG1[1]
  609. sha1msg1 @MSG1[3],@MSG1[2]
  610. ___
  611. ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1);
  612. push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
  613. }
  614. $code.=<<___;
  615. movdqa $ABCD0,$E0_
  616. movdqa $ABCD1,$E1_
  617. sha1rnds4 \$3,$E0,$ABCD0 # 64-67
  618. sha1nexte @MSG0[3],$E0_
  619. sha1rnds4 \$3,$E1,$ABCD1 # 64-67
  620. sha1nexte @MSG1[3],$E1_
  621. sha1msg2 @MSG0[3],@MSG0[0]
  622. sha1msg2 @MSG1[3],@MSG1[0]
  623. pxor @MSG0[3],@MSG0[1]
  624. pxor @MSG1[3],@MSG1[1]
  625. mov \$1,%ecx
  626. pxor @MSG0[2],@MSG0[2] # zero
  627. cmp 4*0(%rbx),%ecx # examine counters
  628. cmovge %rsp,@ptr[0] # cancel input
  629. movdqa $ABCD0,$E0
  630. movdqa $ABCD1,$E1
  631. sha1rnds4 \$3,$E0_,$ABCD0 # 68-71
  632. sha1nexte @MSG0[0],$E0
  633. sha1rnds4 \$3,$E1_,$ABCD1 # 68-71
  634. sha1nexte @MSG1[0],$E1
  635. sha1msg2 @MSG0[0],@MSG0[1]
  636. sha1msg2 @MSG1[0],@MSG1[1]
  637. cmp 4*1(%rbx),%ecx
  638. cmovge %rsp,@ptr[1]
  639. movq (%rbx),@MSG0[0] # pull counters
  640. movdqa $ABCD0,$E0_
  641. movdqa $ABCD1,$E1_
  642. sha1rnds4 \$3,$E0,$ABCD0 # 72-75
  643. sha1nexte @MSG0[1],$E0_
  644. sha1rnds4 \$3,$E1,$ABCD1 # 72-75
  645. sha1nexte @MSG1[1],$E1_
  646. pshufd \$0x00,@MSG0[0],@MSG1[2]
  647. pshufd \$0x55,@MSG0[0],@MSG1[3]
  648. movdqa @MSG0[0],@MSG0[1]
  649. pcmpgtd @MSG0[2],@MSG1[2]
  650. pcmpgtd @MSG0[2],@MSG1[3]
  651. movdqa $ABCD0,$E0
  652. movdqa $ABCD1,$E1
  653. sha1rnds4 \$3,$E0_,$ABCD0 # 76-79
  654. sha1nexte $MSG0[2],$E0
  655. sha1rnds4 \$3,$E1_,$ABCD1 # 76-79
  656. sha1nexte $MSG0[2],$E1
  657. pcmpgtd @MSG0[2],@MSG0[1] # counter mask
  658. pand @MSG1[2],$ABCD0
  659. pand @MSG1[2],$E0
  660. pand @MSG1[3],$ABCD1
  661. pand @MSG1[3],$E1
  662. paddd @MSG0[1],@MSG0[0] # counters--
  663. paddd 0x40(%rsp),$ABCD0
  664. paddd 0x50(%rsp),$E0
  665. paddd 0x60(%rsp),$ABCD1
  666. paddd 0x70(%rsp),$E1
  667. movq @MSG0[0],(%rbx) # save counters
  668. dec $num
  669. jnz .Loop_shaext
  670. mov `$REG_SZ*17+8`(%rsp),$num
  671. pshufd \$0b00011011,$ABCD0,$ABCD0
  672. pshufd \$0b00011011,$ABCD1,$ABCD1
  673. movdqa $ABCD0,@MSG0[0]
  674. punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0
  675. punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0
  676. punpckhdq $E1,$E0 # e1.e0.xx.xx
  677. movq $ABCD0,0x00-0x40($ctx) # a1.a0
  678. psrldq \$8,$ABCD0
  679. movq @MSG0[0],0x40-0x40($ctx)# c1.c0
  680. psrldq \$8,@MSG0[0]
  681. movq $ABCD0,0x20-0x40($ctx) # b1.b0
  682. psrldq \$8,$E0
  683. movq @MSG0[0],0x60-0x40($ctx)# d1.d0
  684. movq $E0,0x80-0x40($ctx) # e1.e0
  685. lea `$REG_SZ/2`($ctx),$ctx
  686. lea `$inp_elm_size*2`($inp),$inp
  687. dec $num
  688. jnz .Loop_grande_shaext
  689. .Ldone_shaext:
  690. #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  691. ___
  692. $code.=<<___ if ($win64);
  693. movaps -0xb8(%rax),%xmm6
  694. movaps -0xa8(%rax),%xmm7
  695. movaps -0x98(%rax),%xmm8
  696. movaps -0x88(%rax),%xmm9
  697. movaps -0x78(%rax),%xmm10
  698. movaps -0x68(%rax),%xmm11
  699. movaps -0x58(%rax),%xmm12
  700. movaps -0x48(%rax),%xmm13
  701. movaps -0x38(%rax),%xmm14
  702. movaps -0x28(%rax),%xmm15
  703. ___
  704. $code.=<<___;
  705. mov -16(%rax),%rbp
  706. .cfi_restore %rbp
  707. mov -8(%rax),%rbx
  708. .cfi_restore %rbx
  709. lea (%rax),%rsp
  710. .cfi_def_cfa_register %rsp
  711. .Lepilogue_shaext:
  712. ret
  713. .cfi_endproc
  714. .size sha1_multi_block_shaext,.-sha1_multi_block_shaext
  715. ___
  716. }}}
  717. if ($avx) {{{
  718. sub BODY_00_19_avx {
  719. my ($i,$a,$b,$c,$d,$e)=@_;
  720. my $j=$i+1;
  721. my $k=$i+2;
  722. my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
  723. my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
  724. $code.=<<___ if ($i==0 && $REG_SZ==16);
  725. vmovd (@ptr[0]),@Xi[0]
  726. lea `16*4`(@ptr[0]),@ptr[0]
  727. vmovd (@ptr[1]),@Xi[2] # borrow Xi[2]
  728. lea `16*4`(@ptr[1]),@ptr[1]
  729. vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
  730. lea `16*4`(@ptr[2]),@ptr[2]
  731. vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2]
  732. lea `16*4`(@ptr[3]),@ptr[3]
  733. vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
  734. vpunpckldq @Xi[2],@Xi[0],@Xi[0]
  735. vmovd `4*$j-16*4`($ptr_n),$t3
  736. vpshufb $tx,@Xi[0],@Xi[0]
  737. ___
  738. $code.=<<___ if ($i<15 && $REG_SZ==16); # just load input
  739. vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
  740. vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
  741. ___
  742. $code.=<<___ if ($i==0 && $REG_SZ==32);
  743. vmovd (@ptr[0]),@Xi[0]
  744. lea `16*4`(@ptr[0]),@ptr[0]
  745. vmovd (@ptr[4]),@Xi[2] # borrow Xi[2]
  746. lea `16*4`(@ptr[4]),@ptr[4]
  747. vmovd (@ptr[1]),$t2
  748. lea `16*4`(@ptr[1]),@ptr[1]
  749. vmovd (@ptr[5]),$t1
  750. lea `16*4`(@ptr[5]),@ptr[5]
  751. vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
  752. lea `16*4`(@ptr[2]),@ptr[2]
  753. vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2]
  754. lea `16*4`(@ptr[6]),@ptr[6]
  755. vpinsrd \$1,(@ptr[3]),$t2,$t2
  756. lea `16*4`(@ptr[3]),@ptr[3]
  757. vpunpckldq $t2,@Xi[0],@Xi[0]
  758. vpinsrd \$1,(@ptr[7]),$t1,$t1
  759. lea `16*4`(@ptr[7]),@ptr[7]
  760. vpunpckldq $t1,@Xi[2],@Xi[2]
  761. vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
  762. vinserti128 @Xi[2],@Xi[0],@Xi[0]
  763. vmovd `4*$j-16*4`($ptr_n),$t3
  764. vpshufb $tx,@Xi[0],@Xi[0]
  765. ___
  766. $code.=<<___ if ($i<15 && $REG_SZ==32); # just load input
  767. vmovd `4*$j-16*4`(@ptr[1]),$t2
  768. vmovd `4*$j-16*4`(@ptr[5]),$t1
  769. vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
  770. vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
  771. vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
  772. vpunpckldq $t2,@Xi[1],@Xi[1]
  773. vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
  774. vpunpckldq $t1,$t3,$t3
  775. ___
  776. $code.=<<___ if ($i<14);
  777. vpaddd $K,$e,$e # e+=K_00_19
  778. vpslld \$5,$a,$t2
  779. vpandn $d,$b,$t1
  780. vpand $c,$b,$t0
  781. vmovdqa @Xi[0],`&Xi_off($i)`
  782. vpaddd @Xi[0],$e,$e # e+=X[i]
  783. $vpack $t3,@Xi[1],@Xi[1]
  784. vpsrld \$27,$a,$t3
  785. vpxor $t1,$t0,$t0 # Ch(b,c,d)
  786. vmovd `4*$k-16*4`(@ptr[0]),@Xi[2]
  787. vpslld \$30,$b,$t1
  788. vpor $t3,$t2,$t2 # rol(a,5)
  789. vmovd `4*$k-16*4`($ptr_n),$t3
  790. vpaddd $t0,$e,$e # e+=Ch(b,c,d)
  791. vpsrld \$2,$b,$b
  792. vpaddd $t2,$e,$e # e+=rol(a,5)
  793. vpshufb $tx,@Xi[1],@Xi[1]
  794. vpor $t1,$b,$b # b=rol(b,30)
  795. ___
  796. $code.=<<___ if ($i==14);
  797. vpaddd $K,$e,$e # e+=K_00_19
  798. prefetcht0 63(@ptr[0])
  799. vpslld \$5,$a,$t2
  800. vpandn $d,$b,$t1
  801. vpand $c,$b,$t0
  802. vmovdqa @Xi[0],`&Xi_off($i)`
  803. vpaddd @Xi[0],$e,$e # e+=X[i]
  804. $vpack $t3,@Xi[1],@Xi[1]
  805. vpsrld \$27,$a,$t3
  806. prefetcht0 63(@ptr[1])
  807. vpxor $t1,$t0,$t0 # Ch(b,c,d)
  808. vpslld \$30,$b,$t1
  809. vpor $t3,$t2,$t2 # rol(a,5)
  810. prefetcht0 63(@ptr[2])
  811. vpaddd $t0,$e,$e # e+=Ch(b,c,d)
  812. vpsrld \$2,$b,$b
  813. vpaddd $t2,$e,$e # e+=rol(a,5)
  814. prefetcht0 63(@ptr[3])
  815. vpshufb $tx,@Xi[1],@Xi[1]
  816. vpor $t1,$b,$b # b=rol(b,30)
  817. ___
  818. $code.=<<___ if ($i>=13 && $i<15);
  819. vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
  820. ___
  821. $code.=<<___ if ($i>=15); # apply Xupdate
  822. vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
  823. vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  824. vpaddd $K,$e,$e # e+=K_00_19
  825. vpslld \$5,$a,$t2
  826. vpandn $d,$b,$t1
  827. `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
  828. vpand $c,$b,$t0
  829. vmovdqa @Xi[0],`&Xi_off($i)`
  830. vpaddd @Xi[0],$e,$e # e+=X[i]
  831. vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
  832. vpsrld \$27,$a,$t3
  833. vpxor $t1,$t0,$t0 # Ch(b,c,d)
  834. vpxor @Xi[3],@Xi[1],@Xi[1]
  835. `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
  836. vpslld \$30,$b,$t1
  837. vpor $t3,$t2,$t2 # rol(a,5)
  838. vpaddd $t0,$e,$e # e+=Ch(b,c,d)
  839. `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
  840. vpsrld \$31,@Xi[1],$tx
  841. vpaddd @Xi[1],@Xi[1],@Xi[1]
  842. vpsrld \$2,$b,$b
  843. `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
  844. vpaddd $t2,$e,$e # e+=rol(a,5)
  845. vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
  846. vpor $t1,$b,$b # b=rol(b,30)
  847. ___
  848. push(@Xi,shift(@Xi));
  849. }
# Emit one SHA-1 round for rounds 20..39 (also reused for 60..79, which
# share the same F-function), 4 or 8 lanes in parallel via AVX/AVX2.
# $i is the round number, ($a..$e) the rotating state registers; all
# rounds except the last also run the Xupdate message-schedule step for
# round $j=$i+1: X[j] = rol(X[j-16]^X[j-14]^X[j-8]^X[j-3], 1).
sub BODY_20_39_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
# Common case: interleave e += Parity(b,c,d)+K+X[i]+rol(a,5) and
# b = rol(b,30) with the Xupdate recurrence for the next round.
$code.=<<___ if ($i<79);
	vpxor	@Xi[-2],@Xi[1],@Xi[1]	# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"
	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e		# e+=K_20_39
	vpxor	$b,$d,$t0
___
# X[i] only needs to be spilled while a later round will still read it.
$code.=<<___ if ($i<72);
	vmovdqa	@Xi[0],`&Xi_off($i)`
___
$code.=<<___ if ($i<79);
	vpaddd	@Xi[0],$e,$e		# e+=X[i]
	vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$c,$t0,$t0		# Parity(b,c,d)
	vpxor	@Xi[3],@Xi[1],@Xi[1]
	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2		# rol(a,5)
	vpaddd	$t0,$e,$e		# e+=Parity(b,c,d)
	vpsrld	\$31,@Xi[1],$tx
	vpaddd	@Xi[1],@Xi[1],@Xi[1]
	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e		# e+=rol(a,5)
	vpor	$tx,@Xi[1],@Xi[1]	# rol(@Xi[1],1)
	vpor	$t1,$b,$b		# b=rol(b,30)
___
# Final round (79): no Xupdate and no X[i] spill, just the state update.
$code.=<<___ if ($i==79);
	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e		# e+=K_20_39
	vpxor	$b,$d,$t0
	vpsrld	\$27,$a,$t3
	vpaddd	@Xi[0],$e,$e		# e+=X[i]
	vpxor	$c,$t0,$t0		# Parity(b,c,d)
	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2		# rol(a,5)
	vpaddd	$t0,$e,$e		# e+=Parity(b,c,d)
	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e		# e+=rol(a,5)
	vpor	$t1,$b,$b		# b=rol(b,30)
___
# Rotate the X[] register window for the next round.
push(@Xi,shift(@Xi));
}
# Emit one SHA-1 round for rounds 40..59: F = Maj(b,c,d), accumulated
# into e in two pieces as (c&d) + ((c^d)&b), interleaved with the
# Xupdate step for round $j=$i+1.  Only called for 40 <= $i <= 59,
# so there is a single unconditional code shape.
sub BODY_40_59_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	vpxor	@Xi[-2],@Xi[1],@Xi[1]	# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"
	vpaddd	$K,$e,$e		# e+=K_40_59
	vpslld	\$5,$a,$t2
	vpand	$c,$d,$t1
	vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpaddd	$t1,$e,$e
	vpsrld	\$27,$a,$t3
	vpxor	$c,$d,$t0
	vpxor	@Xi[3],@Xi[1],@Xi[1]
	vmovdqu	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e		# e+=X[i]
	vpor	$t3,$t2,$t2		# rol(a,5)
	vpsrld	\$31,@Xi[1],$tx
	vpand	$b,$t0,$t0
	vpaddd	@Xi[1],@Xi[1],@Xi[1]
	vpslld	\$30,$b,$t1
	vpaddd	$t0,$e,$e		# e+=Maj(b,d,c)
	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e		# e+=rol(a,5)
	vpor	$tx,@Xi[1],@Xi[1]	# rol(@X[1],1)
	vpor	$t1,$b,$b		# b=rol(b,30)
___
# Rotate the X[] register window for the next round.
push(@Xi,shift(@Xi));
}
# Emit sha1_multi_block_avx: the 4-lane (%xmm) AVX flavour of the
# multi-block entry point, signature ($ctx,$inp,$num).
$code.=<<___;
.type	sha1_multi_block_avx,\@function,3
.align	32
sha1_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
# When AVX2 code is also compiled in, dispatch to it if the AVX2
# capability bit (bit 5 of the upper half of the capability word in
# %rcx) is set and there are at least 2 inputs to justify wider lanes.
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
# Prologue: keep the original %rsp in %rax, preserve %rbx/%rbp.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 ABI: %xmm6-%xmm15 are callee-saved and must be spilled.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Aligned frame: 16 X[] slots, one counter slot, then the saved
# original %rsp at offset $REG_SZ*17 and the caller's $num at +8.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	vzeroupper
.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Load up to 4 (pointer, block-count) pairs; $num becomes the maximum
# count, and empty lanes get their pointer redirected at the constant
# table so their loads stay harmless.
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
$code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Load the transposed SHA-1 state (one lane per dword of A..E) and the
# byte-swap mask, then fall into the per-block loop.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx
	vmovdqu	0x00($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20($ctx),$B
	vmovdqu	0x40($ctx),$C
	vmovdqu	0x60($ctx),$D
	vmovdqu	0x80($ctx),$E
	vmovdqu	0x60($Tbl),$tx		# pbswap_mask
	jmp	.Loop_avx
.align	32
.Loop_avx:
___
# 80 rounds; the round constant $K is reloaded every 20 rounds from
# the table around K_XX_XX (-0x20/0x00/0x20/0x40 from $Tbl).
$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	\$1,%ecx
___
# Retire lanes whose remaining block counter has dropped below 1.
for($i=0;$i<4;$i++) {
$code.=<<___;
	cmp	`4*$i`(%rbx),%ecx	# examine counters
	cmovge	$Tbl,@ptr[$i]		# cancel input
___
}
# Masked state update: vpcmpgtd yields all-ones for still-active lanes
# (counter > 0); adding that mask decrements the counters, and ANDing
# it into A..E zeroes retired lanes so only active lanes fold the new
# chaining value into the context.
$code.=<<___;
	vmovdqu	(%rbx),$t0		# pull counters
	vpxor	$t2,$t2,$t2
	vmovdqa	$t0,$t1
	vpcmpgtd $t2,$t1,$t1		# mask value
	vpaddd	$t1,$t0,$t0		# counters--
	vpand	$t1,$A,$A
	vpand	$t1,$B,$B
	vpaddd	0x00($ctx),$A,$A
	vpand	$t1,$C,$C
	vpaddd	0x20($ctx),$B,$B
	vpand	$t1,$D,$D
	vpaddd	0x40($ctx),$C,$C
	vpand	$t1,$E,$E
	vpaddd	0x60($ctx),$D,$D
	vpaddd	0x80($ctx),$E,$E
	vmovdqu	$A,0x00($ctx)
	vmovdqu	$B,0x20($ctx)
	vmovdqu	$C,0x40($ctx)
	vmovdqu	$D,0x60($ctx)
	vmovdqu	$E,0x80($ctx)
	vmovdqu	$t0,(%rbx)		# save counters
	vmovdqu	0x60($Tbl),$tx		# pbswap_mask
	dec	$num
	jnz	.Loop_avx
	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx
.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
# Win64 epilogue: restore callee-saved %xmm registers (offsets mirror
# the prologue spills, expressed relative to the original %rsp in %rax).
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Restore %rbp/%rbx and the caller's stack pointer, then return.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_multi_block_avx,.-sha1_multi_block_avx
___
# AVX2 code path: same algorithm as above, but 8 lanes wide in %ymm
# registers and with %r8-%r15 all claimed as input pointers.
if ($avx>1) {
# Expand the `...` arithmetic accumulated so far while $REG_SZ is still
# 16, before re-targeting the register set and sizes below.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$REG_SZ=32;			# 8 dword lanes per register
@ptr=map("%r$_",(12..15,8..11));
@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
@Xi=map("%ymm$_",(10..14));
$K="%ymm15";
# Prologue: the AVX2 path additionally needs %r12-%r15 preserved.
$code.=<<___;
.type	sha1_multi_block_avx2,\@function,3
.align	32
sha1_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 ABI: spill callee-saved %xmm6-%xmm15.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Aligned frame, saved %rsp at $REG_SZ*17, caller's $num at +8.
# NOTE(review): $num is halved here — the AVX2 path consumes inputs in
# double-width groups; confirm against the dispatch convention at
# _avx2_shortcut.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)	# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K_XX_XX(%rip),$Tbl
	shr	\$1,$num
	vzeroupper
.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Load up to 8 (pointer, block-count) pairs; empty lanes are pointed at
# the constant table so their loads stay harmless.
for($i=0;$i<8;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
$code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Load transposed state and byte-swap mask.  %rbx is repointed at
# 256+128(%rsp) between counter accesses — presumably the X[] scratch
# base used by &Xi_off; confirm against that helper.
$code.=<<___;
	vmovdqu	0x00($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20($ctx),$B
	lea	256+128(%rsp),%rbx
	vmovdqu	0x40($ctx),$C
	vmovdqu	0x60($ctx),$D
	vmovdqu	0x80($ctx),$E
	vmovdqu	0x60($Tbl),$tx		# pbswap_mask
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
___
# 80 rounds, constant switched every 20 rounds; same round bodies as
# the AVX path, now emitting %ymm code because @V etc. were remapped.
$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Retire lanes whose block counter has run out.
for($i=0;$i<8;$i++) {
$code.=<<___;
	cmp	`4*$i`(%rbx),%ecx	# examine counters
	cmovge	$Tbl,@ptr[$i]		# cancel input
___
}
# Masked state update, as in the AVX path.  The outer "grande" loop is
# commented out upstream: the AVX2 path performs a single pass.
$code.=<<___;
	vmovdqu	(%rbx),$t0	# pull counters
	vpxor	$t2,$t2,$t2
	vmovdqa	$t0,$t1
	vpcmpgtd $t2,$t1,$t1	# mask value
	vpaddd	$t1,$t0,$t0	# counters--
	vpand	$t1,$A,$A
	vpand	$t1,$B,$B
	vpaddd	0x00($ctx),$A,$A
	vpand	$t1,$C,$C
	vpaddd	0x20($ctx),$B,$B
	vpand	$t1,$D,$D
	vpaddd	0x40($ctx),$C,$C
	vpand	$t1,$E,$E
	vpaddd	0x60($ctx),$D,$D
	vpaddd	0x80($ctx),$E,$E
	vmovdqu	$A,0x00($ctx)
	vmovdqu	$B,0x20($ctx)
	vmovdqu	$C,0x40($ctx)
	vmovdqu	$D,0x60($ctx)
	vmovdqu	$E,0x80($ctx)
	vmovdqu	$t0,(%rbx)		# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	0x60($Tbl),$tx		# pbswap_mask
	dec	$num
	jnz	.Loop_avx2
	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2
.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
# Win64 epilogue: restore callee-saved %xmm registers.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Restore the six pushed GPRs and the caller's stack pointer.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
___
} }}}
# Round-constant and shuffle tables.  K_XX_XX labels the K_20_39 pair;
# the K_00_19 pair sits 0x20 bytes *before* the label, so the round
# loops address the four constants as -0x20/0x00/0x20/0x40($Tbl) and
# the dword byte-swap mask as 0x60($Tbl).  Each constant is replicated
# across 8 dwords so one load serves both %xmm and %ymm paths.
$code.=<<___;
.align	256
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
K_XX_XX:
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
# Win64 SEH handler for the non-AVX2 entry points: if the fault lies
# between the body and epilogue labels recorded in HandlerData, pull
# the original %rsp (saved by the prologue at frame offset 16*17 from
# the aligned stack pointer, here reachable through %rax which holds
# context->Rsp), restore %rbx/%rbp and %xmm6-%xmm15 into *context,
# copy the CONTEXT record and hand off to RtlVirtualUnwind.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue
	mov	`16*17`(%rax),%rax	# pull saved stack pointer
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
  1346. $code.=<<___ if ($avx>1);
  1347. .type avx2_handler,\@abi-omnipotent
  1348. .align 16
  1349. avx2_handler:
  1350. push %rsi
  1351. push %rdi
  1352. push %rbx
  1353. push %rbp
  1354. push %r12
  1355. push %r13
  1356. push %r14
  1357. push %r15
  1358. pushfq
  1359. sub \$64,%rsp
  1360. mov 120($context),%rax # pull context->Rax
  1361. mov 248($context),%rbx # pull context->Rip
  1362. mov 8($disp),%rsi # disp->ImageBase
  1363. mov 56($disp),%r11 # disp->HandlerData
  1364. mov 0(%r11),%r10d # HandlerData[0]
  1365. lea (%rsi,%r10),%r10 # end of prologue label
  1366. cmp %r10,%rbx # context->Rip<body label
  1367. jb .Lin_prologue
  1368. mov 152($context),%rax # pull context->Rsp
  1369. mov 4(%r11),%r10d # HandlerData[1]
  1370. lea (%rsi,%r10),%r10 # epilogue label
  1371. cmp %r10,%rbx # context->Rip>=epilogue label
  1372. jae .Lin_prologue
  1373. mov `32*17`($context),%rax # pull saved stack pointer
  1374. mov -8(%rax),%rbx
  1375. mov -16(%rax),%rbp
  1376. mov -24(%rax),%r12
  1377. mov -32(%rax),%r13
  1378. mov -40(%rax),%r14
  1379. mov -48(%rax),%r15
  1380. mov %rbx,144($context) # restore context->Rbx
  1381. mov %rbp,160($context) # restore context->Rbp
  1382. mov %r12,216($context) # restore context->R12
  1383. mov %r13,224($context) # restore context->R13
  1384. mov %r14,232($context) # restore context->R14
  1385. mov %r15,240($context) # restore context->R15
  1386. lea -56-10*16(%rax),%rsi
  1387. lea 512($context),%rdi # &context.Xmm6
  1388. mov \$20,%ecx
  1389. .long 0xa548f3fc # cld; rep movsq
  1390. jmp .Lin_prologue
  1391. .size avx2_handler,.-avx2_handler
  1392. ___
# Win64 SEH tables: .pdata maps each entry point's code range to its
# unwind-info record, and .xdata records the language-specific handler
# (type 9) plus the body/epilogue labels it reads as HandlerData[].
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx	# HandlerData[]
___
# The AVX2 frame needs its own handler (extra GPRs, wider frame).
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2	# HandlerData[]
___
}
  1438. ####################################################################
  1439. sub rex {
  1440. local *opcode=shift;
  1441. my ($dst,$src)=@_;
  1442. my $rex=0;
  1443. $rex|=0x04 if ($dst>=8);
  1444. $rex|=0x01 if ($src>=8);
  1445. unshift @opcode,$rex|0x40 if ($rex);
  1446. }
  1447. sub sha1rnds4 {
  1448. if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
  1449. my @opcode=(0x0f,0x3a,0xcc);
  1450. rex(\@opcode,$3,$2);
  1451. push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
  1452. my $c=$1;
  1453. push @opcode,$c=~/^0/?oct($c):$c;
  1454. return ".byte\t".join(',',@opcode);
  1455. } else {
  1456. return "sha1rnds4\t".@_[0];
  1457. }
  1458. }
  1459. sub sha1op38 {
  1460. my $instr = shift;
  1461. my %opcodelet = (
  1462. "sha1nexte" => 0xc8,
  1463. "sha1msg1" => 0xc9,
  1464. "sha1msg2" => 0xca );
  1465. if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
  1466. my @opcode=(0x0f,0x38);
  1467. rex(\@opcode,$2,$1);
  1468. push @opcode,$opcodelet{$instr};
  1469. push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
  1470. return ".byte\t".join(',',@opcode);
  1471. } else {
  1472. return $instr."\t".@_[0];
  1473. }
  1474. }
# Post-process the accumulated code line by line: evaluate remaining
# `...` arithmetic, hand-encode SHA-NI mnemonics via the helpers above,
# and rewrite %ymm operands to %xmm for instructions that only take
# %xmm forms (the AVX2 pass generated them with %ymm names because the
# whole register map was switched to %ymm).  Order matters: the first
# substitution that fires short-circuits the rest via "or".
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or
	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}
# Flush and verify stdout so assembler-bound output is not truncated.
close STDOUT or die "error closing STDOUT: $!";