2
0

sha1-mb-x86_64.pl 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649
  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # Multi-buffer SHA1 procedure processes n buffers in parallel by
  15. # placing buffer data to designated lane of SIMD register. n is
  16. # naturally limited to 4 on pre-AVX2 processors and to 8 on
  17. # AVX2-capable processors such as Haswell.
  18. #
  19. #               this    +aesni(i)        sha1    aesni-sha1  gain(iv)
  20. # -------------------------------------------------------------------
  21. # Westmere(ii)  10.7/n  +1.28=3.96(n=4)  5.30    6.66        +68%
  22. # Atom(ii)      18.1/n  +3.93=8.46(n=4)  9.37    12.8        +51%
  23. # Sandy Bridge  (8.16   +5.15=13.3)/n    4.99    5.98        +80%
  24. # Ivy Bridge    (8.08   +5.14=13.2)/n    4.60    5.54        +68%
  25. # Haswell(iii)  (8.96   +5.00=14.0)/n    3.57    4.55        +160%
  26. # Skylake       (8.70   +5.00=13.7)/n    3.64    4.20        +145%
  27. # Bulldozer     (9.76   +5.76=15.5)/n    5.95    6.37        +64%
  28. #
  29. # (i) multi-block CBC encrypt with 128-bit key;
  30. # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
  31. # because of lower AES-NI instruction throughput;
  32. # (iii) "this" is for n=8, when we gather twice as much data, result
  33. # for n=4 is 8.00+4.44=12.4;
  34. # (iv) presented improvement coefficients are asymptotic limits and
  35. # in real-life application are somewhat lower, e.g. for 2KB
  36. # fragments they range from 30% to 100% (on Haswell);
  37. # $output is the last argument if it looks like a file (it has an extension)
  38. # $flavour is the first argument if it doesn't look like a file
  # Command-line handling: the last argument is taken as $output when it
  # ends in ".<word>", the first as $flavour when it contains no dot.
  39. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  40. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # $win64 selects the extra %xmm6-%xmm15 save/restore sequences below.
  41. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # $dir is the directory part of $0 (including the trailing separator);
  # the translator is looked up next to this script first, then under
  # ../../perlasm relative to it.
  42. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  43. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  44. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  45. die "can't locate x86_64-xlate.pl";
  46. push(@INC,"${dir}","${dir}../../perlasm");
  47. require "x86_64-support.pl";
  # pointer_size() is not defined in this file (presumably it comes from
  # x86_64-support.pl required just above -- TODO confirm).  Its result is
  # used further down to size one element of the inp[] array.
  48. $ptr_size=&pointer_size($flavour);
  # $avx stays 0 unless one of the probes below recognises the assembler.
  # Each probe sums two version comparisons, so the result is 0, 1 or 2.
  # A non-zero value enables the _avx_shortcut dispatch and the AVX code
  # section; what level 2 adds is not visible in this part of the file.
  49. $avx=0;
  # Probe 1: GNU assembler, queried through the compiler named in $ENV{CC}.
  50. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  51. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  52. $avx = ($1>=2.19) + ($1>=2.22);
  53. }
  # Probe 2: NASM, only for Win64 targets and only if nothing matched yet.
  54. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  55. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  56. $avx = ($1>=2.09) + ($1>=2.10);
  57. }
  # Probe 3: Microsoft ml64, keyed on the major version number only.
  58. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  59. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  60. $avx = ($1>=10) + ($1>=11);
  61. }
  # Probe 4: clang/LLVM; the version is the second capture group here.
  62. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  63. $avx = ($2>=3.0) + ($2>3.0);
  64. }
  # Everything printed to STDOUT from here on is piped through the
  # translator script, invoked with the same perl binary ($^X).
  65. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  66. or die "can't call $xlate: $!";
  67. *STDOUT=*OUT;
  68. # void sha1_multi_block (
  69. # struct { unsigned int A[8];
  70. # unsigned int B[8];
  71. # unsigned int C[8];
  72. # unsigned int D[8];
  73. # unsigned int E[8]; } *ctx,
  74. # struct { void *ptr; int blocks; } inp[8],
  75. # int num); /* 1 or 2 */
  76. #
  # Register assignment for the 4-lane SSE code path.
  # $ctx/$inp/$num hold the three arguments of sha1_multi_block (see the
  # prototype comment above); @ptr are the four per-lane input pointers;
  # $Tbl is loaded with the address of K_XX_XX in the function body.
  77. $ctx="%rdi"; # 1st arg
  78. $inp="%rsi"; # 2nd arg
  79. $num="%edx";
  80. @ptr=map("%r$_",(8..11));
  81. $Tbl="%rbp";
  # Size in bytes of one inp[] element: two pointer-sized slots.
  82. $inp_elm_size=2*$ptr_size;
  # Initial mapping: state in %xmm0-4, temporaries in %xmm5-9, message
  # words in %xmm10-14.  It is overridden unconditionally by the if (1)
  # block below; only $K keeps the value assigned here.
  83. @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
  84. ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
  85. @Xi=map("%xmm$_",(10..14));
  86. $K="%xmm15";
  # Effective mapping: message words in %xmm0-4, $tx and $t0..$t3 in
  # %xmm5-9, state in %xmm10-14.
  87. if (1) {
  88. # Atom-specific optimization aiming to eliminate pshufb with high
  89. # registers [and thus get rid of 48 cycles accumulated penalty]
  90. @Xi=map("%xmm$_",(0..4));
  91. ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
  92. @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
  93. }
  # Width in bytes of one SIMD register as used by Xi_off() and by the
  # stack-frame arithmetic below.
  94. $REG_SZ=16;
  # Xi_off($index) -- build the memory operand for slot $index of the
  # 16-entry message-schedule ring kept on the stack.
  #
  # The index is reduced modulo 16 and scaled by $REG_SZ to get a byte
  # offset.  Offsets below 256 are expressed relative to %rax with a -128
  # bias; larger offsets are expressed relative to %rbx with the first 256
  # bytes and the same bias subtracted.  The arithmetic is left in the
  # returned string as written (e.g. "16-128(%rax)").
  sub Xi_off {
      my ($slot) = @_;
      my $byte_off = ($slot % 16) * $REG_SZ;

      return $byte_off . '-128(%rax)' if $byte_off < 256;
      return $byte_off . '-256-128(%rbx)';
  }
  # BODY_00_19($i,$a,$b,$c,$d,$e) -- append the code for round $i (0..19)
  # of the 4-lane SSE path to $code.
  #
  # $a..$e are the names of the registers currently holding the working
  # variables; the caller rotates @V after every call.  Which heredocs are
  # emitted depends on $i:
  #   $i==0   extra preamble gathering word 0 from the four input streams
  #           and advancing each pointer by 64 bytes;
  #   $i<14   round body interleaved with gathering word $i+1 and the
  #           first loads of word $i+2;
  #   $i==14  round body interleaved with the last gather (word 15) and
  #           prefetches of the next input blocks;
  #   $i>=15  round body interleaved with the message-schedule update
  #           that produces the next @Xi[1].
  # Every variant stores @Xi[0] into its Xi_off($i) slot.  The sub ends by
  # rotating @Xi left by one so that @Xi[0] is the word for the next round.
  100. sub BODY_00_19 {
  101. my ($i,$a,$b,$c,$d,$e)=@_;
  102. my $j=$i+1;
  103. my $k=$i+2;
  104. # Loads are performed 2+3/4 iterations in advance. 3/4 means that out
  105. # of 4 words you would expect to be loaded per given iteration one is
  106. # spilled to next iteration. In other words indices in four input
  107. # streams are distributed as following:
  108. #
  109. # $i==0: 0,0,0,0,1,1,1,1,2,2,2,
  110. # $i==1: 2,3,3,3,
  111. # $i==2: 3,4,4,4,
  112. # ...
  113. # $i==13: 14,15,15,15,
  114. # $i==14: 15
  115. #
  116. # Then at $i==15 Xupdate is applied one iteration in advance...
  # Preamble, first round only.
  117. $code.=<<___ if ($i==0);
  118. movd (@ptr[0]),@Xi[0]
  119. lea `16*4`(@ptr[0]),@ptr[0]
  120. movd (@ptr[1]),@Xi[2] # borrow @Xi[2]
  121. lea `16*4`(@ptr[1]),@ptr[1]
  122. movd (@ptr[2]),@Xi[3] # borrow @Xi[3]
  123. lea `16*4`(@ptr[2]),@ptr[2]
  124. movd (@ptr[3]),@Xi[4] # borrow @Xi[4]
  125. lea `16*4`(@ptr[3]),@ptr[3]
  126. punpckldq @Xi[3],@Xi[0]
  127. movd `4*$j-16*4`(@ptr[0]),@Xi[1]
  128. punpckldq @Xi[4],@Xi[2]
  129. movd `4*$j-16*4`(@ptr[1]),$t3
  130. punpckldq @Xi[2],@Xi[0]
  131. movd `4*$j-16*4`(@ptr[2]),$t2
  132. pshufb $tx,@Xi[0]
  133. ___
  # Rounds 0..13: Ch round plus look-ahead loads for words $j and $k.
  134. $code.=<<___ if ($i<14); # just load input
  135. movd `4*$j-16*4`(@ptr[3]),$t1
  136. punpckldq $t2,@Xi[1]
  137. movdqa $a,$t2
  138. paddd $K,$e # e+=K_00_19
  139. punpckldq $t1,$t3
  140. movdqa $b,$t1
  141. movdqa $b,$t0
  142. pslld \$5,$t2
  143. pandn $d,$t1
  144. pand $c,$t0
  145. punpckldq $t3,@Xi[1]
  146. movdqa $a,$t3
  147. movdqa @Xi[0],`&Xi_off($i)`
  148. paddd @Xi[0],$e # e+=X[i]
  149. movd `4*$k-16*4`(@ptr[0]),@Xi[2]
  150. psrld \$27,$t3
  151. pxor $t1,$t0 # Ch(b,c,d)
  152. movdqa $b,$t1
  153. por $t3,$t2 # rol(a,5)
  154. movd `4*$k-16*4`(@ptr[1]),$t3
  155. pslld \$30,$t1
  156. paddd $t0,$e # e+=Ch(b,c,d)
  157. psrld \$2,$b
  158. paddd $t2,$e # e+=rol(a,5)
  159. pshufb $tx,@Xi[1]
  160. movd `4*$k-16*4`(@ptr[2]),$t2
  161. por $t1,$b # b=rol(b,30)
  162. ___
  # Round 14: same round body, but no loads for word $k; the freed slots
  # carry prefetcht0 hints for the four input pointers instead.
  163. $code.=<<___ if ($i==14); # just load input
  164. movd `4*$j-16*4`(@ptr[3]),$t1
  165. punpckldq $t2,@Xi[1]
  166. movdqa $a,$t2
  167. paddd $K,$e # e+=K_00_19
  168. punpckldq $t1,$t3
  169. movdqa $b,$t1
  170. movdqa $b,$t0
  171. pslld \$5,$t2
  172. prefetcht0 63(@ptr[0])
  173. pandn $d,$t1
  174. pand $c,$t0
  175. punpckldq $t3,@Xi[1]
  176. movdqa $a,$t3
  177. movdqa @Xi[0],`&Xi_off($i)`
  178. paddd @Xi[0],$e # e+=X[i]
  179. psrld \$27,$t3
  180. pxor $t1,$t0 # Ch(b,c,d)
  181. movdqa $b,$t1
  182. prefetcht0 63(@ptr[1])
  183. por $t3,$t2 # rol(a,5)
  184. pslld \$30,$t1
  185. paddd $t0,$e # e+=Ch(b,c,d)
  186. prefetcht0 63(@ptr[2])
  187. psrld \$2,$b
  188. paddd $t2,$e # e+=rol(a,5)
  189. pshufb $tx,@Xi[1]
  190. prefetcht0 63(@ptr[3])
  191. por $t1,$b # b=rol(b,30)
  192. ___
  # Rounds 13 and 14 additionally preload the stored word that the first
  # schedule updates will need.
  193. $code.=<<___ if ($i>=13 && $i<15);
  194. movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
  195. ___
  # Rounds 15..19: Ch round interleaved with the schedule update of
  # @Xi[1]; $tx is reused as scratch for the rotate-by-one.
  196. $code.=<<___ if ($i>=15); # apply Xupdate
  197. pxor @Xi[-2],@Xi[1] # "X[13]"
  198. movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  199. movdqa $a,$t2
  200. pxor `&Xi_off($j+8)`,@Xi[1]
  201. paddd $K,$e # e+=K_00_19
  202. movdqa $b,$t1
  203. pslld \$5,$t2
  204. pxor @Xi[3],@Xi[1]
  205. movdqa $b,$t0
  206. pandn $d,$t1
  207. movdqa @Xi[1],$tx
  208. pand $c,$t0
  209. movdqa $a,$t3
  210. psrld \$31,$tx
  211. paddd @Xi[1],@Xi[1]
  212. movdqa @Xi[0],`&Xi_off($i)`
  213. paddd @Xi[0],$e # e+=X[i]
  214. psrld \$27,$t3
  215. pxor $t1,$t0 # Ch(b,c,d)
  216. movdqa $b,$t1
  217. por $t3,$t2 # rol(a,5)
  218. pslld \$30,$t1
  219. paddd $t0,$e # e+=Ch(b,c,d)
  220. psrld \$2,$b
  221. paddd $t2,$e # e+=rol(a,5)
  222. por $tx,@Xi[1] # rol \$1,@Xi[1]
  223. por $t1,$b # b=rol(b,30)
  224. ___
  # Rotate the message-word register ring for the next round.
  225. push(@Xi,shift(@Xi));
  226. }
  # BODY_20_39($i,$a,$b,$c,$d,$e) -- append the code for one Parity round
  # of the 4-lane SSE path to $code.  The main script calls it for rounds
  # 20..39 and again for rounds 60..79, after loading the matching
  # constant into $K.
  #
  # For $i<79 the round is interleaved with the message-schedule update
  # of @Xi[1].  The store of @Xi[0] to its Xi_off($i) slot is emitted only
  # for $i<72.  Round 79 is a plain round with no schedule work.  The sub
  # ends by rotating @Xi left by one.
  227. sub BODY_20_39 {
  228. my ($i,$a,$b,$c,$d,$e)=@_;
  229. my $j=$i+1;
  # First half of a round with schedule update.
  230. $code.=<<___ if ($i<79);
  231. pxor @Xi[-2],@Xi[1] # "X[13]"
  232. movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  233. movdqa $a,$t2
  234. movdqa $d,$t0
  235. pxor `&Xi_off($j+8)`,@Xi[1]
  236. paddd $K,$e # e+=K_20_39
  237. pslld \$5,$t2
  238. pxor $b,$t0
  239. movdqa $a,$t3
  240. ___
  # Write-back of the current word, skipped for the last eight rounds.
  241. $code.=<<___ if ($i<72);
  242. movdqa @Xi[0],`&Xi_off($i)`
  243. ___
  # Second half of a round with schedule update.
  244. $code.=<<___ if ($i<79);
  245. paddd @Xi[0],$e # e+=X[i]
  246. pxor @Xi[3],@Xi[1]
  247. psrld \$27,$t3
  248. pxor $c,$t0 # Parity(b,c,d)
  249. movdqa $b,$t1
  250. pslld \$30,$t1
  251. movdqa @Xi[1],$tx
  252. por $t3,$t2 # rol(a,5)
  253. psrld \$31,$tx
  254. paddd $t0,$e # e+=Parity(b,c,d)
  255. paddd @Xi[1],@Xi[1]
  256. psrld \$2,$b
  257. paddd $t2,$e # e+=rol(a,5)
  258. por $tx,@Xi[1] # rol(@Xi[1],1)
  259. por $t1,$b # b=rol(b,30)
  260. ___
  # Final round: no further message words are needed.
  261. $code.=<<___ if ($i==79);
  262. movdqa $a,$t2
  263. paddd $K,$e # e+=K_20_39
  264. movdqa $d,$t0
  265. pslld \$5,$t2
  266. pxor $b,$t0
  267. movdqa $a,$t3
  268. paddd @Xi[0],$e # e+=X[i]
  269. psrld \$27,$t3
  270. movdqa $b,$t1
  271. pxor $c,$t0 # Parity(b,c,d)
  272. pslld \$30,$t1
  273. por $t3,$t2 # rol(a,5)
  274. paddd $t0,$e # e+=Parity(b,c,d)
  275. psrld \$2,$b
  276. paddd $t2,$e # e+=rol(a,5)
  277. por $t1,$b # b=rol(b,30)
  278. ___
  # Rotate the message-word register ring for the next round.
  279. push(@Xi,shift(@Xi));
  280. }
  # BODY_40_59($i,$a,$b,$c,$d,$e) -- append the code for one Maj round
  # (rounds 40..59) of the 4-lane SSE path to $code.
  #
  # The majority term is added to $e in two pieces: first c&d (via $t1),
  # then (c^d)&b (via $t0).  The round is interleaved with the
  # message-schedule update of @Xi[1], and @Xi[0] is stored to its
  # Xi_off($i) slot.  The sub ends by rotating @Xi left by one.
  281. sub BODY_40_59 {
  282. my ($i,$a,$b,$c,$d,$e)=@_;
  283. my $j=$i+1;
  284. $code.=<<___;
  285. pxor @Xi[-2],@Xi[1] # "X[13]"
  286. movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  287. movdqa $a,$t2
  288. movdqa $d,$t1
  289. pxor `&Xi_off($j+8)`,@Xi[1]
  290. pxor @Xi[3],@Xi[1]
  291. paddd $K,$e # e+=K_40_59
  292. pslld \$5,$t2
  293. movdqa $a,$t3
  294. pand $c,$t1
  295. movdqa $d,$t0
  296. movdqa @Xi[1],$tx
  297. psrld \$27,$t3
  298. paddd $t1,$e
  299. pxor $c,$t0
  300. movdqa @Xi[0],`&Xi_off($i)`
  301. paddd @Xi[0],$e # e+=X[i]
  302. por $t3,$t2 # rol(a,5)
  303. psrld \$31,$tx
  304. pand $b,$t0
  305. movdqa $b,$t1
  306. pslld \$30,$t1
  307. paddd @Xi[1],@Xi[1]
  308. paddd $t0,$e # e+=Maj(b,d,c)
  309. psrld \$2,$b
  310. paddd $t2,$e # e+=rol(a,5)
  311. por $tx,@Xi[1] # rol(@X[1],1)
  312. por $t1,$b # b=rol(b,30)
  313. ___
  # Rotate the message-word register ring for the next round.
  314. push(@Xi,shift(@Xi));
  315. }
  # Public entry point.  The capability word is read from
  # OPENSSL_ia32cap_P+4 into %rcx; if bit 61 is set control transfers to
  # _shaext_shortcut (the SHA-extension implementation further down).
  316. $code.=<<___;
  317. .text
  318. .extern OPENSSL_ia32cap_P
  319. .globl sha1_multi_block
  320. .type sha1_multi_block,\@function,3
  321. .align 32
  322. sha1_multi_block:
  323. .cfi_startproc
  324. mov OPENSSL_ia32cap_P+4(%rip),%rcx
  325. bt \$61,%rcx # check SHA bit
  326. jc _shaext_shortcut
  327. ___
  # Emitted only when the assembler probe enabled $avx: bit 28 of the same
  # capability word diverts to _avx_shortcut, which is defined outside the
  # part of the file shown here.
  328. $code.=<<___ if ($avx);
  329. test \$`1<<28`,%ecx
  330. jnz _avx_shortcut
  331. ___
  # Prologue of the 4-lane SSE path: remember the incoming stack pointer
  # in %rax (which also becomes the CFA register) and save the two
  # callee-saved registers the function uses, %rbx and %rbp.
  #
  # Each push is paired with a .cfi_push naming the register that was
  # actually pushed, so the unwinder can recover both.  The epilogue
  # restores them from -8(%rax) and -16(%rax) and issues matching
  # .cfi_restore directives for %rbx and %rbp.
  $code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
  # Win64 only: reserve 0xa8 bytes below the two pushed registers and save
  # %xmm6-%xmm15 there.  The first four are addressed from the new %rsp,
  # the rest from %rax (the original stack pointer).
  340. $code.=<<___ if ($win64);
  341. lea -0xa8(%rsp),%rsp
  342. movaps %xmm6,(%rsp)
  343. movaps %xmm7,0x10(%rsp)
  344. movaps %xmm8,0x20(%rsp)
  345. movaps %xmm9,0x30(%rsp)
  346. movaps %xmm10,-0x78(%rax)
  347. movaps %xmm11,-0x68(%rax)
  348. movaps %xmm12,-0x58(%rax)
  349. movaps %xmm13,-0x48(%rax)
  350. movaps %xmm14,-0x38(%rax)
  351. movaps %xmm15,-0x28(%rax)
  352. ___
  # Stack frame: $REG_SZ*18 bytes, aligned down to 256.  The original
  # %rsp is kept at $REG_SZ*17(%rsp) and the caller's $num next to it at
  # $REG_SZ*17+8(%rsp).  %rbx points at $REG_SZ*16(%rsp), where the
  # per-lane block counters live.  $num itself is zeroed so the loop below
  # can use it to track the largest block count.
  353. $code.=<<___;
  354. sub \$`$REG_SZ*18`,%rsp
  355. and \$-256,%rsp
  356. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  357. .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
  358. .Lbody:
  359. lea K_XX_XX(%rip),$Tbl
  360. lea `$REG_SZ*16`(%rsp),%rbx
  361. .Loop_grande:
  362. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  363. xor $num,$num
  364. ___
  # One iteration per lane: load the input pointer and block count from
  # inp[$i], fold the count into the running maximum in $num, store it as
  # the lane's counter, and redirect the pointer to $Tbl when the count is
  # not positive.  pointer_register() is defined outside this file; it is
  # given the flavour and the lane's 64-bit register name.
  365. for($i=0;$i<4;$i++) {
  366. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  367. $code.=<<___;
  368. # input pointer
  369. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  370. # number of blocks
  371. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  372. cmp $num,%ecx
  373. cmovg %ecx,$num # find maximum
  374. test %ecx,%ecx
  375. mov %ecx,`4*$i`(%rbx) # initialize counters
  376. cmovle $Tbl,@ptr[$i] # cancel input
  377. ___
  378. }
  # Nothing to do when no lane has a positive count.  Otherwise load the
  # five state vectors from the context, point %rax at 128(%rsp) (the base
  # Xi_off() operands are biased against), and load the byte-swap mask and
  # the first round constant from the table.
  379. $code.=<<___;
  380. test $num,$num
  381. jz .Ldone
  382. movdqu 0x00($ctx),$A # load context
  383. lea 128(%rsp),%rax
  384. movdqu 0x20($ctx),$B
  385. movdqu 0x40($ctx),$C
  386. movdqu 0x60($ctx),$D
  387. movdqu 0x80($ctx),$E
  388. movdqa 0x60($Tbl),$tx # pbswap_mask
  389. movdqa -0x20($Tbl),$K # K_00_19
  390. jmp .Loop
  391. .align 32
  392. .Loop:
  393. ___
  # Emit all 80 rounds fully unrolled.  After each round @V is rotated
  # right by one so the register that was $e becomes $a for the next
  # round.  $K is reloaded from the table before each 20-round group;
  # rounds 60..79 reuse BODY_20_39 with a different constant.
  394. for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  395. $code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
  396. for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  397. $code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59
  398. for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
  399. $code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79
  400. for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  # End of one block per lane.  Lanes whose counter is <= 1 get their
  # pointer redirected to $Tbl.  A per-lane mask (counter > 0) is added to
  # the counters, decrementing the active ones, and is ANDed into the new
  # state before the previous context values are added.  Lanes with a zero
  # mask therefore write back their previous context unchanged.  $tx and
  # $K are reloaded for the next block, and the inner loop repeats $num
  # times.  Afterwards the saved $num is restored, $ctx and $inp advance
  # to the next group of lanes and the outer loop repeats.
  401. $code.=<<___;
  402. movdqa (%rbx),@Xi[0] # pull counters
  403. mov \$1,%ecx
  404. cmp 4*0(%rbx),%ecx # examine counters
  405. pxor $t2,$t2
  406. cmovge $Tbl,@ptr[0] # cancel input
  407. cmp 4*1(%rbx),%ecx
  408. movdqa @Xi[0],@Xi[1]
  409. cmovge $Tbl,@ptr[1]
  410. cmp 4*2(%rbx),%ecx
  411. pcmpgtd $t2,@Xi[1] # mask value
  412. cmovge $Tbl,@ptr[2]
  413. cmp 4*3(%rbx),%ecx
  414. paddd @Xi[1],@Xi[0] # counters--
  415. cmovge $Tbl,@ptr[3]
  416. movdqu 0x00($ctx),$t0
  417. pand @Xi[1],$A
  418. movdqu 0x20($ctx),$t1
  419. pand @Xi[1],$B
  420. paddd $t0,$A
  421. movdqu 0x40($ctx),$t2
  422. pand @Xi[1],$C
  423. paddd $t1,$B
  424. movdqu 0x60($ctx),$t3
  425. pand @Xi[1],$D
  426. paddd $t2,$C
  427. movdqu 0x80($ctx),$tx
  428. pand @Xi[1],$E
  429. movdqu $A,0x00($ctx)
  430. paddd $t3,$D
  431. movdqu $B,0x20($ctx)
  432. paddd $tx,$E
  433. movdqu $C,0x40($ctx)
  434. movdqu $D,0x60($ctx)
  435. movdqu $E,0x80($ctx)
  436. movdqa @Xi[0],(%rbx) # save counters
  437. movdqa 0x60($Tbl),$tx # pbswap_mask
  438. movdqa -0x20($Tbl),$K # K_00_19
  439. dec $num
  440. jnz .Loop
  441. mov `$REG_SZ*17+8`(%rsp),$num
  442. lea $REG_SZ($ctx),$ctx
  443. lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
  444. dec $num
  445. jnz .Loop_grande
  446. .Ldone:
  447. mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  448. .cfi_def_cfa %rax,8
  449. ___
  # Win64 only: restore %xmm6-%xmm15.  All offsets are relative to the
  # original stack pointer in %rax; -0xb8(%rax) is the slot that was
  # written as (%rsp) in the prologue.
  450. $code.=<<___ if ($win64);
  451. movaps -0xb8(%rax),%xmm6
  452. movaps -0xa8(%rax),%xmm7
  453. movaps -0x98(%rax),%xmm8
  454. movaps -0x88(%rax),%xmm9
  455. movaps -0x78(%rax),%xmm10
  456. movaps -0x68(%rax),%xmm11
  457. movaps -0x58(%rax),%xmm12
  458. movaps -0x48(%rax),%xmm13
  459. movaps -0x38(%rax),%xmm14
  460. movaps -0x28(%rax),%xmm15
  461. ___
  # Common epilogue: reload %rbp and %rbx from just below the original
  # stack pointer, then restore %rsp from %rax and return.
  462. $code.=<<___;
  463. mov -16(%rax),%rbp
  464. .cfi_restore %rbp
  465. mov -8(%rax),%rbx
  466. .cfi_restore %rbx
  467. lea (%rax),%rsp
  468. .cfi_def_cfa_register %rsp
  469. .Lepilogue:
  470. ret
  471. .cfi_endproc
  472. .size sha1_multi_block,.-sha1_multi_block
  473. ___
  # SHA-extension implementation, reached from sha1_multi_block through
  # the _shaext_shortcut label.  It processes two input streams at a time:
  # every register role exists twice, once for stream 0 (suffix 0) and
  # once for stream 1 (suffix 1).  The lexicals below are private to this
  # block.
  474. {{{
  475. my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
  476. my @MSG0=map("%xmm$_",(4..7));
  477. my @MSG1=map("%xmm$_",(11..14));
  # Prologue: same register saves as the SSE path.
  478. $code.=<<___;
  479. .type sha1_multi_block_shaext,\@function,3
  480. .align 32
  481. sha1_multi_block_shaext:
  482. .cfi_startproc
  483. _shaext_shortcut:
  484. mov %rsp,%rax
  485. .cfi_def_cfa_register %rax
  486. push %rbx
  487. .cfi_push %rbx
  488. push %rbp
  489. .cfi_push %rbp
  490. ___
  # Win64 only: save %xmm6-%xmm15 below the pushed registers.
  491. $code.=<<___ if ($win64);
  492. lea -0xa8(%rsp),%rsp
  493. movaps %xmm6,(%rsp)
  494. movaps %xmm7,0x10(%rsp)
  495. movaps %xmm8,0x20(%rsp)
  496. movaps %xmm9,0x30(%rsp)
  497. movaps %xmm10,-0x78(%rax)
  498. movaps %xmm11,-0x68(%rax)
  499. movaps %xmm12,-0x58(%rax)
  500. movaps %xmm13,-0x48(%rax)
  501. movaps %xmm14,-0x38(%rax)
  502. movaps %xmm15,-0x28(%rax)
  503. ___
  # Frame set-up.  $num is doubled because each outer iteration consumes
  # two inputs, and $ctx is biased by 0x40 so the context accesses below
  # use offsets written as "N-0x40".  %rax is not written anywhere in this
  # function after the prologue, so it still holds the original stack
  # pointer at .Ldone_shaext.
  504. $code.=<<___;
  505. sub \$`$REG_SZ*18`,%rsp
  506. shl \$1,$num # we process pair at a time
  507. and \$-256,%rsp
  508. lea 0x40($ctx),$ctx # size optimization
  509. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  510. .Lbody_shaext:
  511. lea `$REG_SZ*16`(%rsp),%rbx
  512. movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap
  513. .Loop_grande_shaext:
  514. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  515. xor $num,$num
  516. ___
  # One iteration per stream: load pointer and block count, track the
  # maximum count in $num, store the counter, and point a stream with a
  # non-positive count at %rsp.
  517. for($i=0;$i<2;$i++) {
  518. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  519. $code.=<<___;
  520. # input pointer
  521. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  522. # number of blocks
  523. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  524. cmp $num,%ecx
  525. cmovg %ecx,$num # find maximum
  526. test %ecx,%ecx
  527. mov %ecx,`4*$i`(%rbx) # initialize counters
  528. cmovle %rsp,@ptr[$i] # cancel input
  529. ___
  530. }
  # Load the two lanes of the interleaved context and regroup them into
  # one ABCD and one E register per stream, then start the per-block loop.
  # The first eight rounds (two sha1rnds4 per stream) are written out by
  # hand, interleaved with the loads and byte swaps of the message block;
  # the incoming state is parked at 0x40..0x70(%rsp) for the final add.
  531. $code.=<<___;
  532. test $num,$num
  533. jz .Ldone_shaext
  534. movq 0x00-0x40($ctx),$ABCD0 # a1.a0
  535. movq 0x20-0x40($ctx),@MSG0[0]# b1.b0
  536. movq 0x40-0x40($ctx),@MSG0[1]# c1.c0
  537. movq 0x60-0x40($ctx),@MSG0[2]# d1.d0
  538. movq 0x80-0x40($ctx),@MSG0[3]# e1.e0
  539. punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0
  540. punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0
  541. movdqa $ABCD0,$ABCD1
  542. punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0
  543. punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1
  544. pshufd \$0b00111111,@MSG0[3],$E0
  545. pshufd \$0b01111111,@MSG0[3],$E1
  546. pshufd \$0b00011011,$ABCD0,$ABCD0
  547. pshufd \$0b00011011,$ABCD1,$ABCD1
  548. jmp .Loop_shaext
  549. .align 32
  550. .Loop_shaext:
  551. movdqu 0x00(@ptr[0]),@MSG0[0]
  552. movdqu 0x00(@ptr[1]),@MSG1[0]
  553. movdqu 0x10(@ptr[0]),@MSG0[1]
  554. movdqu 0x10(@ptr[1]),@MSG1[1]
  555. movdqu 0x20(@ptr[0]),@MSG0[2]
  556. pshufb $BSWAP,@MSG0[0]
  557. movdqu 0x20(@ptr[1]),@MSG1[2]
  558. pshufb $BSWAP,@MSG1[0]
  559. movdqu 0x30(@ptr[0]),@MSG0[3]
  560. lea 0x40(@ptr[0]),@ptr[0]
  561. pshufb $BSWAP,@MSG0[1]
  562. movdqu 0x30(@ptr[1]),@MSG1[3]
  563. lea 0x40(@ptr[1]),@ptr[1]
  564. pshufb $BSWAP,@MSG1[1]
  565. movdqa $E0,0x50(%rsp) # offload
  566. paddd @MSG0[0],$E0
  567. movdqa $E1,0x70(%rsp)
  568. paddd @MSG1[0],$E1
  569. movdqa $ABCD0,0x40(%rsp) # offload
  570. movdqa $ABCD0,$E0_
  571. movdqa $ABCD1,0x60(%rsp)
  572. movdqa $ABCD1,$E1_
  573. sha1rnds4 \$0,$E0,$ABCD0 # 0-3
  574. sha1nexte @MSG0[1],$E0_
  575. sha1rnds4 \$0,$E1,$ABCD1 # 0-3
  576. sha1nexte @MSG1[1],$E1_
  577. pshufb $BSWAP,@MSG0[2]
  578. prefetcht0 127(@ptr[0])
  579. sha1msg1 @MSG0[1],@MSG0[0]
  580. pshufb $BSWAP,@MSG1[2]
  581. prefetcht0 127(@ptr[1])
  582. sha1msg1 @MSG1[1],@MSG1[0]
  583. pshufb $BSWAP,@MSG0[3]
  584. movdqa $ABCD0,$E0
  585. pshufb $BSWAP,@MSG1[3]
  586. movdqa $ABCD1,$E1
  587. sha1rnds4 \$0,$E0_,$ABCD0 # 4-7
  588. sha1nexte @MSG0[2],$E0
  589. sha1rnds4 \$0,$E1_,$ABCD1 # 4-7
  590. sha1nexte @MSG1[2],$E1
  591. pxor @MSG0[2],@MSG0[0]
  592. sha1msg1 @MSG0[2],@MSG0[1]
  593. pxor @MSG1[2],@MSG1[0]
  594. sha1msg1 @MSG1[2],@MSG1[1]
  595. ___
  # Fourteen generated four-round groups ($i = 2..15).  The sha1rnds4
  # immediate is int($i/5), i.e. 0 for $i=2..4, 1 for 5..9, 2 for 10..14
  # and 3 for 15.  After each group the E/E_ pairs are swapped and both
  # message rings are rotated, so the same template addresses the next
  # registers.  NOTE: the "# 8-11" annotation is part of the template and
  # is therefore emitted unchanged for every group.
  596. for($i=2;$i<20-4;$i++) {
  597. $code.=<<___;
  598. movdqa $ABCD0,$E0_
  599. movdqa $ABCD1,$E1_
  600. sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11
  601. sha1nexte @MSG0[3],$E0_
  602. sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11
  603. sha1nexte @MSG1[3],$E1_
  604. sha1msg2 @MSG0[3],@MSG0[0]
  605. sha1msg2 @MSG1[3],@MSG1[0]
  606. pxor @MSG0[3],@MSG0[1]
  607. sha1msg1 @MSG0[3],@MSG0[2]
  608. pxor @MSG1[3],@MSG1[1]
  609. sha1msg1 @MSG1[3],@MSG1[2]
  610. ___
  611. ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1);
  612. push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
  613. }
  # Rounds 64..79 written out by hand, interleaved with the end-of-block
  # bookkeeping: streams whose counter is <= 1 are pointed at %rsp, a
  # per-stream mask (counter > 0) is built in @MSG1[2]/@MSG1[3], the new
  # state is masked and added to the values parked on the stack, and the
  # counters are decremented through the mask in @MSG0[1].  @MSG0[2] is
  # zeroed early and is the operand of the last two sha1nexte; those two
  # lines spell it $MSG0[2], which interpolates the same array element as
  # @MSG0[2].  After the inner loop the two streams' state is
  # re-interleaved and written back to the context, then $ctx and $inp
  # advance to the next pair.
  614. $code.=<<___;
  615. movdqa $ABCD0,$E0_
  616. movdqa $ABCD1,$E1_
  617. sha1rnds4 \$3,$E0,$ABCD0 # 64-67
  618. sha1nexte @MSG0[3],$E0_
  619. sha1rnds4 \$3,$E1,$ABCD1 # 64-67
  620. sha1nexte @MSG1[3],$E1_
  621. sha1msg2 @MSG0[3],@MSG0[0]
  622. sha1msg2 @MSG1[3],@MSG1[0]
  623. pxor @MSG0[3],@MSG0[1]
  624. pxor @MSG1[3],@MSG1[1]
  625. mov \$1,%ecx
  626. pxor @MSG0[2],@MSG0[2] # zero
  627. cmp 4*0(%rbx),%ecx # examine counters
  628. cmovge %rsp,@ptr[0] # cancel input
  629. movdqa $ABCD0,$E0
  630. movdqa $ABCD1,$E1
  631. sha1rnds4 \$3,$E0_,$ABCD0 # 68-71
  632. sha1nexte @MSG0[0],$E0
  633. sha1rnds4 \$3,$E1_,$ABCD1 # 68-71
  634. sha1nexte @MSG1[0],$E1
  635. sha1msg2 @MSG0[0],@MSG0[1]
  636. sha1msg2 @MSG1[0],@MSG1[1]
  637. cmp 4*1(%rbx),%ecx
  638. cmovge %rsp,@ptr[1]
  639. movq (%rbx),@MSG0[0] # pull counters
  640. movdqa $ABCD0,$E0_
  641. movdqa $ABCD1,$E1_
  642. sha1rnds4 \$3,$E0,$ABCD0 # 72-75
  643. sha1nexte @MSG0[1],$E0_
  644. sha1rnds4 \$3,$E1,$ABCD1 # 72-75
  645. sha1nexte @MSG1[1],$E1_
  646. pshufd \$0x00,@MSG0[0],@MSG1[2]
  647. pshufd \$0x55,@MSG0[0],@MSG1[3]
  648. movdqa @MSG0[0],@MSG0[1]
  649. pcmpgtd @MSG0[2],@MSG1[2]
  650. pcmpgtd @MSG0[2],@MSG1[3]
  651. movdqa $ABCD0,$E0
  652. movdqa $ABCD1,$E1
  653. sha1rnds4 \$3,$E0_,$ABCD0 # 76-79
  654. sha1nexte $MSG0[2],$E0
  655. sha1rnds4 \$3,$E1_,$ABCD1 # 76-79
  656. sha1nexte $MSG0[2],$E1
  657. pcmpgtd @MSG0[2],@MSG0[1] # counter mask
  658. pand @MSG1[2],$ABCD0
  659. pand @MSG1[2],$E0
  660. pand @MSG1[3],$ABCD1
  661. pand @MSG1[3],$E1
  662. paddd @MSG0[1],@MSG0[0] # counters--
  663. paddd 0x40(%rsp),$ABCD0
  664. paddd 0x50(%rsp),$E0
  665. paddd 0x60(%rsp),$ABCD1
  666. paddd 0x70(%rsp),$E1
  667. movq @MSG0[0],(%rbx) # save counters
  668. dec $num
  669. jnz .Loop_shaext
  670. mov `$REG_SZ*17+8`(%rsp),$num
  671. pshufd \$0b00011011,$ABCD0,$ABCD0
  672. pshufd \$0b00011011,$ABCD1,$ABCD1
  673. movdqa $ABCD0,@MSG0[0]
  674. punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0
  675. punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0
  676. punpckhdq $E1,$E0 # e1.e0.xx.xx
  677. movq $ABCD0,0x00-0x40($ctx) # a1.a0
  678. psrldq \$8,$ABCD0
  679. movq @MSG0[0],0x40-0x40($ctx)# c1.c0
  680. psrldq \$8,@MSG0[0]
  681. movq $ABCD0,0x20-0x40($ctx) # b1.b0
  682. psrldq \$8,$E0
  683. movq @MSG0[0],0x60-0x40($ctx)# d1.d0
  684. movq $E0,0x80-0x40($ctx) # e1.e0
  685. lea `$REG_SZ/2`($ctx),$ctx
  686. lea `$inp_elm_size*2`($inp),$inp
  687. dec $num
  688. jnz .Loop_grande_shaext
  689. .Ldone_shaext:
  690. #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  691. ___
  # Win64 only: restore %xmm6-%xmm15 relative to the original stack
  # pointer in %rax.
  692. $code.=<<___ if ($win64);
  693. movaps -0xb8(%rax),%xmm6
  694. movaps -0xa8(%rax),%xmm7
  695. movaps -0x98(%rax),%xmm8
  696. movaps -0x88(%rax),%xmm9
  697. movaps -0x78(%rax),%xmm10
  698. movaps -0x68(%rax),%xmm11
  699. movaps -0x58(%rax),%xmm12
  700. movaps -0x48(%rax),%xmm13
  701. movaps -0x38(%rax),%xmm14
  702. movaps -0x28(%rax),%xmm15
  703. ___
  # Epilogue: restore %rbp, %rbx and %rsp and return.
  704. $code.=<<___;
  705. mov -16(%rax),%rbp
  706. .cfi_restore %rbp
  707. mov -8(%rax),%rbx
  708. .cfi_restore %rbx
  709. lea (%rax),%rsp
  710. .cfi_def_cfa_register %rsp
  711. .Lepilogue_shaext:
  712. ret
  713. .cfi_endproc
  714. .size sha1_multi_block_shaext,.-sha1_multi_block_shaext
  715. ___
  716. }}}
  717. if ($avx) {{{
  # BODY_00_19_avx($i,$a,$b,$c,$d,$e) -- AVX counterpart of BODY_00_19:
  # append the code for round $i (0..19) to $code using three-operand
  # VEX instructions.
  #
  # The sub serves two register widths, selected by $REG_SZ at call time:
  #   $REG_SZ==16  four input streams (@ptr[0..3]);
  #   $REG_SZ==32  eight input streams (@ptr[0..7]).
  # NOTE(review): @ptr has only four entries where it is first assigned,
  # and $REG_SZ is 16 there; the 32-byte variant presumably relies on both
  # being reassigned later in the file -- confirm against the full source.
  # The backquoted prefetcht0 expressions in the $i>=15 template are not
  # evaluated here; they are left in $code as text like the other
  # backquoted expressions.
  # The sub ends by rotating @Xi left by one.
  718. sub BODY_00_19_avx {
  719. my ($i,$a,$b,$c,$d,$e)=@_;
  720. my $j=$i+1;
  721. my $k=$i+2;
  # $vpack merges the two partially gathered halves of a message word;
  # $ptr_n is the stream whose word starts the second half.
  722. my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
  723. my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
  # 128-bit preamble, first round only: gather word 0 and advance the
  # four pointers by 64 bytes.
  724. $code.=<<___ if ($i==0 && $REG_SZ==16);
  725. vmovd (@ptr[0]),@Xi[0]
  726. lea `16*4`(@ptr[0]),@ptr[0]
  727. vmovd (@ptr[1]),@Xi[2] # borrow Xi[2]
  728. lea `16*4`(@ptr[1]),@ptr[1]
  729. vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
  730. lea `16*4`(@ptr[2]),@ptr[2]
  731. vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2]
  732. lea `16*4`(@ptr[3]),@ptr[3]
  733. vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
  734. vpunpckldq @Xi[2],@Xi[0],@Xi[0]
  735. vmovd `4*$j-16*4`($ptr_n),$t3
  736. vpshufb $tx,@Xi[0],@Xi[0]
  737. ___
  # 128-bit gather of the remaining two streams for word $j.
  738. $code.=<<___ if ($i<15 && $REG_SZ==16); # just load input
  739. vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
  740. vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
  741. ___
  # 256-bit preamble, first round only: gather word 0 from eight streams
  # and advance all eight pointers by 64 bytes.
  742. $code.=<<___ if ($i==0 && $REG_SZ==32);
  743. vmovd (@ptr[0]),@Xi[0]
  744. lea `16*4`(@ptr[0]),@ptr[0]
  745. vmovd (@ptr[4]),@Xi[2] # borrow Xi[2]
  746. lea `16*4`(@ptr[4]),@ptr[4]
  747. vmovd (@ptr[1]),$t2
  748. lea `16*4`(@ptr[1]),@ptr[1]
  749. vmovd (@ptr[5]),$t1
  750. lea `16*4`(@ptr[5]),@ptr[5]
  751. vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
  752. lea `16*4`(@ptr[2]),@ptr[2]
  753. vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2]
  754. lea `16*4`(@ptr[6]),@ptr[6]
  755. vpinsrd \$1,(@ptr[3]),$t2,$t2
  756. lea `16*4`(@ptr[3]),@ptr[3]
  757. vpunpckldq $t2,@Xi[0],@Xi[0]
  758. vpinsrd \$1,(@ptr[7]),$t1,$t1
  759. lea `16*4`(@ptr[7]),@ptr[7]
  760. vpunpckldq $t1,@Xi[2],@Xi[2]
  761. vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
  762. vinserti128 @Xi[2],@Xi[0],@Xi[0]
  763. vmovd `4*$j-16*4`($ptr_n),$t3
  764. vpshufb $tx,@Xi[0],@Xi[0]
  765. ___
  # 256-bit gather of the remaining six streams for word $j.
  766. $code.=<<___ if ($i<15 && $REG_SZ==32); # just load input
  767. vmovd `4*$j-16*4`(@ptr[1]),$t2
  768. vmovd `4*$j-16*4`(@ptr[5]),$t1
  769. vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
  770. vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
  771. vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
  772. vpunpckldq $t2,@Xi[1],@Xi[1]
  773. vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
  774. vpunpckldq $t1,$t3,$t3
  775. ___
  # Rounds 0..13: Ch round, final merge of word $j and the first loads
  # for word $k.
  776. $code.=<<___ if ($i<14);
  777. vpaddd $K,$e,$e # e+=K_00_19
  778. vpslld \$5,$a,$t2
  779. vpandn $d,$b,$t1
  780. vpand $c,$b,$t0
  781. vmovdqa @Xi[0],`&Xi_off($i)`
  782. vpaddd @Xi[0],$e,$e # e+=X[i]
  783. $vpack $t3,@Xi[1],@Xi[1]
  784. vpsrld \$27,$a,$t3
  785. vpxor $t1,$t0,$t0 # Ch(b,c,d)
  786. vmovd `4*$k-16*4`(@ptr[0]),@Xi[2]
  787. vpslld \$30,$b,$t1
  788. vpor $t3,$t2,$t2 # rol(a,5)
  789. vmovd `4*$k-16*4`($ptr_n),$t3
  790. vpaddd $t0,$e,$e # e+=Ch(b,c,d)
  791. vpsrld \$2,$b,$b
  792. vpaddd $t2,$e,$e # e+=rol(a,5)
  793. vpshufb $tx,@Xi[1],@Xi[1]
  794. vpor $t1,$b,$b # b=rol(b,30)
  795. ___
  # Round 14: same round body with prefetches of @ptr[0..3] in place of
  # the loads for word $k.
  796. $code.=<<___ if ($i==14);
  797. vpaddd $K,$e,$e # e+=K_00_19
  798. prefetcht0 63(@ptr[0])
  799. vpslld \$5,$a,$t2
  800. vpandn $d,$b,$t1
  801. vpand $c,$b,$t0
  802. vmovdqa @Xi[0],`&Xi_off($i)`
  803. vpaddd @Xi[0],$e,$e # e+=X[i]
  804. $vpack $t3,@Xi[1],@Xi[1]
  805. vpsrld \$27,$a,$t3
  806. prefetcht0 63(@ptr[1])
  807. vpxor $t1,$t0,$t0 # Ch(b,c,d)
  808. vpslld \$30,$b,$t1
  809. vpor $t3,$t2,$t2 # rol(a,5)
  810. prefetcht0 63(@ptr[2])
  811. vpaddd $t0,$e,$e # e+=Ch(b,c,d)
  812. vpsrld \$2,$b,$b
  813. vpaddd $t2,$e,$e # e+=rol(a,5)
  814. prefetcht0 63(@ptr[3])
  815. vpshufb $tx,@Xi[1],@Xi[1]
  816. vpor $t1,$b,$b # b=rol(b,30)
  817. ___
  # Rounds 13 and 14 preload the stored word needed by the first schedule
  # updates.
  818. $code.=<<___ if ($i>=13 && $i<15);
  819. vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
  820. ___
  # Rounds 15..19: Ch round interleaved with the schedule update of
  # @Xi[1].  In the 256-bit variant round 15 also carries the prefetches
  # for @ptr[4..7].
  821. $code.=<<___ if ($i>=15); # apply Xupdate
  822. vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
  823. vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  824. vpaddd $K,$e,$e # e+=K_00_19
  825. vpslld \$5,$a,$t2
  826. vpandn $d,$b,$t1
  827. `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
  828. vpand $c,$b,$t0
  829. vmovdqa @Xi[0],`&Xi_off($i)`
  830. vpaddd @Xi[0],$e,$e # e+=X[i]
  831. vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
  832. vpsrld \$27,$a,$t3
  833. vpxor $t1,$t0,$t0 # Ch(b,c,d)
  834. vpxor @Xi[3],@Xi[1],@Xi[1]
  835. `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
  836. vpslld \$30,$b,$t1
  837. vpor $t3,$t2,$t2 # rol(a,5)
  838. vpaddd $t0,$e,$e # e+=Ch(b,c,d)
  839. `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
  840. vpsrld \$31,@Xi[1],$tx
  841. vpaddd @Xi[1],@Xi[1],@Xi[1]
  842. vpsrld \$2,$b,$b
  843. `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
  844. vpaddd $t2,$e,$e # e+=rol(a,5)
  845. vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
  846. vpor $t1,$b,$b # b=rol(b,30)
  847. ___
  # Rotate the message-word register ring for the next round.
  848. push(@Xi,shift(@Xi));
  849. }
  # BODY_20_39_avx($i,$a,$b,$c,$d,$e): append to $code the AVX instructions
  # for SHA-1 round $i using the Parity function (b^c^d).  The generator
  # loops in this file call it for rounds 20..39 and again for rounds 60..79,
  # loading a different constant into $K before each group.  $a..$e are the
  # register names that currently hold the five working variables.
  850. sub BODY_20_39_avx {
  851. my ($i,$a,$b,$c,$d,$e)=@_;
  852. my $j=$i+1;
  # Every round except the last (79) also starts the message-schedule update
  # of @Xi[1], interleaved with the first part of the round arithmetic.
  853. $code.=<<___ if ($i<79);
  854. vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
  855. vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  856. vpslld \$5,$a,$t2
  857. vpaddd $K,$e,$e # e+=K_20_39
  858. vpxor $b,$d,$t0
  859. ___
  # X[i] is stored to the slot computed by Xi_off($i) only while $i<72.
  # NOTE(review): presumably the words of the last rounds are never
  # reloaded by a later schedule update -- confirm against Xi_off.
  860. $code.=<<___ if ($i<72);
  861. vmovdqa @Xi[0],`&Xi_off($i)`
  862. ___
  # Remainder of a regular round: finish Parity, rol(a,5), rol(b,30) and
  # complete the schedule word with a rotate-left-by-1 built from
  # vpsrld 31 / vpaddd x,x / vpor.
  863. $code.=<<___ if ($i<79);
  864. vpaddd @Xi[0],$e,$e # e+=X[i]
  865. vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
  866. vpsrld \$27,$a,$t3
  867. vpxor $c,$t0,$t0 # Parity(b,c,d)
  868. vpxor @Xi[3],@Xi[1],@Xi[1]
  869. vpslld \$30,$b,$t1
  870. vpor $t3,$t2,$t2 # rol(a,5)
  871. vpaddd $t0,$e,$e # e+=Parity(b,c,d)
  872. vpsrld \$31,@Xi[1],$tx
  873. vpaddd @Xi[1],@Xi[1],@Xi[1]
  874. vpsrld \$2,$b,$b
  875. vpaddd $t2,$e,$e # e+=rol(a,5)
  876. vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1)
  877. vpor $t1,$b,$b # b=rol(b,30)
  878. ___
  # Final round: only the round function, no schedule update and no store.
  879. $code.=<<___ if ($i==79);
  880. vpslld \$5,$a,$t2
  881. vpaddd $K,$e,$e # e+=K_20_39
  882. vpxor $b,$d,$t0
  883. vpsrld \$27,$a,$t3
  884. vpaddd @Xi[0],$e,$e # e+=X[i]
  885. vpxor $c,$t0,$t0 # Parity(b,c,d)
  886. vpslld \$30,$b,$t1
  887. vpor $t3,$t2,$t2 # rol(a,5)
  888. vpaddd $t0,$e,$e # e+=Parity(b,c,d)
  889. vpsrld \$2,$b,$b
  890. vpaddd $t2,$e,$e # e+=rol(a,5)
  891. vpor $t1,$b,$b # b=rol(b,30)
  892. ___
  # Rotate the register-name list so the word just prepared in @Xi[1]
  # becomes @Xi[0] for the next round.
  893. push(@Xi,shift(@Xi));
  894. }
  # BODY_40_59_avx($i,$a,$b,$c,$d,$e): append to $code the AVX instructions
  # for SHA-1 round $i using the majority function; the generator loops in
  # this file call it for rounds 40..59.  $a..$e are the register names that
  # currently hold the five working variables.
  #
  # Maj is added to e in two pieces, (c&d) and ((c^d)&b).  The two terms can
  # never have the same bit set, so adding them separately gives the same
  # result as OR-ing them first.  The same heredoc also stores X[i] and
  # computes the next schedule word in @Xi[1] (rotate-left-by-1 built from
  # vpsrld 31 / vpaddd x,x / vpor).
  895. sub BODY_40_59_avx {
  896. my ($i,$a,$b,$c,$d,$e)=@_;
  897. my $j=$i+1;
  898. $code.=<<___;
  899. vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
  900. vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
  901. vpaddd $K,$e,$e # e+=K_40_59
  902. vpslld \$5,$a,$t2
  903. vpand $c,$d,$t1
  904. vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
  905. vpaddd $t1,$e,$e
  906. vpsrld \$27,$a,$t3
  907. vpxor $c,$d,$t0
  908. vpxor @Xi[3],@Xi[1],@Xi[1]
  909. vmovdqu @Xi[0],`&Xi_off($i)`
  910. vpaddd @Xi[0],$e,$e # e+=X[i]
  911. vpor $t3,$t2,$t2 # rol(a,5)
  912. vpsrld \$31,@Xi[1],$tx
  913. vpand $b,$t0,$t0
  914. vpaddd @Xi[1],@Xi[1],@Xi[1]
  915. vpslld \$30,$b,$t1
  916. vpaddd $t0,$e,$e # e+=Maj(b,d,c)
  917. vpsrld \$2,$b,$b
  918. vpaddd $t2,$e,$e # e+=rol(a,5)
  919. vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1)
  920. vpor $t1,$b,$b # b=rol(b,30)
  921. ___
  # Rotate the register-name list so the word just prepared in @Xi[1]
  # becomes @Xi[0] for the next round.
  922. push(@Xi,shift(@Xi));
  923. }
  # Emit sha1_multi_block_avx.  The pointer-setup and counter loops below
  # run four times, i.e. this flavour handles four input streams per pass.
  924. $code.=<<___;
  925. .type sha1_multi_block_avx,\@function,3
  926. .align 32
  927. sha1_multi_block_avx:
  928. .cfi_startproc
  929. _avx_shortcut:
  930. ___
  # Only when the AVX2 flavour is generated too: with $num>=2 and bit 5 set
  # in the upper half of %rcx, continue at _avx2_shortcut instead.
  # NOTE(review): %rcx is presumably a CPU capability word loaded before
  # _avx_shortcut is reached -- confirm against the dispatching code.
  931. $code.=<<___ if ($avx>1);
  932. shr \$32,%rcx
  933. cmp \$2,$num
  934. jb .Lavx
  935. test \$`1<<5`,%ecx
  936. jnz _avx2_shortcut
  937. jmp .Lavx
  938. .align 32
  939. .Lavx:
  940. ___
  # Prologue: keep the entry %rsp in %rax and save %rbx/%rbp.
  941. $code.=<<___;
  942. mov %rsp,%rax
  943. .cfi_def_cfa_register %rax
  944. push %rbx
  945. .cfi_push %rbx
  946. push %rbp
  947. .cfi_push %rbp
  948. ___
  # Win64 only: save %xmm6-%xmm15 in a 0xa8-byte area below the two pushes
  # (the area starts at -0xb8 relative to %rax, matching the epilogue).
  949. $code.=<<___ if ($win64);
  950. lea -0xa8(%rsp),%rsp
  951. movaps %xmm6,(%rsp)
  952. movaps %xmm7,0x10(%rsp)
  953. movaps %xmm8,0x20(%rsp)
  954. movaps %xmm9,0x30(%rsp)
  955. movaps %xmm10,-0x78(%rax)
  956. movaps %xmm11,-0x68(%rax)
  957. movaps %xmm12,-0x58(%rax)
  958. movaps %xmm13,-0x48(%rax)
  959. movaps %xmm14,-0x38(%rax)
  960. movaps %xmm15,-0x28(%rax)
  961. ___
  # Frame: reserve $REG_SZ*18 bytes aligned to 256.  The entry %rsp is kept
  # at $REG_SZ*17(%rsp), the caller's $num at $REG_SZ*17+8(%rsp), and %rbx
  # points at the per-lane block counters at $REG_SZ*16(%rsp).
  962. $code.=<<___;
  963. sub \$`$REG_SZ*18`, %rsp
  964. and \$-256,%rsp
  965. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  966. .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
  967. .Lbody_avx:
  968. lea K_XX_XX(%rip),$Tbl
  969. lea `$REG_SZ*16`(%rsp),%rbx
  970. vzeroupper
  971. .Loop_grande_avx:
  972. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  973. xor $num,$num
  974. ___
  # Per lane: load the input pointer and the block count, keep the largest
  # count in $num, store the count in the counter array, and point lanes
  # with a count <= 0 at $Tbl instead of their input.
  975. for($i=0;$i<4;$i++) {
  976. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  977. $code.=<<___;
  978. # input pointer
  979. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  980. # number of blocks
  981. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  982. cmp $num,%ecx
  983. cmovg %ecx,$num # find maximum
  984. test %ecx,%ecx
  985. mov %ecx,`4*$i`(%rbx) # initialize counters
  986. cmovle $Tbl,@ptr[$i] # cancel input
  987. ___
  988. }
  # Nothing to do if the largest count is zero; otherwise load the five
  # state vectors (0x20 bytes apart in the context) and the byte-swap mask.
  989. $code.=<<___;
  990. test $num,$num
  991. jz .Ldone_avx
  992. vmovdqu 0x00($ctx),$A # load context
  993. lea 128(%rsp),%rax
  994. vmovdqu 0x20($ctx),$B
  995. vmovdqu 0x40($ctx),$C
  996. vmovdqu 0x60($ctx),$D
  997. vmovdqu 0x80($ctx),$E
  998. vmovdqu 0x60($Tbl),$tx # pbswap_mask
  999. jmp .Loop_avx
  1000. .align 32
  1001. .Loop_avx:
  1002. ___
  # The 80 rounds.  $K is reloaded before each group of 20, and @V is
  # rotated after every round so that each BODY_* call sees the working
  # variables in (a,b,c,d,e) order.
  1003. $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19
  1004. for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
  1005. $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39
  1006. for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
  1007. $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59
  1008. for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
  1009. $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79
  1010. for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
  # Lanes whose counter is <= 1 have no further block: redirect their
  # pointer to $Tbl (cmovge fires when 1 >= counter).
  1011. $code.=<<___;
  1012. mov \$1,%ecx
  1013. ___
  1014. for($i=0;$i<4;$i++) {
  1015. $code.=<<___;
  1016. cmp `4*$i`(%rbx),%ecx # examine counters
  1017. cmovge $Tbl,@ptr[$i] # cancel input
  1018. ___
  1019. }
  # Build an all-ones mask for lanes whose counter is > 0, decrement those
  # counters by adding the mask, zero the freshly computed state of inactive
  # lanes, then add the previous context and store it back.  After the
  # inner loop, advance $ctx/$inp and repeat while the caller's $num,
  # decremented, is non-zero.
  1020. $code.=<<___;
  1021. vmovdqu (%rbx),$t0 # pull counters
  1022. vpxor $t2,$t2,$t2
  1023. vmovdqa $t0,$t1
  1024. vpcmpgtd $t2,$t1,$t1 # mask value
  1025. vpaddd $t1,$t0,$t0 # counters--
  1026. vpand $t1,$A,$A
  1027. vpand $t1,$B,$B
  1028. vpaddd 0x00($ctx),$A,$A
  1029. vpand $t1,$C,$C
  1030. vpaddd 0x20($ctx),$B,$B
  1031. vpand $t1,$D,$D
  1032. vpaddd 0x40($ctx),$C,$C
  1033. vpand $t1,$E,$E
  1034. vpaddd 0x60($ctx),$D,$D
  1035. vpaddd 0x80($ctx),$E,$E
  1036. vmovdqu $A,0x00($ctx)
  1037. vmovdqu $B,0x20($ctx)
  1038. vmovdqu $C,0x40($ctx)
  1039. vmovdqu $D,0x60($ctx)
  1040. vmovdqu $E,0x80($ctx)
  1041. vmovdqu $t0,(%rbx) # save counters
  1042. vmovdqu 0x60($Tbl),$tx # pbswap_mask
  1043. dec $num
  1044. jnz .Loop_avx
  1045. mov `$REG_SZ*17+8`(%rsp),$num
  1046. lea $REG_SZ($ctx),$ctx
  1047. lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
  1048. dec $num
  1049. jnz .Loop_grande_avx
  1050. .Ldone_avx:
  1051. mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  1052. .cfi_def_cfa %rax,8
  1053. vzeroupper
  1054. ___
  # Win64 only: restore %xmm6-%xmm15 from the area written in the prologue.
  1055. $code.=<<___ if ($win64);
  1056. movaps -0xb8(%rax),%xmm6
  1057. movaps -0xa8(%rax),%xmm7
  1058. movaps -0x98(%rax),%xmm8
  1059. movaps -0x88(%rax),%xmm9
  1060. movaps -0x78(%rax),%xmm10
  1061. movaps -0x68(%rax),%xmm11
  1062. movaps -0x58(%rax),%xmm12
  1063. movaps -0x48(%rax),%xmm13
  1064. movaps -0x38(%rax),%xmm14
  1065. movaps -0x28(%rax),%xmm15
  1066. ___
  # Epilogue: restore %rbp/%rbx from below the entry %rsp and return.
  1067. $code.=<<___;
  1068. mov -16(%rax),%rbp
  1069. .cfi_restore %rbp
  1070. mov -8(%rax),%rbx
  1071. .cfi_restore %rbx
  1072. lea (%rax),%rsp
  1073. .cfi_def_cfa_register %rsp
  1074. .Lepilogue_avx:
  1075. ret
  1076. .cfi_endproc
  1077. .size sha1_multi_block_avx,.-sha1_multi_block_avx
  1078. ___
  # Emit sha1_multi_block_avx2 (only when $avx>1).  The same BODY_*_avx
  # generators are reused with 256-bit register names and eight lanes.
  1079. if ($avx>1) {
  # Expand every `...` expression accumulated in $code so far, before
  # $REG_SZ and the register-name variables are redefined just below.
  # NOTE(review): presumably needed because helpers such as Xi_off depend
  # on the current $REG_SZ -- confirm against their definition.
  1080. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  1081. $REG_SZ=32;
  # Eight input pointers: %r12-%r15 followed by %r8-%r11.
  1082. @ptr=map("%r$_",(12..15,8..11));
  1083. @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
  1084. ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
  1085. @Xi=map("%ymm$_",(10..14));
  1086. $K="%ymm15";
  # Prologue: keep the entry %rsp in %rax and save %rbx, %rbp, %r12-%r15.
  1087. $code.=<<___;
  1088. .type sha1_multi_block_avx2,\@function,3
  1089. .align 32
  1090. sha1_multi_block_avx2:
  1091. .cfi_startproc
  1092. _avx2_shortcut:
  1093. mov %rsp,%rax
  1094. .cfi_def_cfa_register %rax
  1095. push %rbx
  1096. .cfi_push %rbx
  1097. push %rbp
  1098. .cfi_push %rbp
  1099. push %r12
  1100. .cfi_push %r12
  1101. push %r13
  1102. .cfi_push %r13
  1103. push %r14
  1104. .cfi_push %r14
  1105. push %r15
  1106. .cfi_push %r15
  1107. ___
  # Win64 only: save %xmm6-%xmm15 in a 0xa8-byte area below the six pushes
  # (the area starts at -0xd8 relative to %rax, matching the epilogue).
  1108. $code.=<<___ if ($win64);
  1109. lea -0xa8(%rsp),%rsp
  1110. movaps %xmm6,(%rsp)
  1111. movaps %xmm7,0x10(%rsp)
  1112. movaps %xmm8,0x20(%rsp)
  1113. movaps %xmm9,0x30(%rsp)
  1114. movaps %xmm10,0x40(%rsp)
  1115. movaps %xmm11,0x50(%rsp)
  1116. movaps %xmm12,-0x78(%rax)
  1117. movaps %xmm13,-0x68(%rax)
  1118. movaps %xmm14,-0x58(%rax)
  1119. movaps %xmm15,-0x48(%rax)
  1120. ___
  # Frame: reserve $REG_SZ*18 bytes aligned to 256; the entry %rsp is kept
  # at $REG_SZ*17(%rsp).  $num is halved here, and %rbx points at the
  # per-lane block counters at $REG_SZ*16(%rsp).
  1121. $code.=<<___;
  1122. sub \$`$REG_SZ*18`, %rsp
  1123. and \$-256,%rsp
  1124. mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
  1125. .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
  1126. .Lbody_avx2:
  1127. lea K_XX_XX(%rip),$Tbl
  1128. shr \$1,$num
  1129. vzeroupper
  1130. .Loop_grande_avx2:
  1131. mov $num,`$REG_SZ*17+8`(%rsp) # original $num
  1132. xor $num,$num
  1133. lea `$REG_SZ*16`(%rsp),%rbx
  1134. ___
  # Per lane (eight of them): load the input pointer and the block count,
  # keep the largest count in $num, store the count in the counter array,
  # and point lanes with a count <= 0 at $Tbl instead of their input.
  1135. for($i=0;$i<8;$i++) {
  1136. $ptr_reg=&pointer_register($flavour,@ptr[$i]);
  1137. $code.=<<___;
  1138. # input pointer
  1139. mov `$inp_elm_size*$i+0`($inp),$ptr_reg
  1140. # number of blocks
  1141. mov `$inp_elm_size*$i+$ptr_size`($inp),%ecx
  1142. cmp $num,%ecx
  1143. cmovg %ecx,$num # find maximum
  1144. test %ecx,%ecx
  1145. mov %ecx,`4*$i`(%rbx) # initialize counters
  1146. cmovle $Tbl,@ptr[$i] # cancel input
  1147. ___
  1148. }
  # Load the five state vectors and the byte-swap mask.  Unlike the AVX
  # flavour there is no early exit on a zero maximum here, and %rbx is
  # re-pointed to 256+128(%rsp) for the duration of the rounds.
  1149. $code.=<<___;
  1150. vmovdqu 0x00($ctx),$A # load context
  1151. lea 128(%rsp),%rax
  1152. vmovdqu 0x20($ctx),$B
  1153. lea 256+128(%rsp),%rbx
  1154. vmovdqu 0x40($ctx),$C
  1155. vmovdqu 0x60($ctx),$D
  1156. vmovdqu 0x80($ctx),$E
  1157. vmovdqu 0x60($Tbl),$tx # pbswap_mask
  1158. jmp .Loop_avx2
  1159. .align 32
  1160. .Loop_avx2:
  1161. ___
  # The 80 rounds.  $K is reloaded before each group of 20, and @V is
  # rotated after every round so that each BODY_* call sees the working
  # variables in (a,b,c,d,e) order.
  1162. $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19
  1163. for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
  1164. $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39
  1165. for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
  1166. $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59
  1167. for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
  1168. $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79
  1169. for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
  # Point %rbx back at the counters; lanes whose counter is <= 1 have no
  # further block, so their pointer is redirected to $Tbl.
  1170. $code.=<<___;
  1171. mov \$1,%ecx
  1172. lea `$REG_SZ*16`(%rsp),%rbx
  1173. ___
  1174. for($i=0;$i<8;$i++) {
  1175. $code.=<<___;
  1176. cmp `4*$i`(%rbx),%ecx # examine counters
  1177. cmovge $Tbl,@ptr[$i] # cancel input
  1178. ___
  1179. }
  # Build an all-ones mask for lanes whose counter is > 0, decrement those
  # counters by adding the mask, zero the freshly computed state of inactive
  # lanes, then add the previous context and store it back.  The outer
  # "grande" loop is emitted commented out, so only one group of eight
  # lanes is processed per call.
  1180. $code.=<<___;
  1181. vmovdqu (%rbx),$t0 # pull counters
  1182. vpxor $t2,$t2,$t2
  1183. vmovdqa $t0,$t1
  1184. vpcmpgtd $t2,$t1,$t1 # mask value
  1185. vpaddd $t1,$t0,$t0 # counters--
  1186. vpand $t1,$A,$A
  1187. vpand $t1,$B,$B
  1188. vpaddd 0x00($ctx),$A,$A
  1189. vpand $t1,$C,$C
  1190. vpaddd 0x20($ctx),$B,$B
  1191. vpand $t1,$D,$D
  1192. vpaddd 0x40($ctx),$C,$C
  1193. vpand $t1,$E,$E
  1194. vpaddd 0x60($ctx),$D,$D
  1195. vpaddd 0x80($ctx),$E,$E
  1196. vmovdqu $A,0x00($ctx)
  1197. vmovdqu $B,0x20($ctx)
  1198. vmovdqu $C,0x40($ctx)
  1199. vmovdqu $D,0x60($ctx)
  1200. vmovdqu $E,0x80($ctx)
  1201. vmovdqu $t0,(%rbx) # save counters
  1202. lea 256+128(%rsp),%rbx
  1203. vmovdqu 0x60($Tbl),$tx # pbswap_mask
  1204. dec $num
  1205. jnz .Loop_avx2
  1206. #mov `$REG_SZ*17+8`(%rsp),$num
  1207. #lea $REG_SZ($ctx),$ctx
  1208. #lea `$inp_elm_size*$REG_SZ/4`($inp),$inp
  1209. #dec $num
  1210. #jnz .Loop_grande_avx2
  1211. .Ldone_avx2:
  1212. mov `$REG_SZ*17`(%rsp),%rax # original %rsp
  1213. .cfi_def_cfa %rax,8
  1214. vzeroupper
  1215. ___
  # Win64 only: restore %xmm6-%xmm15 from the area written in the prologue.
  1216. $code.=<<___ if ($win64);
  1217. movaps -0xd8(%rax),%xmm6
  1218. movaps -0xc8(%rax),%xmm7
  1219. movaps -0xb8(%rax),%xmm8
  1220. movaps -0xa8(%rax),%xmm9
  1221. movaps -0x98(%rax),%xmm10
  1222. movaps -0x88(%rax),%xmm11
  1223. movaps -0x78(%rax),%xmm12
  1224. movaps -0x68(%rax),%xmm13
  1225. movaps -0x58(%rax),%xmm14
  1226. movaps -0x48(%rax),%xmm15
  1227. ___
  # Epilogue: restore the six saved registers from below the entry %rsp.
  1228. $code.=<<___;
  1229. mov -48(%rax),%r15
  1230. .cfi_restore %r15
  1231. mov -40(%rax),%r14
  1232. .cfi_restore %r14
  1233. mov -32(%rax),%r13
  1234. .cfi_restore %r13
  1235. mov -24(%rax),%r12
  1236. .cfi_restore %r12
  1237. mov -16(%rax),%rbp
  1238. .cfi_restore %rbp
  1239. mov -8(%rax),%rbx
  1240. .cfi_restore %rbx
  1241. lea (%rax),%rsp
  1242. .cfi_def_cfa_register %rsp
  1243. .Lepilogue_avx2:
  1244. ret
  1245. .cfi_endproc
  1246. .size sha1_multi_block_avx2,.-sha1_multi_block_avx2
  1247. ___
  1248. } }}}
  # Read-only constant table, aligned to 256 bytes.  Each round constant is
  # repeated eight times (32 bytes).  The K_XX_XX label sits on the K_20_39
  # entry, which is why the generated code addresses K_00_19 at -0x20,
  # K_40_59 at 0x20, K_60_79 at 0x40 and the byte-swap mask at 0x60 relative
  # to the label.
  1249. $code.=<<___;
  1250. .section .rodata align=256
  1251. .align 256
  1252. .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
  1253. .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
  1254. K_XX_XX:
  1255. .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
  1256. .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
  1257. .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
  1258. .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
  1259. .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
  1260. .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
  1261. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
  1262. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
  1263. .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
  1264. .asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1265. .previous
  1266. ___
  # Win64 structured exception handling (SEH) support.
  #
  # Emits two language-specific handlers plus the .pdata/.xdata tables that
  # attach them to the generated functions.  When an exception unwinds
  # through one of those functions the handler puts the non-volatile
  # registers the prologue saved (and the original stack pointer) back into
  # the CONTEXT record, then defers to RtlVirtualUnwind.  HandlerData[] holds
  # the RVAs of the function's "body" and "epilogue" labels; outside that
  # range the frame is not (or no longer) set up and only the common
  # .Lin_prologue path runs.
  #
  # Perl statements are indented to match the surrounding lines; heredoc text
  # stays flush-left because the "___" terminator must start in column 0.
  if ($win64) {
  # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
  $rec="%rcx";
  $frame="%rdx";
  $context="%r8";
  $disp="%r9";

  # se_handler serves the flavours built on 16-byte vectors: they keep the
  # original %rsp at 16*17 above the aligned frame pointer and save only
  # %rbx/%rbp, followed by the ten XMM registers.
  $code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
  # avx2_handler is the counterpart for sha1_multi_block_avx2, which keeps
  # the original %rsp at 32*17 above its aligned frame pointer and also saves
  # %r12-%r15.  The saved stack pointer has to be read relative to the
  # interrupted function's %rsp, which is in %rax at that point (just loaded
  # from context->Rsp).  Reading it relative to the CONTEXT pointer, as this
  # code used to, fetches bytes from the CONTEXT record's XMM save area
  # instead.  The handler then joins se_handler's .Lin_prologue tail.
  $code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
  # .pdata: one (begin, end, unwind-info) triple per generated function.
  $code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
  $code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
  $code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___
  # .xdata: each record names the handler and the body/epilogue label pair
  # that the handler reads as HandlerData[0] and HandlerData[1].
  $code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
  $code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
  $code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
  }
  1440. ####################################################################
  # rex(\@opcode,$dst,$src): prepend a REX prefix byte to the opcode byte
  # list when either register number needs one.
  #   \@opcode  array reference, modified in place
  #   $dst      number of the register encoded in ModR/M.reg (>=8 sets REX.R)
  #   $src      number of the register encoded in ModR/M.r/m (>=8 sets REX.B)
  # The list is left untouched when both register numbers are below 8.
  sub rex {
      my ($opcode_ref,$dst,$src)=@_;
      my $prefix=0;

      $prefix|=0x04 if ($dst>=8);	# REX.R
      $prefix|=0x01 if ($src>=8);	# REX.B
      unshift @{$opcode_ref},0x40|$prefix if ($prefix);
  }
  # sha1rnds4($operands): hand-assemble "sha1rnds4 $imm,%xmmS,%xmmD" into a
  # ".byte" directive.  Operands in any other form (e.g. a memory source)
  # are returned unchanged behind the mnemonic.
  sub sha1rnds4 {
      my $operands=$_[0];

      # Only the immediate,register,register form is encoded here.
      return "sha1rnds4\t".$operands
          unless ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/);

      my ($imm,$src,$dst)=($1,$2,$3);
      my @bytes=(0x0f,0x3a,0xcc);

      rex(\@bytes,$dst,$src);
      push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
      # An immediate with a leading "0" (hex or octal) is converted to a number.
      push @bytes,($imm=~/^0/)?oct($imm):$imm;

      return ".byte\t".join(',',@bytes);
  }
  # sha1op38($instr,$operands): hand-assemble the two-operand SHA-1
  # instructions from the 0F 38 opcode map (sha1nexte, sha1msg1, sha1msg2)
  # into a ".byte" directive when both operands are XMM registers.  Any
  # other mnemonic or operand form is returned as "$instr\t$operands".
  sub sha1op38 {
      my ($mnemonic,$operands)=@_;
      my %third_byte_of=(
          "sha1nexte" => 0xc8,
          "sha1msg1"  => 0xc9,
          "sha1msg2"  => 0xca,
      );
      my $third_byte=$third_byte_of{$mnemonic};

      if (defined($third_byte) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
          my ($src,$dst)=($1,$2);
          my @bytes=(0x0f,0x38);

          rex(\@bytes,$dst,$src);
          push @bytes,$third_byte;
          push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
          return ".byte\t".join(',',@bytes);
      }

      return $mnemonic."\t".$operands;
  }
  # Final pass: post-process the accumulated text in $code one line at a
  # time and print it to STDOUT.
  1477. foreach (split("\n",$code)) {
  # Replace every remaining `...` span with the result of evaluating it as Perl.
  1478. s/\`([^\`]*)\`/eval($1)/ge;
  # The rewrites below are chained with "or", so at most one is applied per
  # line: SHA instructions go through sha1rnds4()/sha1op38(); for vmovd/vmovq,
  # vmovdqu written with a "%x" marker, vpinsr*, vpextr* and vpbroadcast*
  # a %ymmN operand is renamed to %xmmN; vinserti128 additionally gets an
  # explicit "$1," immediate in front of its first operand.
  1479. s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
  1480. s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or
  1481. s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1482. s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
  1483. s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
  1484. s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1485. s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
  1486. s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
  1487. print $_,"\n";
  1488. }
  # Closing STDOUT explicitly makes a failed write abort the script instead
  # of going unnoticed.
  1489. close STDOUT or die "error closing STDOUT: $!";