# sha1-x86_64.pl — SHA-1 block procedure for x86_64 (OpenSSL/CRYPTOGAMS perlasm source)
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # sha1_block procedure for x86_64.
  11. #
  12. # It was brought to my attention that on EM64T compiler-generated code
  13. # was far behind 32-bit assembler implementation. This is unlike on
  14. # Opteron where compiler-generated code was only 15% behind 32-bit
  15. # assembler, which originally made it hard to motivate the effort.
  16. # There was suggestion to mechanically translate 32-bit code, but I
  17. # dismissed it, reasoning that x86_64 offers enough register bank
  18. # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
  19. # implementation:-) However! While 64-bit code does perform better
  20. # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  21. # x86_64 does offer larger *addressable* bank, but out-of-order core
  22. # reaches for even more registers through dynamic aliasing, and EM64T
  23. # core must have managed to run-time optimize even 32-bit code just as
  24. # good as 64-bit one. Performance improvement is summarized in the
  25. # following table:
  26. #
  27. # gcc 3.4 32-bit asm cycles/byte
  28. # Opteron +45% +20% 6.8
  29. # Xeon P4 +65% +0% 9.9
  30. # Core2 +60% +10% 7.0
  31. # August 2009.
  32. #
  33. # The code was revised to minimize code size and to maximize
  34. # "distance" between instructions producing input to 'lea'
  35. # instruction and the 'lea' instruction itself, which is essential
  36. # for Intel Atom core.
  37. # October 2010.
  38. #
  39. # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
  40. # is to offload message schedule denoted by Wt in NIST specification,
  41. # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
  42. # for background and implementation details. The only difference from
  43. # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
  44. # to free temporary registers.
  45. # April 2011.
  46. #
  47. # Add AVX code path. See sha1-586.pl for further information.
  48. ######################################################################
  49. # Current performance is summarized in following table. Numbers are
  50. # CPU clock cycles spent to process single byte (less is better).
  51. #
  52. # x86_64 SSSE3 AVX
  53. # P4 9.8 -
  54. # Opteron 6.6 -
  55. # Core2 6.7 6.1/+10% -
  56. # Atom 11.0 9.7/+13% -
  57. # Westmere 7.1 5.6/+27% -
  58. # Sandy Bridge 7.9 6.3/+25% 5.2/+51%
  59. $flavour = shift;
  60. $output = shift;
  61. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  62. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  63. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  64. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  65. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  66. die "can't locate x86_64-xlate.pl";
  67. $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  68. =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
  69. $1>=2.19);
  70. $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  71. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
  72. $1>=2.09);
  73. $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  74. `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
  75. $1>=10);
  76. open STDOUT,"| $^X $xlate $flavour $output";
  77. $ctx="%rdi"; # 1st arg
  78. $inp="%rsi"; # 2nd arg
  79. $num="%rdx"; # 3rd arg
  80. # reassign arguments in order to produce more compact code
  81. $ctx="%r8";
  82. $inp="%r9";
  83. $num="%r10";
  84. $t0="%eax";
  85. $t1="%ebx";
  86. $t2="%ecx";
  87. @xi=("%edx","%ebp");
  88. $A="%esi";
  89. $B="%edi";
  90. $C="%r11d";
  91. $D="%r12d";
  92. $E="%r13d";
  93. @V=($A,$B,$C,$D,$E);
# Emit one scalar SHA-1 round for rounds 0..19: F=Ch(b,c,d)=(b&(c^d))^d,
# K=0x5a827999.  Round 0 additionally loads+byte-swaps the first input word;
# rounds <15 prefetch and swap the next word; rounds >=15 switch to the
# message-schedule recurrence W[j]=(W[j-3]^W[j-8]^W[j-14]^W[j-16])<<<1 kept
# in a 16-dword ring buffer on the stack.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
unshift(@xi,pop(@xi));	# swap current/next message-word registers
}
# Emit one scalar round for rounds 20..39 and 60..79: F=Parity(b,c,d)=b^c^d.
# K is 0x6ed9eba1 for rounds <40 and 0xca62c1d6 for rounds >=60.  The W[]
# store is suppressed for the last three schedule updates ($i>=76) and the
# final round 79 skips the schedule entirely.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));	# swap current/next message-word registers
}
# Emit one scalar round for rounds 40..59: F=Maj(b,c,d) computed as
# (c&d)+(b&(c^d)) split across $t0/$t1, K=0x8f1bbcdc.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
unshift(@xi,pop(@xi));	# swap current/next message-word registers
}
# Emit the dispatching entry point sha1_block_data_order(ctx,inp,num) plus
# the plain-IALU implementation (.Lialu).  OPENSSL_ia32cap_P is consulted at
# run time: SSSE3-capable CPUs branch to _ssse3_shortcut, and (when $avx)
# Intel CPUs with AVX branch to _avx_shortcut.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# Fully unroll the 80 rounds, rotating the working variables after each.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the block's result into the chaining state, advance to the next
# 64-byte block, then restore the callee-saved registers from the frame
# whose address was saved at 16*4(%rsp).
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
# SSSE3 code path.  The {{{ opens a lexical scope (closed later in the file)
# so the SIMD helpers get their own register map: @X is the 8-deep xmm ring
# for the message schedule, @Tx are temporaries, and the five working
# variables move into the low registers for shorter encodings.
{{{
my $Xi=4;			# index of next W[] quadruple to compute
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";		# base of the round-constant/pbswap table

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

# Prologue: save callee-saved registers, carve out 64 bytes of W[]+K xfer
# area (plus xmm6-10 spill slots on Win64), load the context and the first
# 64-byte block, byte-swap it and pre-add K_00_19 for the IALU rounds.
$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___
# Catch-all for &instruction(dst,src,...) calls: the undefined sub's name
# becomes the mnemonic and the arguments are emitted in reversed (AT&T)
# order, with the last argument $-prefixed when it is a numeric immediate.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);	# numeric => immediate
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
# Compute one quadruple of the message schedule for W[16..31] with SSSE3,
# interleaving the SIMD instructions with 40 scalar round-instruction
# strings supplied by $body.  The <<<1 rotate cannot be done per-lane in
# one step, so the dword that crosses the 128-bit boundary is patched in
# via pslldq/psrld/pslld.  Stores W[]+K to the stack for the IALU rounds.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
# Compute one schedule quadruple for W[32..79] with SSSE3.  From round 32
# on, the recurrence can be rewritten as (W[-32]^W[-28]^W[-16]^W[-6])<<<2,
# which rotates all four lanes uniformly and needs fewer shuffles than the
# 16..31 variant.  The round constant is re-used or reloaded every 5
# quadruples, and @Tx[1] is "recharged" with X[0] while $Xi<19.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
# Finish the schedule for a block: store the last W[]+K quadruple, then
# either branch to .Ldone_ssse3 (when $inp has reached $num, the end of
# input) or load and byte-swap the next 64-byte block so the next Xloop
# rounds overlap with its preprocessing.  Resets $Xi for the new block.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}
# One quadruple of rounds at the start of a new block: byte-swap the next
# input xmm, pre-add K, spill W[]+K to the stack for the IALU, then undo
# the K addition so @X keeps pure W[] values.  Interleaved with 32 scalar
# round instructions from $body.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}
# Tail rounds for the final block: emit the 32 scalar round instructions
# with no SIMD work interleaved (there is no next block to schedule).
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
# Return one round (0..19) of the SIMD code paths as a list of eval-able
# instruction strings, so the Xupdate helpers can interleave them with
# SIMD ops.  F=Ch(b,c,d)=(b&(c^d))^d; W[]+K is picked up from the stack.
# The trailing string advances $j and rotates @V/@T for the next round.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
# Return one round (20..39 / 60..79) of the SIMD code paths:
# F=Parity(b,c,d)=b^c^d, built incrementally in @T[0].
sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
# Return one round (40..59) of the SIMD code paths:
# F=Maj(b,c,d) computed as (c&d)+(b&(c^d)) split across @T[0]/@T[1].
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
# Emit the SSSE3 main loop: 20 quadruples of rounds with the message
# schedule software-pipelined one block ahead, then the chaining-state
# update.  Xuplast may branch to .Ldone_ssse3, where the tail rounds are
# re-emitted without SIMD work ($j/@V are saved and restored so both
# emissions start from the same generator state) before the epilogue.
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

	$saved_j=$j; @saved_V=@V;	# snapshot generator state for the tail

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
	$j=$saved_j; @V=@saved_V;	# re-emit tail from the snapshot

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
# AVX code path, emitted only when the assembler supports it (the if-block
# closes later in the file).  Same register map as the SSSE3 path, but the
# rotates are done with shld/shrd and the 3-operand v-instructions remove
# most register-copy moves; vzeroall avoids SSE/AVX transition penalties.
if ($avx) {
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
	vzeroall

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___
# AVX version of the W[16..31] schedule quadruple: same data flow as the
# SSSE3 variant, but 3-operand instructions eliminate the preparatory
# register copies.  Interleaved with 40 scalar round instructions.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);	# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);	# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);	# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);	# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
# AVX version of the W[32..79] schedule quadruple, using the
# (W[-32]^W[-28]^W[-16]^W[-6])<<<2 form of the recurrence; round constants
# are re-used or reloaded every 5 quadruples, as in the SSSE3 path.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);	# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);	# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
  819. sub Xuplast_avx_80()
  820. { use integer;
  821. my $body = shift;
  822. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  823. my ($a,$b,$c,$d,$e);
  824. eval(shift(@insns));
  825. &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
  826. eval(shift(@insns));
  827. eval(shift(@insns));
  828. eval(shift(@insns));
  829. eval(shift(@insns));
  830. &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
  831. foreach (@insns) { eval; } # remaining instructions
  832. &cmp ($inp,$num);
  833. &je (".Ldone_avx");
  834. unshift(@Tx,pop(@Tx));
  835. &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
  836. &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
  837. &vmovdqu(@X[-4&7],"0($inp)"); # load input
  838. &vmovdqu(@X[-3&7],"16($inp)");
  839. &vmovdqu(@X[-2&7],"32($inp)");
  840. &vmovdqu(@X[-1&7],"48($inp)");
  841. &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
  842. &add ($inp,64);
  843. $Xi=0;
  844. }
# One quadruple of rounds at the start of a new block (AVX): byte-swap the
# next input xmm and add K into a separate register, so no psubd "undo" is
# needed; spill W[]+K to the stack for the IALU rounds.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}
  866. sub Xtail_avx()
  867. { use integer;
  868. my $body = shift;
  869. my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
  870. my ($a,$b,$c,$d,$e);
  871. foreach (@insns) { eval; }
  872. }
$code.=<<___;
.align 16
.Loop_avx:
___
# Generate the 80-round AVX block body. The first four slots use the
# "16_31" schedule flavour, the rest the "32_79" flavour; the integer
# round body changes every 20 rounds (body_00_19 / body_20_39 / body_40_59)
# per the SHA-1 round functions.
&Xupdate_avx_16_31(\&body_00_19);
&Xupdate_avx_16_31(\&body_00_19);
&Xupdate_avx_16_31(\&body_00_19);
&Xupdate_avx_16_31(\&body_00_19);
&Xupdate_avx_32_79(\&body_00_19);
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_20_39);
&Xuplast_avx_80(\&body_20_39); # can jump to "done"
# Snapshot the codegen state ($j round counter, @V register rotation) so
# the "done" tail below can be generated from the same starting point.
$saved_j=$j; @saved_V=@V;
&Xloop_avx(\&body_20_39);
&Xloop_avx(\&body_20_39);
&Xloop_avx(\&body_20_39);
# Fold the round results back into the hash context and loop; the heredoc
# text below is emitted assembly, kept verbatim.
$code.=<<___;
add 0($ctx),$A # update context
add 4($ctx),@T[0]
add 8($ctx),$C
add 12($ctx),$D
mov $A,0($ctx)
add 16($ctx),$E
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
mov $D,12($ctx)
mov $E,16($ctx)
jmp .Loop_avx
.align 16
.Ldone_avx:
___
# Restore the snapshot and generate the tail rounds for the final block
# (no next-block preloading interleaved).
$j=$saved_j; @V=@saved_V;
&Xtail_avx(\&body_20_39);
&Xtail_avx(\&body_20_39);
&Xtail_avx(\&body_20_39);
# Emit the "done" path: clear YMM state, do the final context update,
# restore Win64 non-volatile XMM registers if applicable, unwind the
# stack frame and return.
$code.=<<___;
vzeroall
add 0($ctx),$A # update context
add 4($ctx),@T[0]
add 8($ctx),$C
mov $A,0($ctx)
add 12($ctx),$D
mov @T[0],4($ctx)
add 16($ctx),$E
mov $C,8($ctx)
mov $D,12($ctx)
mov $E,16($ctx)
___
# Win64 ABI: xmm6-xmm10 are callee-saved and were spilled at 64(%rsp)
# by the prologue; restore them here.
$code.=<<___ if ($win64);
movaps 64+0(%rsp),%xmm6
movaps 64+16(%rsp),%xmm7
movaps 64+32(%rsp),%xmm8
movaps 64+48(%rsp),%xmm9
movaps 64+64(%rsp),%xmm10
___
# Pop callee-saved GPRs (r12, rbp, rbx) and the saved stack pointer; the
# backtick expression is evaluated at output time (see final substitution).
$code.=<<___;
lea `64+($win64?5*16:0)`(%rsp),%rsi
mov 0(%rsi),%r12
mov 8(%rsi),%rbp
mov 16(%rsi),%rbx
lea 24(%rsi),%rsp
.Lepilogue_avx:
ret
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
}
# Emit the shared constant table: the four SHA-1 round constants, each
# broadcast across a 16-byte vector, followed by the pshufb byte-swap mask.
# Referenced via $K_XX_XX at offsets 0/16/32/48/64.
$code.=<<___;
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
___
}}}
# Emit the attribution string embedded in the object file.
$code.=<<___;
.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
  965. if ($win64) {
  966. $rec="%rcx";
  967. $frame="%rdx";
  968. $context="%r8";
  969. $disp="%r9";
  970. $code.=<<___;
  971. .extern __imp_RtlVirtualUnwind
  972. .type se_handler,\@abi-omnipotent
  973. .align 16
  974. se_handler:
  975. push %rsi
  976. push %rdi
  977. push %rbx
  978. push %rbp
  979. push %r12
  980. push %r13
  981. push %r14
  982. push %r15
  983. pushfq
  984. sub \$64,%rsp
  985. mov 120($context),%rax # pull context->Rax
  986. mov 248($context),%rbx # pull context->Rip
  987. lea .Lprologue(%rip),%r10
  988. cmp %r10,%rbx # context->Rip<.Lprologue
  989. jb .Lcommon_seh_tail
  990. mov 152($context),%rax # pull context->Rsp
  991. lea .Lepilogue(%rip),%r10
  992. cmp %r10,%rbx # context->Rip>=.Lepilogue
  993. jae .Lcommon_seh_tail
  994. mov `16*4`(%rax),%rax # pull saved stack pointer
  995. lea 32(%rax),%rax
  996. mov -8(%rax),%rbx
  997. mov -16(%rax),%rbp
  998. mov -24(%rax),%r12
  999. mov -32(%rax),%r13
  1000. mov %rbx,144($context) # restore context->Rbx
  1001. mov %rbp,160($context) # restore context->Rbp
  1002. mov %r12,216($context) # restore context->R12
  1003. mov %r13,224($context) # restore context->R13
  1004. jmp .Lcommon_seh_tail
  1005. .size se_handler,.-se_handler
  1006. .type ssse3_handler,\@abi-omnipotent
  1007. .align 16
  1008. ssse3_handler:
  1009. push %rsi
  1010. push %rdi
  1011. push %rbx
  1012. push %rbp
  1013. push %r12
  1014. push %r13
  1015. push %r14
  1016. push %r15
  1017. pushfq
  1018. sub \$64,%rsp
  1019. mov 120($context),%rax # pull context->Rax
  1020. mov 248($context),%rbx # pull context->Rip
  1021. mov 8($disp),%rsi # disp->ImageBase
  1022. mov 56($disp),%r11 # disp->HandlerData
  1023. mov 0(%r11),%r10d # HandlerData[0]
  1024. lea (%rsi,%r10),%r10 # prologue label
  1025. cmp %r10,%rbx # context->Rip<prologue label
  1026. jb .Lcommon_seh_tail
  1027. mov 152($context),%rax # pull context->Rsp
  1028. mov 4(%r11),%r10d # HandlerData[1]
  1029. lea (%rsi,%r10),%r10 # epilogue label
  1030. cmp %r10,%rbx # context->Rip>=epilogue label
  1031. jae .Lcommon_seh_tail
  1032. lea 64(%rax),%rsi
  1033. lea 512($context),%rdi # &context.Xmm6
  1034. mov \$10,%ecx
  1035. .long 0xa548f3fc # cld; rep movsq
  1036. lea `24+64+5*16`(%rax),%rax # adjust stack pointer
  1037. mov -8(%rax),%rbx
  1038. mov -16(%rax),%rbp
  1039. mov -24(%rax),%r12
  1040. mov %rbx,144($context) # restore context->Rbx
  1041. mov %rbp,160($context) # restore context->Rbp
  1042. mov %r12,216($context) # restore cotnext->R12
  1043. .Lcommon_seh_tail:
  1044. mov 8(%rax),%rdi
  1045. mov 16(%rax),%rsi
  1046. mov %rax,152($context) # restore context->Rsp
  1047. mov %rsi,168($context) # restore context->Rsi
  1048. mov %rdi,176($context) # restore context->Rdi
  1049. mov 40($disp),%rdi # disp->ContextRecord
  1050. mov $context,%rsi # context
  1051. mov \$154,%ecx # sizeof(CONTEXT)
  1052. .long 0xa548f3fc # cld; rep movsq
  1053. mov $disp,%rsi
  1054. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1055. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1056. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1057. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1058. mov 40(%rsi),%r10 # disp->ContextRecord
  1059. lea 56(%rsi),%r11 # &disp->HandlerData
  1060. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1061. mov %r10,32(%rsp) # arg5
  1062. mov %r11,40(%rsp) # arg6
  1063. mov %r12,48(%rsp) # arg7
  1064. mov %rcx,56(%rsp) # arg8, (NULL)
  1065. call *__imp_RtlVirtualUnwind(%rip)
  1066. mov \$1,%eax # ExceptionContinueSearch
  1067. add \$64,%rsp
  1068. popfq
  1069. pop %r15
  1070. pop %r14
  1071. pop %r13
  1072. pop %r12
  1073. pop %rbp
  1074. pop %rbx
  1075. pop %rdi
  1076. pop %rsi
  1077. ret
  1078. .size ssse3_handler,.-ssse3_handler
  1079. .section .pdata
  1080. .align 4
  1081. .rva .LSEH_begin_sha1_block_data_order
  1082. .rva .LSEH_end_sha1_block_data_order
  1083. .rva .LSEH_info_sha1_block_data_order
  1084. .rva .LSEH_begin_sha1_block_data_order_ssse3
  1085. .rva .LSEH_end_sha1_block_data_order_ssse3
  1086. .rva .LSEH_info_sha1_block_data_order_ssse3
  1087. ___
  1088. $code.=<<___ if ($avx);
  1089. .rva .LSEH_begin_sha1_block_data_order_avx
  1090. .rva .LSEH_end_sha1_block_data_order_avx
  1091. .rva .LSEH_info_sha1_block_data_order_avx
  1092. ___
  1093. $code.=<<___;
  1094. .section .xdata
  1095. .align 8
  1096. .LSEH_info_sha1_block_data_order:
  1097. .byte 9,0,0,0
  1098. .rva se_handler
  1099. .LSEH_info_sha1_block_data_order_ssse3:
  1100. .byte 9,0,0,0
  1101. .rva ssse3_handler
  1102. .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
  1103. ___
  1104. $code.=<<___ if ($avx);
  1105. .LSEH_info_sha1_block_data_order_avx:
  1106. .byte 9,0,0,0
  1107. .rva ssse3_handler
  1108. .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
  1109. ___
  1110. }
####################################################################
# Output stage: every `...` segment accumulated in $code holds a Perl
# expression (e.g. computed stack offsets); evaluate each in place
# (/e with /g across lines), then print the finished assembly.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;