  1. #! /usr/bin/env perl
  2. # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. Rights for redistribution and usage in source and binary
  12. # forms are granted according to the License.
  13. # ====================================================================
  14. #
  15. # sha256/512_block procedure for x86_64.
  16. #
  17. # 40% improvement over compiler-generated code on Opteron. On EM64T
  18. # sha256 was observed to run >80% faster and sha512 - >40%. No magical
  19. # tricks, just straight implementation... I really wonder why gcc
  20. # [being armed with inline assembler] fails to generate as fast code.
  21. # The only thing which is cool about this module is that it's the very
  22. # same instruction sequence used for both SHA-256 and SHA-512. In the
  23. # former case the instructions operate on 32-bit operands, while in the
  24. # latter on 64-bit ones. All I had to do was get one flavor right; the
  25. # other one passed the test right away:-)
  26. #
  27. # sha256_block runs in ~1005 cycles on Opteron, which gives you
  28. # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
  29. # frequency in GHz. sha512_block runs in ~1275 cycles, which results
  30. # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
  31. # Well, if you compare it to IA-64 implementation, which maintains
  32. # X[16] in register bank[!], tends to 4 instructions per CPU clock
  33. # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
  34. # issue Opteron pipeline and X[16] maintained in memory. So that *if*
  35. # there is a way to improve it, *then* the only way would be to try to
  36. # offload X[16] updates to SSE unit, but that would require "deeper"
  37. # loop unroll, which in turn would naturally cause size blow-up, not
  38. # to mention increased complexity! And once again, only *if* it's
  39. # actually possible to noticeably improve overall ILP, instruction
  40. # level parallelism, on a given CPU implementation in this case.
  41. #
  42. # Special note on Intel EM64T. While Opteron CPU exhibits perfect
  43. # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
  44. # [currently available] EM64T CPUs apparently are far from it. On the
  45. # contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
  46. # 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
  47. # are apparently not atomic instructions, but are implemented in microcode.
  48. #
  49. # May 2012.
  50. #
  51. # An optimization pass, including one of Pavel Semjanov's ideas, the
  52. # alternative Maj, resulted in >=5% improvement on most CPUs, +20% for
  53. # SHA256 and unfortunately -2% for SHA512 on P4 [which nobody should
  54. # care about that much].
  55. #
  56. # June 2012.
  57. #
  58. # Add SIMD code paths, see below for improvement coefficients. SSSE3
  59. # code path was not attempted for SHA512, because improvement is not
  60. # estimated to be high enough, noticeably less than 9%, to justify
  61. # the effort, not on pre-AVX processors. [Obviously with exclusion
  62. # for VIA Nano, but it has SHA512 instruction that is faster and
  63. # should be used instead.] For reference, corresponding estimated
  64. # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
  65. # higher coefficients are observed on VIA Nano and Bulldozer has more
  66. # to do with specifics of their architecture [which is topic for
  67. # separate discussion].
  68. #
  69. # November 2012.
  70. #
  71. # Add AVX2 code path. Two consecutive input blocks are loaded to
  72. # 256-bit %ymm registers, with data from the first block in the least
  73. # significant 128-bit halves and data from the second in the most significant.
  74. # The data is then processed with same SIMD instruction sequence as
  75. # for AVX, but with %ymm as operands. Side effect is increased stack
  76. # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
  77. # code size increase.
  78. #
  79. # March 2014.
  80. #
  81. # Add support for Intel SHA Extensions.
  82. ######################################################################
  83. # Current performance in cycles per processed byte (less is better):
  84. #
  85. #                SHA256   SSSE3        AVX/XOP(*)        SHA512   AVX/XOP(*)
  86. #
  87. # AMD K8         14.9     -            -                 9.57     -
  88. # P4             17.3     -            -                 30.8     -
  89. # Core 2         15.6     13.8(+13%)   -                 9.97     -
  90. # Westmere       14.8     12.3(+19%)   -                 9.58     -
  91. # Sandy Bridge   17.4     14.2(+23%)   11.6(+50%(**))    11.2     8.10(+38%(**))
  92. # Ivy Bridge     12.6     10.5(+20%)   10.3(+22%)        8.17     7.22(+13%)
  93. # Haswell        12.2     9.28(+31%)   7.80(+56%)        7.66     5.40(+42%)
  94. # Skylake        11.4     9.03(+26%)   7.70(+48%)        7.25     5.20(+40%)
  95. # Bulldozer      21.1     13.6(+54%)   13.6(+54%(***))   13.5     8.58(+57%)
  96. # Ryzen          11.0     9.02(+22%)   2.05(+440%)       7.05     5.67(+20%)
  97. # VIA Nano       23.0     16.5(+39%)   -                 14.7     -
  98. # Atom           23.0     18.9(+22%)   -                 14.7     -
  99. # Silvermont     27.4     20.6(+33%)   -                 17.5     -
  100. # Knights L      27.4     21.0(+30%)   19.6(+40%)        17.5     12.8(+37%)
  101. # Goldmont       18.9     14.3(+32%)   4.16(+350%)       12.0     -
  102. #
  103. # (*) whichever best applicable, including SHAEXT;
  104. # (**) the switch from ror to shrd accounts for a fair share of the improvement;
  105. # (***) execution time is fully determined by remaining integer-only
  106. # part, body_00_15; reducing the amount of SIMD instructions
  107. # below certain limit makes no difference/sense; to conserve
  108. # space SHA256 XOP code path is therefore omitted;
  109. # $output is the last argument if it looks like a file (it has an extension)
  110. # $flavour is the first argument if it doesn't look like a file
  111. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  112. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  113. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  114. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  115. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  116. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  117. die "can't locate x86_64-xlate.pl";
  118. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  119. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  120. $avx = ($1>=2.19) + ($1>=2.22);
  121. }
  122. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  123. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  124. $avx = ($1>=2.09) + ($1>=2.10);
  125. }
  126. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  127. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  128. $avx = ($1>=10) + ($1>=11);
  129. }
  130. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  131. $avx = ($2>=3.0) + ($2>3.0);
  132. }
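# A note on the $avx level computed above: each probe adds one for every
# version threshold the detected assembler clears, so (with $ver standing in
# for the matched version, purely for illustration) the idiom amounts to:
#
#   my $avx = ($ver >= 2.19) + ($ver >= 2.22);  # 0 = no AVX code, 1 = AVX/XOP, 2 = AVX2 as well
#
# The AVX2 shortcut in the dispatcher below is only emitted when $avx>1.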
  133. $shaext=1; ### set to zero if compiling for 1.0.1
  134. $avx=1 if (!$shaext && $avx);
  135. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  136. or die "can't call $xlate: $!";
  137. *STDOUT=*OUT;
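# Usage sketch (flavour and file names are illustrative): the flavour and the
# output file are taken from the command line as parsed above, and the output
# name decides which digest gets generated, e.g.:
#
#   perl sha512-x86_64.pl elf sha512-x86_64.s   # emits sha512_block_data_order
#   perl sha512-x86_64.pl elf sha256-x86_64.s   # any name without "512" emits SHA-256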
  138. if ($output =~ /512/) {
  139. $func="sha512_block_data_order";
  140. $TABLE="K512";
  141. $SZ=8;
  142. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
  143. "%r8", "%r9", "%r10","%r11");
  144. ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
  145. @Sigma0=(28,34,39);
  146. @Sigma1=(14,18,41);
  147. @sigma0=(1, 8, 7);
  148. @sigma1=(19,61, 6);
  149. $rounds=80;
  150. } else {
  151. $func="sha256_block_data_order";
  152. $TABLE="K256";
  153. $SZ=4;
  154. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  155. "%r8d","%r9d","%r10d","%r11d");
  156. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
  157. @Sigma0=( 2,13,22);
  158. @Sigma1=( 6,11,25);
  159. @sigma0=( 7,18, 3);
  160. @sigma1=(17,19,10);
  161. $rounds=64;
  162. }
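# The @Sigma/@sigma triples above are the FIPS 180-4 rotation/shift amounts,
# e.g. for SHA-256 Sigma1(e) = ROTR(e,6) ^ ROTR(e,11) ^ ROTR(e,25) and
# sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ SHR(x,3). A minimal reference sketch in
# plain Perl (hypothetical helpers, not used by the generator):
#
#   sub ROTR32     { my ($x,$n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
#   sub Sigma1_256 { my $e = shift; ROTR32($e,6) ^ ROTR32($e,11) ^ ROTR32($e,25) }
#   sub sigma0_256 { my $x = shift; ROTR32($x,7) ^ ROTR32($x,18) ^ ($x >> 3) }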
  163. $ctx="%rdi"; # 1st arg, zapped by $a3
  164. $inp="%rsi"; # 2nd arg
  165. $Tbl="%rbp";
  166. $_ctx="16*$SZ+0*8(%rsp)";
  167. $_inp="16*$SZ+1*8(%rsp)";
  168. $_end="16*$SZ+2*8(%rsp)";
  169. $_rsp="`16*$SZ+3*8`(%rsp)";
  170. $framesz="16*$SZ+4*8";
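# Stack frame implied by the offsets above, relative to the aligned %rsp: the
# low 16*$SZ bytes hold the 16-entry X[] ring buffer used by the rounds, then
# four 8-byte save slots (the Win64 SIMD paths reserve extra room above this
# for xmm saves):
#
#   0 .. 16*$SZ-1      X[0..15], one $SZ-byte message-schedule word each
#   16*$SZ + 0*8       saved $ctx            ($_ctx)
#   16*$SZ + 1*8       saved $inp            ($_inp)
#   16*$SZ + 2*8       end-of-input pointer  ($_end)
#   16*$SZ + 3*8       caller's %rsp         ($_rsp)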
  171. sub ROUND_00_15()
  172. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  173. my $STRIDE=$SZ;
  174. $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
  175. $code.=<<___;
  176. ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
  177. mov $f,$a2
  178. xor $e,$a0
  179. ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
  180. xor $g,$a2 # f^g
  181. mov $T1,`$SZ*($i&0xf)`(%rsp)
  182. xor $a,$a1
  183. and $e,$a2 # (f^g)&e
  184. ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
  185. add $h,$T1 # T1+=h
  186. xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
  187. ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
  188. xor $e,$a0
  189. add $a2,$T1 # T1+=Ch(e,f,g)
  190. mov $a,$a2
  191. add ($Tbl),$T1 # T1+=K[round]
  192. xor $a,$a1
  193. xor $b,$a2 # a^b, b^c in next round
  194. ror \$$Sigma1[0],$a0 # Sigma1(e)
  195. mov $b,$h
  196. and $a2,$a3
  197. ror \$$Sigma0[0],$a1 # Sigma0(a)
  198. add $a0,$T1 # T1+=Sigma1(e)
  199. xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
  200. add $T1,$d # d+=T1
  201. add $T1,$h # h+=T1
  202. lea $STRIDE($Tbl),$Tbl # round++
  203. ___
  204. $code.=<<___ if ($i<15);
  205. add $a1,$h # h+=Sigma0(a)
  206. ___
  207. ($a2,$a3) = ($a3,$a2);
  208. }
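# ROUND_00_15 evaluates Sigma1(e) (and likewise Sigma0(a) in $a1) with three
# dependent ror instructions: it rotates by the *differences* of the rotation
# amounts and xors $e back in between, which is algebraically the same as the
# three independent rotations. The identity being relied on, sketched with the
# hypothetical ROTR32 helper above and (c0,c1,c2) = @Sigma1:
#
#   ROTR32(ROTR32(ROTR32($e, $c2-$c1) ^ $e, $c1-$c0) ^ $e, $c0)
#     == ROTR32($e,$c2) ^ ROTR32($e,$c1) ^ ROTR32($e,$c0)
#
# because rotation distributes over xor and successive rotations add up.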
  209. sub ROUND_16_XX()
  210. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  211. $code.=<<___;
  212. mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
  213. mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
  214. mov $a0,$T1
  215. ror \$`$sigma0[1]-$sigma0[0]`,$a0
  216. add $a1,$a # modulo-scheduled h+=Sigma0(a)
  217. mov $a2,$a1
  218. ror \$`$sigma1[1]-$sigma1[0]`,$a2
  219. xor $T1,$a0
  220. shr \$$sigma0[2],$T1
  221. ror \$$sigma0[0],$a0
  222. xor $a1,$a2
  223. shr \$$sigma1[2],$a1
  224. ror \$$sigma1[0],$a2
  225. xor $a0,$T1 # sigma0(X[(i+1)&0xf])
  226. xor $a1,$a2 # sigma1(X[(i+14)&0xf])
  227. add `$SZ*(($i+9)&0xf)`(%rsp),$T1
  228. add `$SZ*($i&0xf)`(%rsp),$T1
  229. mov $e,$a0
  230. add $a2,$T1
  231. mov $a,$a1
  232. ___
  233. &ROUND_00_15(@_);
  234. }
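# ROUND_16_XX implements the FIPS 180-4 message schedule on the 16-word ring
# buffer kept on the stack:
#
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# with all indices taken mod 16, which is why the memory operands above use
# ($i+1)&0xf for W[i-15], ($i+14)&0xf for W[i-2] and ($i+9)&0xf for W[i-7].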
  235. $code=<<___;
  236. .text
  237. .extern OPENSSL_ia32cap_P
  238. .globl $func
  239. .type $func,\@function,3
  240. .align 16
  241. $func:
  242. .cfi_startproc
  243. ___
  244. $code.=<<___ if ($SZ==4 || $avx);
  245. lea OPENSSL_ia32cap_P(%rip),%r11
  246. mov 0(%r11),%r9d
  247. mov 4(%r11),%r10d
  248. mov 8(%r11),%r11d
  249. ___
  250. $code.=<<___ if ($SZ==4 && $shaext);
  251. test \$`1<<29`,%r11d # check for SHA
  252. jnz _shaext_shortcut
  253. ___
  254. $code.=<<___ if ($avx && $SZ==8);
  255. test \$`1<<11`,%r10d # check for XOP
  256. jnz .Lxop_shortcut
  257. ___
  258. $code.=<<___ if ($avx>1);
  259. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  260. cmp \$`1<<8|1<<5|1<<3`,%r11d
  261. je .Lavx2_shortcut
  262. ___
  263. $code.=<<___ if ($avx);
  264. and \$`1<<30`,%r9d # mask "Intel CPU" bit
  265. and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
  266. or %r9d,%r10d
  267. cmp \$`1<<28|1<<9|1<<30`,%r10d
  268. je .Lavx_shortcut
  269. ___
  270. $code.=<<___ if ($SZ==4);
  271. test \$`1<<9`,%r10d
  272. jnz .Lssse3_shortcut
  273. ___
  274. $code.=<<___;
  275. mov %rsp,%rax # copy %rsp
  276. .cfi_def_cfa_register %rax
  277. push %rbx
  278. .cfi_push %rbx
  279. push %rbp
  280. .cfi_push %rbp
  281. push %r12
  282. .cfi_push %r12
  283. push %r13
  284. .cfi_push %r13
  285. push %r14
  286. .cfi_push %r14
  287. push %r15
  288. .cfi_push %r15
  289. shl \$4,%rdx # num*16
  290. sub \$$framesz,%rsp
  291. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  292. and \$-64,%rsp # align stack frame
  293. mov $ctx,$_ctx # save ctx, 1st arg
  294. mov $inp,$_inp # save inp, 2nd arg
  295. mov %rdx,$_end # save end pointer, "3rd" arg
  296. mov %rax,$_rsp # save copy of %rsp
  297. .cfi_cfa_expression $_rsp,deref,+8
  298. .Lprologue:
  299. mov $SZ*0($ctx),$A
  300. mov $SZ*1($ctx),$B
  301. mov $SZ*2($ctx),$C
  302. mov $SZ*3($ctx),$D
  303. mov $SZ*4($ctx),$E
  304. mov $SZ*5($ctx),$F
  305. mov $SZ*6($ctx),$G
  306. mov $SZ*7($ctx),$H
  307. jmp .Lloop
  308. .align 16
  309. .Lloop:
  310. mov $B,$a3
  311. lea $TABLE(%rip),$Tbl
  312. xor $C,$a3 # magic
  313. ___
  314. for($i=0;$i<16;$i++) {
  315. $code.=" mov $SZ*$i($inp),$T1\n";
  316. $code.=" mov @ROT[4],$a0\n";
  317. $code.=" mov @ROT[0],$a1\n";
  318. $code.=" bswap $T1\n";
  319. &ROUND_00_15($i,@ROT);
  320. unshift(@ROT,pop(@ROT));
  321. }
  322. $code.=<<___;
  323. jmp .Lrounds_16_xx
  324. .align 16
  325. .Lrounds_16_xx:
  326. ___
  327. for(;$i<32;$i++) {
  328. &ROUND_16_XX($i,@ROT);
  329. unshift(@ROT,pop(@ROT));
  330. }
  331. $code.=<<___;
  332. cmpb \$0,`$SZ-1`($Tbl)
  333. jnz .Lrounds_16_xx
  334. mov $_ctx,$ctx
  335. add $a1,$A # modulo-scheduled h+=Sigma0(a)
  336. lea 16*$SZ($inp),$inp
  337. add $SZ*0($ctx),$A
  338. add $SZ*1($ctx),$B
  339. add $SZ*2($ctx),$C
  340. add $SZ*3($ctx),$D
  341. add $SZ*4($ctx),$E
  342. add $SZ*5($ctx),$F
  343. add $SZ*6($ctx),$G
  344. add $SZ*7($ctx),$H
  345. cmp $_end,$inp
  346. mov $A,$SZ*0($ctx)
  347. mov $B,$SZ*1($ctx)
  348. mov $C,$SZ*2($ctx)
  349. mov $D,$SZ*3($ctx)
  350. mov $E,$SZ*4($ctx)
  351. mov $F,$SZ*5($ctx)
  352. mov $G,$SZ*6($ctx)
  353. mov $H,$SZ*7($ctx)
  354. jb .Lloop
  355. mov $_rsp,%rsi
  356. .cfi_def_cfa %rsi,8
  357. mov -48(%rsi),%r15
  358. .cfi_restore %r15
  359. mov -40(%rsi),%r14
  360. .cfi_restore %r14
  361. mov -32(%rsi),%r13
  362. .cfi_restore %r13
  363. mov -24(%rsi),%r12
  364. .cfi_restore %r12
  365. mov -16(%rsi),%rbp
  366. .cfi_restore %rbp
  367. mov -8(%rsi),%rbx
  368. .cfi_restore %rbx
  369. lea (%rsi),%rsp
  370. .cfi_def_cfa_register %rsp
  371. .Lepilogue:
  372. ret
  373. .cfi_endproc
  374. .size $func,.-$func
  375. ___
  376. if ($SZ==4) {
  377. $code.=<<___;
  378. .align 64
  379. .type $TABLE,\@object
  380. $TABLE:
  381. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  382. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  383. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  384. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  385. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  386. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  387. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  388. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  389. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  390. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  391. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  392. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  393. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  394. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  395. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  396. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  397. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  398. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  399. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  400. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  401. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  402. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  403. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  404. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  405. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  406. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  407. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  408. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  409. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  410. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  411. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  412. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  413. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  414. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  415. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  416. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  417. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  418. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  419. .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  420. ___
  421. } else {
  422. $code.=<<___;
  423. .align 64
  424. .type $TABLE,\@object
  425. $TABLE:
  426. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  427. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  428. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  429. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  430. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  431. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  432. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  433. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  434. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  435. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  436. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  437. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  438. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  439. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  440. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  441. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  442. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  443. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  444. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  445. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  446. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  447. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  448. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  449. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  450. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  451. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  452. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  453. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  454. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  455. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  456. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  457. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  458. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  459. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  460. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  461. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  462. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  463. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  464. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  465. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  466. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  467. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  468. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  469. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  470. .quad 0xd192e819d6ef5218,0xd69906245565a910
  471. .quad 0xd192e819d6ef5218,0xd69906245565a910
  472. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  473. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  474. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  475. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  476. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  477. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  478. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  479. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  480. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  481. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  482. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  483. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  484. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  485. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  486. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  487. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  488. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  489. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  490. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  491. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  492. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  493. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  494. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  495. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  496. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  497. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  498. .quad 0x28db77f523047d84,0x32caab7b40c72493
  499. .quad 0x28db77f523047d84,0x32caab7b40c72493
  500. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  501. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  502. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  503. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  504. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  505. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  506. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  507. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  508. .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  509. ___
  510. }
  511. ######################################################################
  512. # SIMD code paths
  513. #
  514. if ($SZ==4 && $shaext) {{{
  515. ######################################################################
  516. # Intel SHA Extensions implementation of SHA256 update function.
  517. #
  518. my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
  519. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
  520. my @MSG=map("%xmm$_",(3..6));
  521. $code.=<<___;
  522. .type sha256_block_data_order_shaext,\@function,3
  523. .align 64
  524. sha256_block_data_order_shaext:
  525. _shaext_shortcut:
  526. ___
  527. $code.=<<___ if ($win64);
  528. lea `-8-5*16`(%rsp),%rsp
  529. movaps %xmm6,-8-5*16(%rax)
  530. movaps %xmm7,-8-4*16(%rax)
  531. movaps %xmm8,-8-3*16(%rax)
  532. movaps %xmm9,-8-2*16(%rax)
  533. movaps %xmm10,-8-1*16(%rax)
  534. .Lprologue_shaext:
  535. ___
  536. $code.=<<___;
  537. lea K256+0x80(%rip),$Tbl
  538. movdqu ($ctx),$ABEF # DCBA
  539. movdqu 16($ctx),$CDGH # HGFE
  540. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  541. pshufd \$0x1b,$ABEF,$Wi # ABCD
  542. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  543. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  544. movdqa $TMP,$BSWAP # offload
  545. palignr \$8,$CDGH,$ABEF # ABEF
  546. punpcklqdq $Wi,$CDGH # CDGH
  547. jmp .Loop_shaext
  548. .align 16
  549. .Loop_shaext:
  550. movdqu ($inp),@MSG[0]
  551. movdqu 0x10($inp),@MSG[1]
  552. movdqu 0x20($inp),@MSG[2]
  553. pshufb $TMP,@MSG[0]
  554. movdqu 0x30($inp),@MSG[3]
  555. movdqa 0*32-0x80($Tbl),$Wi
  556. paddd @MSG[0],$Wi
  557. pshufb $TMP,@MSG[1]
  558. movdqa $CDGH,$CDGH_SAVE # offload
  559. sha256rnds2 $ABEF,$CDGH # 0-3
  560. pshufd \$0x0e,$Wi,$Wi
  561. nop
  562. movdqa $ABEF,$ABEF_SAVE # offload
  563. sha256rnds2 $CDGH,$ABEF
  564. movdqa 1*32-0x80($Tbl),$Wi
  565. paddd @MSG[1],$Wi
  566. pshufb $TMP,@MSG[2]
  567. sha256rnds2 $ABEF,$CDGH # 4-7
  568. pshufd \$0x0e,$Wi,$Wi
  569. lea 0x40($inp),$inp
  570. sha256msg1 @MSG[1],@MSG[0]
  571. sha256rnds2 $CDGH,$ABEF
  572. movdqa 2*32-0x80($Tbl),$Wi
  573. paddd @MSG[2],$Wi
  574. pshufb $TMP,@MSG[3]
  575. sha256rnds2 $ABEF,$CDGH # 8-11
  576. pshufd \$0x0e,$Wi,$Wi
  577. movdqa @MSG[3],$TMP
  578. palignr \$4,@MSG[2],$TMP
  579. nop
  580. paddd $TMP,@MSG[0]
  581. sha256msg1 @MSG[2],@MSG[1]
  582. sha256rnds2 $CDGH,$ABEF
  583. movdqa 3*32-0x80($Tbl),$Wi
  584. paddd @MSG[3],$Wi
  585. sha256msg2 @MSG[3],@MSG[0]
  586. sha256rnds2 $ABEF,$CDGH # 12-15
  587. pshufd \$0x0e,$Wi,$Wi
  588. movdqa @MSG[0],$TMP
  589. palignr \$4,@MSG[3],$TMP
  590. nop
  591. paddd $TMP,@MSG[1]
  592. sha256msg1 @MSG[3],@MSG[2]
  593. sha256rnds2 $CDGH,$ABEF
  594. ___
  595. for($i=4;$i<16-3;$i++) {
  596. $code.=<<___;
  597. movdqa $i*32-0x80($Tbl),$Wi
  598. paddd @MSG[0],$Wi
  599. sha256msg2 @MSG[0],@MSG[1]
  600. sha256rnds2 $ABEF,$CDGH # 16-19...
  601. pshufd \$0x0e,$Wi,$Wi
  602. movdqa @MSG[1],$TMP
  603. palignr \$4,@MSG[0],$TMP
  604. nop
  605. paddd $TMP,@MSG[2]
  606. sha256msg1 @MSG[0],@MSG[3]
  607. sha256rnds2 $CDGH,$ABEF
  608. ___
  609. push(@MSG,shift(@MSG));
  610. }
  611. $code.=<<___;
  612. movdqa 13*32-0x80($Tbl),$Wi
  613. paddd @MSG[0],$Wi
  614. sha256msg2 @MSG[0],@MSG[1]
  615. sha256rnds2 $ABEF,$CDGH # 52-55
  616. pshufd \$0x0e,$Wi,$Wi
  617. movdqa @MSG[1],$TMP
  618. palignr \$4,@MSG[0],$TMP
  619. sha256rnds2 $CDGH,$ABEF
  620. paddd $TMP,@MSG[2]
  621. movdqa 14*32-0x80($Tbl),$Wi
  622. paddd @MSG[1],$Wi
  623. sha256rnds2 $ABEF,$CDGH # 56-59
  624. pshufd \$0x0e,$Wi,$Wi
  625. sha256msg2 @MSG[1],@MSG[2]
  626. movdqa $BSWAP,$TMP
  627. sha256rnds2 $CDGH,$ABEF
  628. movdqa 15*32-0x80($Tbl),$Wi
  629. paddd @MSG[2],$Wi
  630. nop
  631. sha256rnds2 $ABEF,$CDGH # 60-63
  632. pshufd \$0x0e,$Wi,$Wi
  633. dec $num
  634. nop
  635. sha256rnds2 $CDGH,$ABEF
  636. paddd $CDGH_SAVE,$CDGH
  637. paddd $ABEF_SAVE,$ABEF
  638. jnz .Loop_shaext
  639. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  640. pshufd \$0x1b,$ABEF,$TMP # FEBA
  641. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  642. punpckhqdq $CDGH,$ABEF # DCBA
  643. palignr \$8,$TMP,$CDGH # HGFE
  644. movdqu $ABEF,($ctx)
  645. movdqu $CDGH,16($ctx)
  646. ___
  647. $code.=<<___ if ($win64);
  648. movaps -8-5*16(%rax),%xmm6
  649. movaps -8-4*16(%rax),%xmm7
  650. movaps -8-3*16(%rax),%xmm8
  651. movaps -8-2*16(%rax),%xmm9
  652. movaps -8-1*16(%rax),%xmm10
  653. mov %rax,%rsp
  654. .Lepilogue_shaext:
  655. ___
  656. $code.=<<___;
  657. ret
  658. .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
  659. ___
  660. }}}
  661. {{{
  662. my $a4=$T1;
  663. my ($a,$b,$c,$d,$e,$f,$g,$h);
  664. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  665. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  666. my $arg = pop;
  667. $arg = "\$$arg" if ($arg*1 eq $arg);
  668. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  669. }
  670. sub body_00_15 () {
  671. (
  672. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  673. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  674. '&mov ($a,$a1)',
  675. '&mov ($a4,$f)',
  676. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  677. '&xor ($a0,$e)',
  678. '&xor ($a4,$g)', # f^g
  679. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  680. '&xor ($a1,$a)',
  681. '&and ($a4,$e)', # (f^g)&e
  682. '&xor ($a0,$e)',
  683. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  684. '&mov ($a2,$a)',
  685. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  686. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  687. '&xor ($a2,$b)', # a^b, b^c in next round
  688. '&add ($h,$a4)', # h+=Ch(e,f,g)
  689. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  690. '&and ($a3,$a2)', # (b^c)&(a^b)
  691. '&xor ($a1,$a)',
  692. '&add ($h,$a0)', # h+=Sigma1(e)
  693. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  694. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  695. '&add ($d,$h)', # d+=h
  696. '&add ($h,$a3)', # h+=Maj(a,b,c)
  697. '&mov ($a0,$d)',
  698. '&add ($a1,$h);'. # h+=Sigma0(a)
  699. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  700. );
  701. }
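# body_00_15 (like ROUND_00_15 above) uses the "alternative Maj" mentioned in
# the May 2012 note: with b^c carried over from the previous round in $a3,
# Maj(a,b,c) is computed as ((a^b) & (b^c)) ^ b, i.e. Ch(a^b,c,b), saving work
# per round. A quick self-check of the identity in plain Perl (illustrative
# only, not part of the generator):
#
#   for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
#       my $maj = ($a & $b) | ($a & $c) | ($b & $c);
#       die "mismatch" unless ((($a ^ $b) & ($b ^ $c)) ^ $b) == $maj;
#   }}}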
  702. ######################################################################
  703. # SSSE3 code path
  704. #
  705. if ($SZ==4) { # SHA256 only
  706. my @X = map("%xmm$_",(0..3));
  707. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  708. $code.=<<___;
  709. .type ${func}_ssse3,\@function,3
  710. .align 64
  711. ${func}_ssse3:
  712. .cfi_startproc
  713. .Lssse3_shortcut:
  714. mov %rsp,%rax # copy %rsp
  715. .cfi_def_cfa_register %rax
  716. push %rbx
  717. .cfi_push %rbx
  718. push %rbp
  719. .cfi_push %rbp
  720. push %r12
  721. .cfi_push %r12
  722. push %r13
  723. .cfi_push %r13
  724. push %r14
  725. .cfi_push %r14
  726. push %r15
  727. .cfi_push %r15
  728. shl \$4,%rdx # num*16
  729. sub \$`$framesz+$win64*16*4`,%rsp
  730. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  731. and \$-64,%rsp # align stack frame
  732. mov $ctx,$_ctx # save ctx, 1st arg
  733. mov $inp,$_inp # save inp, 2nd arg
  734. mov %rdx,$_end # save end pointer, "3rd" arg
  735. mov %rax,$_rsp # save copy of %rsp
  736. .cfi_cfa_expression $_rsp,deref,+8
  737. ___
  738. $code.=<<___ if ($win64);
  739. movaps %xmm6,16*$SZ+32(%rsp)
  740. movaps %xmm7,16*$SZ+48(%rsp)
  741. movaps %xmm8,16*$SZ+64(%rsp)
  742. movaps %xmm9,16*$SZ+80(%rsp)
  743. ___
  744. $code.=<<___;
  745. .Lprologue_ssse3:
  746. mov $SZ*0($ctx),$A
  747. mov $SZ*1($ctx),$B
  748. mov $SZ*2($ctx),$C
  749. mov $SZ*3($ctx),$D
  750. mov $SZ*4($ctx),$E
  751. mov $SZ*5($ctx),$F
  752. mov $SZ*6($ctx),$G
  753. mov $SZ*7($ctx),$H
  754. ___
  755. $code.=<<___;
  756. #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  757. #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  758. jmp .Lloop_ssse3
  759. .align 16
  760. .Lloop_ssse3:
  761. movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  762. movdqu 0x00($inp),@X[0]
  763. movdqu 0x10($inp),@X[1]
  764. movdqu 0x20($inp),@X[2]
  765. pshufb $t3,@X[0]
  766. movdqu 0x30($inp),@X[3]
  767. lea $TABLE(%rip),$Tbl
  768. pshufb $t3,@X[1]
  769. movdqa 0x00($Tbl),$t0
  770. movdqa 0x20($Tbl),$t1
  771. pshufb $t3,@X[2]
  772. paddd @X[0],$t0
  773. movdqa 0x40($Tbl),$t2
  774. pshufb $t3,@X[3]
  775. movdqa 0x60($Tbl),$t3
  776. paddd @X[1],$t1
  777. paddd @X[2],$t2
  778. paddd @X[3],$t3
  779. movdqa $t0,0x00(%rsp)
  780. mov $A,$a1
  781. movdqa $t1,0x10(%rsp)
  782. mov $B,$a3
  783. movdqa $t2,0x20(%rsp)
  784. xor $C,$a3 # magic
  785. movdqa $t3,0x30(%rsp)
  786. mov $E,$a0
  787. jmp .Lssse3_00_47
  788. .align 16
  789. .Lssse3_00_47:
  790. sub \$`-16*2*$SZ`,$Tbl # size optimization
  791. ___
  792. sub Xupdate_256_SSSE3 () {
  793. (
  794. '&movdqa ($t0,@X[1]);',
  795. '&movdqa ($t3,@X[3])',
  796. '&palignr ($t0,@X[0],$SZ)', # X[1..4]
  797. '&palignr ($t3,@X[2],$SZ);', # X[9..12]
  798. '&movdqa ($t1,$t0)',
  799. '&movdqa ($t2,$t0);',
  800. '&psrld ($t0,$sigma0[2])',
  801. '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
  802. '&psrld ($t2,$sigma0[0])',
  803. '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
  804. '&pslld ($t1,8*$SZ-$sigma0[1]);'.
  805. '&pxor ($t0,$t2)',
  806. '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
  807. '&pxor ($t0,$t1)',
  808. '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
  809. '&pxor ($t0,$t2);',
  810. '&movdqa ($t2,$t3)',
  811. '&pxor ($t0,$t1);', # sigma0(X[1..4])
  812. '&psrld ($t3,$sigma1[2])',
  813. '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
  814. '&psrlq ($t2,$sigma1[0])',
  815. '&pxor ($t3,$t2);',
  816. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  817. '&pxor ($t3,$t2)',
  818. '&pshufb ($t3,$t4)', # sigma1(X[14..15])
  819. '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  820. '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
  821. '&movdqa ($t2,$t3);',
  822. '&psrld ($t3,$sigma1[2])',
  823. '&psrlq ($t2,$sigma1[0])',
  824. '&pxor ($t3,$t2);',
  825. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  826. '&pxor ($t3,$t2);',
  827. '&movdqa ($t2,16*2*$j."($Tbl)")',
  828. '&pshufb ($t3,$t5)',
  829. '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
  830. );
  831. }
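# SSE/SSSE3 has no packed-rotate instruction, so Xupdate_256_SSSE3 synthesizes
# each 32-bit rotation in sigma0/sigma1 from shifts and xors via the usual
# per-lane identity
#
#   ROTR32(x,n) == (x >> n) | (x << (32 - n))
#
# and, where the same register is shifted twice, the second psrld/pslld only
# moves it by the difference of two rotation amounts ($sigma0[1]-$sigma0[0]
# above), reusing the partial result instead of starting again from x.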
  832. sub SSSE3_256_00_47 () {
  833. my $j = shift;
  834. my $body = shift;
  835. my @X = @_;
  836. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  837. if (0) {
  838. foreach (Xupdate_256_SSSE3()) { # 36 instructions
  839. eval;
  840. eval(shift(@insns));
  841. eval(shift(@insns));
  842. eval(shift(@insns));
  843. }
  844. } else { # squeeze extra 4% on Westmere and 19% on Atom
  845. eval(shift(@insns)); #@
  846. &movdqa ($t0,@X[1]);
  847. eval(shift(@insns));
  848. eval(shift(@insns));
  849. &movdqa ($t3,@X[3]);
  850. eval(shift(@insns)); #@
  851. eval(shift(@insns));
  852. eval(shift(@insns));
  853. eval(shift(@insns)); #@
  854. eval(shift(@insns));
  855. &palignr ($t0,@X[0],$SZ); # X[1..4]
  856. eval(shift(@insns));
  857. eval(shift(@insns));
  858. &palignr ($t3,@X[2],$SZ); # X[9..12]
  859. eval(shift(@insns));
  860. eval(shift(@insns));
  861. eval(shift(@insns));
  862. eval(shift(@insns)); #@
  863. &movdqa ($t1,$t0);
  864. eval(shift(@insns));
  865. eval(shift(@insns));
  866. &movdqa ($t2,$t0);
  867. eval(shift(@insns)); #@
  868. eval(shift(@insns));
  869. &psrld ($t0,$sigma0[2]);
  870. eval(shift(@insns));
  871. eval(shift(@insns));
  872. eval(shift(@insns));
  873. &paddd (@X[0],$t3); # X[0..3] += X[9..12]
  874. eval(shift(@insns)); #@
  875. eval(shift(@insns));
  876. &psrld ($t2,$sigma0[0]);
  877. eval(shift(@insns));
  878. eval(shift(@insns));
  879. &pshufd ($t3,@X[3],0b11111010); # X[14..15]
  880. eval(shift(@insns));
  881. eval(shift(@insns)); #@
  882. &pslld ($t1,8*$SZ-$sigma0[1]);
  883. eval(shift(@insns));
  884. eval(shift(@insns));
  885. &pxor ($t0,$t2);
  886. eval(shift(@insns)); #@
  887. eval(shift(@insns));
  888. eval(shift(@insns));
  889. eval(shift(@insns)); #@
  890. &psrld ($t2,$sigma0[1]-$sigma0[0]);
  891. eval(shift(@insns));
  892. &pxor ($t0,$t1);
  893. eval(shift(@insns));
  894. eval(shift(@insns));
  895. &pslld ($t1,$sigma0[1]-$sigma0[0]);
  896. eval(shift(@insns));
  897. eval(shift(@insns));
  898. &pxor ($t0,$t2);
  899. eval(shift(@insns));
  900. eval(shift(@insns)); #@
  901. &movdqa ($t2,$t3);
  902. eval(shift(@insns));
  903. eval(shift(@insns));
  904. &pxor ($t0,$t1); # sigma0(X[1..4])
  905. eval(shift(@insns)); #@
  906. eval(shift(@insns));
  907. eval(shift(@insns));
  908. &psrld ($t3,$sigma1[2]);
  909. eval(shift(@insns));
  910. eval(shift(@insns));
  911. &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  912. eval(shift(@insns)); #@
  913. eval(shift(@insns));
  914. &psrlq ($t2,$sigma1[0]);
  915. eval(shift(@insns));
  916. eval(shift(@insns));
  917. eval(shift(@insns));
  918. &pxor ($t3,$t2);
  919. eval(shift(@insns)); #@
  920. eval(shift(@insns));
  921. eval(shift(@insns));
  922. eval(shift(@insns)); #@
  923. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  924. eval(shift(@insns));
  925. eval(shift(@insns));
  926. &pxor ($t3,$t2);
  927. eval(shift(@insns)); #@
  928. eval(shift(@insns));
  929. eval(shift(@insns));
  930. #&pshufb ($t3,$t4); # sigma1(X[14..15])
  931. &pshufd ($t3,$t3,0b10000000);
  932. eval(shift(@insns));
  933. eval(shift(@insns));
  934. eval(shift(@insns));
  935. &psrldq ($t3,8);
  936. eval(shift(@insns));
  937. eval(shift(@insns)); #@
  938. eval(shift(@insns));
  939. eval(shift(@insns));
  940. eval(shift(@insns)); #@
  941. &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  942. eval(shift(@insns));
  943. eval(shift(@insns));
  944. eval(shift(@insns));
  945. &pshufd ($t3,@X[0],0b01010000); # X[16..17]
  946. eval(shift(@insns));
  947. eval(shift(@insns)); #@
  948. eval(shift(@insns));
  949. &movdqa ($t2,$t3);
  950. eval(shift(@insns));
  951. eval(shift(@insns));
  952. &psrld ($t3,$sigma1[2]);
  953. eval(shift(@insns));
  954. eval(shift(@insns)); #@
  955. &psrlq ($t2,$sigma1[0]);
  956. eval(shift(@insns));
  957. eval(shift(@insns));
  958. &pxor ($t3,$t2);
  959. eval(shift(@insns)); #@
  960. eval(shift(@insns));
  961. eval(shift(@insns));
  962. eval(shift(@insns)); #@
  963. eval(shift(@insns));
  964. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  965. eval(shift(@insns));
  966. eval(shift(@insns));
  967. eval(shift(@insns));
  968. &pxor ($t3,$t2);
  969. eval(shift(@insns));
  970. eval(shift(@insns));
  971. eval(shift(@insns)); #@
  972. #&pshufb ($t3,$t5);
  973. &pshufd ($t3,$t3,0b00001000);
  974. eval(shift(@insns));
  975. eval(shift(@insns));
  976. &movdqa ($t2,16*2*$j."($Tbl)");
  977. eval(shift(@insns)); #@
  978. eval(shift(@insns));
  979. &pslldq ($t3,8);
  980. eval(shift(@insns));
  981. eval(shift(@insns));
  982. eval(shift(@insns));
  983. &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  984. eval(shift(@insns)); #@
  985. eval(shift(@insns));
  986. eval(shift(@insns));
  987. }
  988. &paddd ($t2,@X[0]);
  989. foreach (@insns) { eval; } # remaining instructions
  990. &movdqa (16*$j."(%rsp)",$t2);
  991. }
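# SSSE3_256_00_47 interleaves the vector message-schedule update with four
# scalar rounds: body_00_15 hands back its instructions as strings, and each
# eval(shift(@insns)) above emits the next scalar instruction between two SIMD
# ones, hiding SIMD latency behind integer ALU work (hence the "squeeze extra
# 4% on Westmere and 19% on Atom" comment). A stripped-down sketch of the
# pattern, with &simd_step as a placeholder for any of the vector ops above:
#
#   my @insns = (&$body,&$body,&$body,&$body);  # deferred scalar rounds
#   &simd_step(...);                            # one vector step
#   eval(shift(@insns));                        # one scalar step in between
#   foreach (@insns) { eval; }                  # drain whatever is left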
  992. for ($i=0,$j=0; $j<4; $j++) {
  993. &SSSE3_256_00_47($j,\&body_00_15,@X);
  994. push(@X,shift(@X)); # rotate(@X)
  995. }
  996. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  997. &jne (".Lssse3_00_47");
  998. for ($i=0; $i<16; ) {
  999. foreach(body_00_15()) { eval; }
  1000. }
  1001. $code.=<<___;
  1002. mov $_ctx,$ctx
  1003. mov $a1,$A
  1004. add $SZ*0($ctx),$A
  1005. lea 16*$SZ($inp),$inp
  1006. add $SZ*1($ctx),$B
  1007. add $SZ*2($ctx),$C
  1008. add $SZ*3($ctx),$D
  1009. add $SZ*4($ctx),$E
  1010. add $SZ*5($ctx),$F
  1011. add $SZ*6($ctx),$G
  1012. add $SZ*7($ctx),$H
  1013. cmp $_end,$inp
  1014. mov $A,$SZ*0($ctx)
  1015. mov $B,$SZ*1($ctx)
  1016. mov $C,$SZ*2($ctx)
  1017. mov $D,$SZ*3($ctx)
  1018. mov $E,$SZ*4($ctx)
  1019. mov $F,$SZ*5($ctx)
  1020. mov $G,$SZ*6($ctx)
  1021. mov $H,$SZ*7($ctx)
  1022. jb .Lloop_ssse3
  1023. mov $_rsp,%rsi
  1024. .cfi_def_cfa %rsi,8
  1025. ___
  1026. $code.=<<___ if ($win64);
  1027. movaps 16*$SZ+32(%rsp),%xmm6
  1028. movaps 16*$SZ+48(%rsp),%xmm7
  1029. movaps 16*$SZ+64(%rsp),%xmm8
  1030. movaps 16*$SZ+80(%rsp),%xmm9
  1031. ___
  1032. $code.=<<___;
  1033. mov -48(%rsi),%r15
  1034. .cfi_restore %r15
  1035. mov -40(%rsi),%r14
  1036. .cfi_restore %r14
  1037. mov -32(%rsi),%r13
  1038. .cfi_restore %r13
  1039. mov -24(%rsi),%r12
  1040. .cfi_restore %r12
  1041. mov -16(%rsi),%rbp
  1042. .cfi_restore %rbp
  1043. mov -8(%rsi),%rbx
  1044. .cfi_restore %rbx
  1045. lea (%rsi),%rsp
  1046. .cfi_def_cfa_register %rsp
  1047. .Lepilogue_ssse3:
  1048. ret
  1049. .cfi_endproc
  1050. .size ${func}_ssse3,.-${func}_ssse3
  1051. ___
  1052. }
  1053. if ($avx) {{
  1054. ######################################################################
  1055. # XOP code path
  1056. #
  1057. if ($SZ==8) { # SHA512 only
  1058. $code.=<<___;
  1059. .type ${func}_xop,\@function,3
  1060. .align 64
  1061. ${func}_xop:
  1062. .cfi_startproc
  1063. .Lxop_shortcut:
  1064. mov %rsp,%rax # copy %rsp
  1065. .cfi_def_cfa_register %rax
  1066. push %rbx
  1067. .cfi_push %rbx
  1068. push %rbp
  1069. .cfi_push %rbp
  1070. push %r12
  1071. .cfi_push %r12
  1072. push %r13
  1073. .cfi_push %r13
  1074. push %r14
  1075. .cfi_push %r14
  1076. push %r15
  1077. .cfi_push %r15
  1078. shl \$4,%rdx # num*16
  1079. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1080. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1081. and \$-64,%rsp # align stack frame
  1082. mov $ctx,$_ctx # save ctx, 1st arg
  1083. mov $inp,$_inp # save inp, 2nd arg
  1084. mov %rdx,$_end # save end pointer, "3rd" arg
  1085. mov %rax,$_rsp # save copy of %rsp
  1086. .cfi_cfa_expression $_rsp,deref,+8
  1087. ___
  1088. $code.=<<___ if ($win64);
  1089. movaps %xmm6,16*$SZ+32(%rsp)
  1090. movaps %xmm7,16*$SZ+48(%rsp)
  1091. movaps %xmm8,16*$SZ+64(%rsp)
  1092. movaps %xmm9,16*$SZ+80(%rsp)
  1093. ___
  1094. $code.=<<___ if ($win64 && $SZ>4);
  1095. movaps %xmm10,16*$SZ+96(%rsp)
  1096. movaps %xmm11,16*$SZ+112(%rsp)
  1097. ___
  1098. $code.=<<___;
  1099. .Lprologue_xop:
  1100. vzeroupper
  1101. mov $SZ*0($ctx),$A
  1102. mov $SZ*1($ctx),$B
  1103. mov $SZ*2($ctx),$C
  1104. mov $SZ*3($ctx),$D
  1105. mov $SZ*4($ctx),$E
  1106. mov $SZ*5($ctx),$F
  1107. mov $SZ*6($ctx),$G
  1108. mov $SZ*7($ctx),$H
  1109. jmp .Lloop_xop
  1110. ___
  1111. if ($SZ==4) { # SHA256
  1112. my @X = map("%xmm$_",(0..3));
  1113. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  1114. $code.=<<___;
  1115. .align 16
  1116. .Lloop_xop:
  1117. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1118. vmovdqu 0x00($inp),@X[0]
  1119. vmovdqu 0x10($inp),@X[1]
  1120. vmovdqu 0x20($inp),@X[2]
  1121. vmovdqu 0x30($inp),@X[3]
  1122. vpshufb $t3,@X[0],@X[0]
  1123. lea $TABLE(%rip),$Tbl
  1124. vpshufb $t3,@X[1],@X[1]
  1125. vpshufb $t3,@X[2],@X[2]
  1126. vpaddd 0x00($Tbl),@X[0],$t0
  1127. vpshufb $t3,@X[3],@X[3]
  1128. vpaddd 0x20($Tbl),@X[1],$t1
  1129. vpaddd 0x40($Tbl),@X[2],$t2
  1130. vpaddd 0x60($Tbl),@X[3],$t3
  1131. vmovdqa $t0,0x00(%rsp)
  1132. mov $A,$a1
  1133. vmovdqa $t1,0x10(%rsp)
  1134. mov $B,$a3
  1135. vmovdqa $t2,0x20(%rsp)
  1136. xor $C,$a3 # magic
  1137. vmovdqa $t3,0x30(%rsp)
  1138. mov $E,$a0
  1139. jmp .Lxop_00_47
  1140. .align 16
  1141. .Lxop_00_47:
  1142. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1143. ___
  1144. sub XOP_256_00_47 () {
  1145. my $j = shift;
  1146. my $body = shift;
  1147. my @X = @_;
  1148. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1149. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  1150. eval(shift(@insns));
  1151. eval(shift(@insns));
  1152. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  1153. eval(shift(@insns));
  1154. eval(shift(@insns));
  1155. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  1156. eval(shift(@insns));
  1157. eval(shift(@insns));
  1158. &vpsrld ($t0,$t0,$sigma0[2]);
  1159. eval(shift(@insns));
  1160. eval(shift(@insns));
  1161. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  1162. eval(shift(@insns));
  1163. eval(shift(@insns));
  1164. eval(shift(@insns));
  1165. eval(shift(@insns));
  1166. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1167. eval(shift(@insns));
  1168. eval(shift(@insns));
  1169. &vpxor ($t0,$t0,$t1);
  1170. eval(shift(@insns));
  1171. eval(shift(@insns));
  1172. eval(shift(@insns));
  1173. eval(shift(@insns));
  1174. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  1175. eval(shift(@insns));
  1176. eval(shift(@insns));
  1177. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  1178. eval(shift(@insns));
  1179. eval(shift(@insns));
  1180. &vpsrld ($t2,@X[3],$sigma1[2]);
  1181. eval(shift(@insns));
  1182. eval(shift(@insns));
  1183. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  1184. eval(shift(@insns));
  1185. eval(shift(@insns));
  1186. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1187. eval(shift(@insns));
  1188. eval(shift(@insns));
  1189. &vpxor ($t3,$t3,$t2);
  1190. eval(shift(@insns));
  1191. eval(shift(@insns));
  1192. eval(shift(@insns));
  1193. eval(shift(@insns));
  1194. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1195. eval(shift(@insns));
  1196. eval(shift(@insns));
  1197. eval(shift(@insns));
  1198. eval(shift(@insns));
  1199. &vpsrldq ($t3,$t3,8);
  1200. eval(shift(@insns));
  1201. eval(shift(@insns));
  1202. eval(shift(@insns));
  1203. eval(shift(@insns));
  1204. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1205. eval(shift(@insns));
  1206. eval(shift(@insns));
  1207. eval(shift(@insns));
  1208. eval(shift(@insns));
  1209. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  1210. eval(shift(@insns));
  1211. eval(shift(@insns));
  1212. &vpsrld ($t2,@X[0],$sigma1[2]);
  1213. eval(shift(@insns));
  1214. eval(shift(@insns));
  1215. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1216. eval(shift(@insns));
  1217. eval(shift(@insns));
  1218. &vpxor ($t3,$t3,$t2);
  1219. eval(shift(@insns));
  1220. eval(shift(@insns));
  1221. eval(shift(@insns));
  1222. eval(shift(@insns));
  1223. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  1224. eval(shift(@insns));
  1225. eval(shift(@insns));
  1226. eval(shift(@insns));
  1227. eval(shift(@insns));
  1228. &vpslldq ($t3,$t3,8); # 22 instructions
  1229. eval(shift(@insns));
  1230. eval(shift(@insns));
  1231. eval(shift(@insns));
  1232. eval(shift(@insns));
  1233. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  1234. eval(shift(@insns));
  1235. eval(shift(@insns));
  1236. eval(shift(@insns));
  1237. eval(shift(@insns));
  1238. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1239. foreach (@insns) { eval; } # remaining instructions
  1240. &vmovdqa (16*$j."(%rsp)",$t2);
  1241. }
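# The XOP path can use vprotd, a true packed rotate, so each sigma term costs
# a single instruction instead of the shift/shift/xor synthesis of the SSSE3
# path. The rotate counts are given as left-rotate amounts of the form
# 8*$SZ - n, since rotating a w-bit word left by w-n is the same as rotating
# it right by n.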
  1242. for ($i=0,$j=0; $j<4; $j++) {
  1243. &XOP_256_00_47($j,\&body_00_15,@X);
  1244. push(@X,shift(@X)); # rotate(@X)
  1245. }
  1246. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1247. &jne (".Lxop_00_47");
  1248. for ($i=0; $i<16; ) {
  1249. foreach(body_00_15()) { eval; }
  1250. }
  1251. } else { # SHA512
  1252. my @X = map("%xmm$_",(0..7));
  1253. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  1254. $code.=<<___;
  1255. .align 16
  1256. .Lloop_xop:
  1257. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1258. vmovdqu 0x00($inp),@X[0]
  1259. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1260. vmovdqu 0x10($inp),@X[1]
  1261. vmovdqu 0x20($inp),@X[2]
  1262. vpshufb $t3,@X[0],@X[0]
  1263. vmovdqu 0x30($inp),@X[3]
  1264. vpshufb $t3,@X[1],@X[1]
  1265. vmovdqu 0x40($inp),@X[4]
  1266. vpshufb $t3,@X[2],@X[2]
  1267. vmovdqu 0x50($inp),@X[5]
  1268. vpshufb $t3,@X[3],@X[3]
  1269. vmovdqu 0x60($inp),@X[6]
  1270. vpshufb $t3,@X[4],@X[4]
  1271. vmovdqu 0x70($inp),@X[7]
  1272. vpshufb $t3,@X[5],@X[5]
  1273. vpaddq -0x80($Tbl),@X[0],$t0
  1274. vpshufb $t3,@X[6],@X[6]
  1275. vpaddq -0x60($Tbl),@X[1],$t1
  1276. vpshufb $t3,@X[7],@X[7]
  1277. vpaddq -0x40($Tbl),@X[2],$t2
  1278. vpaddq -0x20($Tbl),@X[3],$t3
  1279. vmovdqa $t0,0x00(%rsp)
  1280. vpaddq 0x00($Tbl),@X[4],$t0
  1281. vmovdqa $t1,0x10(%rsp)
  1282. vpaddq 0x20($Tbl),@X[5],$t1
  1283. vmovdqa $t2,0x20(%rsp)
  1284. vpaddq 0x40($Tbl),@X[6],$t2
  1285. vmovdqa $t3,0x30(%rsp)
  1286. vpaddq 0x60($Tbl),@X[7],$t3
  1287. vmovdqa $t0,0x40(%rsp)
  1288. mov $A,$a1
  1289. vmovdqa $t1,0x50(%rsp)
  1290. mov $B,$a3
  1291. vmovdqa $t2,0x60(%rsp)
  1292. xor $C,$a3 # magic
  1293. vmovdqa $t3,0x70(%rsp)
  1294. mov $E,$a0
  1295. jmp .Lxop_00_47
  1296. .align 16
  1297. .Lxop_00_47:
  1298. add \$`16*2*$SZ`,$Tbl
  1299. ___
  1300. sub XOP_512_00_47 () {
  1301. my $j = shift;
  1302. my $body = shift;
  1303. my @X = @_;
  1304. my @insns = (&$body,&$body); # 52 instructions
  1305. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
  1306. eval(shift(@insns));
  1307. eval(shift(@insns));
  1308. &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
  1309. eval(shift(@insns));
  1310. eval(shift(@insns));
  1311. &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
  1312. eval(shift(@insns));
  1313. eval(shift(@insns));
  1314. &vpsrlq ($t0,$t0,$sigma0[2]);
  1315. eval(shift(@insns));
  1316. eval(shift(@insns));
  1317. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
  1318. eval(shift(@insns));
  1319. eval(shift(@insns));
  1320. eval(shift(@insns));
  1321. eval(shift(@insns));
  1322. &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1323. eval(shift(@insns));
  1324. eval(shift(@insns));
  1325. &vpxor ($t0,$t0,$t1);
  1326. eval(shift(@insns));
  1327. eval(shift(@insns));
  1328. eval(shift(@insns));
  1329. eval(shift(@insns));
  1330. &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
  1331. eval(shift(@insns));
  1332. eval(shift(@insns));
  1333. &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
  1334. eval(shift(@insns));
  1335. eval(shift(@insns));
  1336. &vpsrlq ($t2,@X[7],$sigma1[2]);
  1337. eval(shift(@insns));
  1338. eval(shift(@insns));
  1339. &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
  1340. eval(shift(@insns));
  1341. eval(shift(@insns));
  1342. &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1343. eval(shift(@insns));
  1344. eval(shift(@insns));
  1345. &vpxor ($t3,$t3,$t2);
  1346. eval(shift(@insns));
  1347. eval(shift(@insns));
  1348. eval(shift(@insns));
  1349. eval(shift(@insns));
  1350. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1351. eval(shift(@insns));
  1352. eval(shift(@insns));
  1353. eval(shift(@insns));
  1354. eval(shift(@insns));
  1355. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1356. eval(shift(@insns));
  1357. eval(shift(@insns));
  1358. eval(shift(@insns));
  1359. eval(shift(@insns));
  1360. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
  1361. foreach (@insns) { eval; } # remaining instructions
  1362. &vmovdqa (16*$j."(%rsp)",$t2);
  1363. }
  1364. for ($i=0,$j=0; $j<8; $j++) {
  1365. &XOP_512_00_47($j,\&body_00_15,@X);
  1366. push(@X,shift(@X)); # rotate(@X)
  1367. }
  1368. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1369. &jne (".Lxop_00_47");
  1370. for ($i=0; $i<16; ) {
  1371. foreach(body_00_15()) { eval; }
  1372. }
  1373. }
  1374. $code.=<<___;
  1375. mov $_ctx,$ctx
  1376. mov $a1,$A
  1377. add $SZ*0($ctx),$A
  1378. lea 16*$SZ($inp),$inp
  1379. add $SZ*1($ctx),$B
  1380. add $SZ*2($ctx),$C
  1381. add $SZ*3($ctx),$D
  1382. add $SZ*4($ctx),$E
  1383. add $SZ*5($ctx),$F
  1384. add $SZ*6($ctx),$G
  1385. add $SZ*7($ctx),$H
  1386. cmp $_end,$inp
  1387. mov $A,$SZ*0($ctx)
  1388. mov $B,$SZ*1($ctx)
  1389. mov $C,$SZ*2($ctx)
  1390. mov $D,$SZ*3($ctx)
  1391. mov $E,$SZ*4($ctx)
  1392. mov $F,$SZ*5($ctx)
  1393. mov $G,$SZ*6($ctx)
  1394. mov $H,$SZ*7($ctx)
  1395. jb .Lloop_xop
  1396. mov $_rsp,%rsi
  1397. .cfi_def_cfa %rsi,8
  1398. vzeroupper
  1399. ___
  1400. $code.=<<___ if ($win64);
  1401. movaps 16*$SZ+32(%rsp),%xmm6
  1402. movaps 16*$SZ+48(%rsp),%xmm7
  1403. movaps 16*$SZ+64(%rsp),%xmm8
  1404. movaps 16*$SZ+80(%rsp),%xmm9
  1405. ___
  1406. $code.=<<___ if ($win64 && $SZ>4);
  1407. movaps 16*$SZ+96(%rsp),%xmm10
  1408. movaps 16*$SZ+112(%rsp),%xmm11
  1409. ___
  1410. $code.=<<___;
  1411. mov -48(%rsi),%r15
  1412. .cfi_restore %r15
  1413. mov -40(%rsi),%r14
  1414. .cfi_restore %r14
  1415. mov -32(%rsi),%r13
  1416. .cfi_restore %r13
  1417. mov -24(%rsi),%r12
  1418. .cfi_restore %r12
  1419. mov -16(%rsi),%rbp
  1420. .cfi_restore %rbp
  1421. mov -8(%rsi),%rbx
  1422. .cfi_restore %rbx
  1423. lea (%rsi),%rsp
  1424. .cfi_def_cfa_register %rsp
  1425. .Lepilogue_xop:
  1426. ret
  1427. .cfi_endproc
  1428. .size ${func}_xop,.-${func}_xop
  1429. ___
  1430. }
  1431. ######################################################################
  1432. # AVX+shrd code path
  1433. #
1434. local *ror = sub { &shrd($_[0],@_) };
  1435. $code.=<<___;
  1436. .type ${func}_avx,\@function,3
  1437. .align 64
  1438. ${func}_avx:
  1439. .cfi_startproc
  1440. .Lavx_shortcut:
  1441. mov %rsp,%rax # copy %rsp
  1442. .cfi_def_cfa_register %rax
  1443. push %rbx
  1444. .cfi_push %rbx
  1445. push %rbp
  1446. .cfi_push %rbp
  1447. push %r12
  1448. .cfi_push %r12
  1449. push %r13
  1450. .cfi_push %r13
  1451. push %r14
  1452. .cfi_push %r14
  1453. push %r15
  1454. .cfi_push %r15
  1455. shl \$4,%rdx # num*16
  1456. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1457. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1458. and \$-64,%rsp # align stack frame
  1459. mov $ctx,$_ctx # save ctx, 1st arg
1460. mov $inp,$_inp # save inp, 2nd arg
  1461. mov %rdx,$_end # save end pointer, "3rd" arg
  1462. mov %rax,$_rsp # save copy of %rsp
  1463. .cfi_cfa_expression $_rsp,deref,+8
  1464. ___
  1465. $code.=<<___ if ($win64);
  1466. movaps %xmm6,16*$SZ+32(%rsp)
  1467. movaps %xmm7,16*$SZ+48(%rsp)
  1468. movaps %xmm8,16*$SZ+64(%rsp)
  1469. movaps %xmm9,16*$SZ+80(%rsp)
  1470. ___
  1471. $code.=<<___ if ($win64 && $SZ>4);
  1472. movaps %xmm10,16*$SZ+96(%rsp)
  1473. movaps %xmm11,16*$SZ+112(%rsp)
  1474. ___
  1475. $code.=<<___;
  1476. .Lprologue_avx:
  1477. vzeroupper
  1478. mov $SZ*0($ctx),$A
  1479. mov $SZ*1($ctx),$B
  1480. mov $SZ*2($ctx),$C
  1481. mov $SZ*3($ctx),$D
  1482. mov $SZ*4($ctx),$E
  1483. mov $SZ*5($ctx),$F
  1484. mov $SZ*6($ctx),$G
  1485. mov $SZ*7($ctx),$H
  1486. ___
  1487. if ($SZ==4) { # SHA256
  1488. my @X = map("%xmm$_",(0..3));
  1489. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  1490. $code.=<<___;
  1491. vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  1492. vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  1493. jmp .Lloop_avx
  1494. .align 16
  1495. .Lloop_avx:
  1496. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1497. vmovdqu 0x00($inp),@X[0]
  1498. vmovdqu 0x10($inp),@X[1]
  1499. vmovdqu 0x20($inp),@X[2]
  1500. vmovdqu 0x30($inp),@X[3]
  1501. vpshufb $t3,@X[0],@X[0]
  1502. lea $TABLE(%rip),$Tbl
  1503. vpshufb $t3,@X[1],@X[1]
  1504. vpshufb $t3,@X[2],@X[2]
  1505. vpaddd 0x00($Tbl),@X[0],$t0
  1506. vpshufb $t3,@X[3],@X[3]
  1507. vpaddd 0x20($Tbl),@X[1],$t1
  1508. vpaddd 0x40($Tbl),@X[2],$t2
  1509. vpaddd 0x60($Tbl),@X[3],$t3
  1510. vmovdqa $t0,0x00(%rsp)
  1511. mov $A,$a1
  1512. vmovdqa $t1,0x10(%rsp)
  1513. mov $B,$a3
  1514. vmovdqa $t2,0x20(%rsp)
  1515. xor $C,$a3 # magic
  1516. vmovdqa $t3,0x30(%rsp)
  1517. mov $E,$a0
  1518. jmp .Lavx_00_47
  1519. .align 16
  1520. .Lavx_00_47:
  1521. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1522. ___
  1523. sub Xupdate_256_AVX () {
  1524. (
  1525. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
  1526. '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
  1527. '&vpsrld ($t2,$t0,$sigma0[0]);',
  1528. '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
  1529. '&vpsrld ($t3,$t0,$sigma0[2])',
  1530. '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
  1531. '&vpxor ($t0,$t3,$t2)',
  1532. '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
  1533. '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1534. '&vpxor ($t0,$t0,$t1)',
  1535. '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1536. '&vpxor ($t0,$t0,$t2)',
  1537. '&vpsrld ($t2,$t3,$sigma1[2]);',
  1538. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
  1539. '&vpsrlq ($t3,$t3,$sigma1[0]);',
  1540. '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
  1541. '&vpxor ($t2,$t2,$t3);',
  1542. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1543. '&vpxor ($t2,$t2,$t3)',
  1544. '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
  1545. '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
  1546. '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
  1547. '&vpsrld ($t2,$t3,$sigma1[2])',
  1548. '&vpsrlq ($t3,$t3,$sigma1[0])',
  1549. '&vpxor ($t2,$t2,$t3);',
  1550. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1551. '&vpxor ($t2,$t2,$t3)',
  1552. '&vpshufb ($t2,$t2,$t5)',
  1553. '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
  1554. );
  1555. }
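# A scalar cross-check of what the vectorized schedule above computes per
# 32-bit word.  These ref_* helpers are hypothetical additions (not part of
# the original generator and never called); they are only meaningful for the
# SHA-256 flavour, where @sigma0/@sigma1 hold (rotate, rotate, shift) amounts
# per FIPS 180-4.
sub ref_rotr32 {
    my ($x,$n) = @_;
    $x &= 0xffffffff;
    (($x>>$n) | ($x<<(32-$n))) & 0xffffffff;    # rotate right by $n, 0<$n<32
}
sub ref_sigma0_256 {
    my $x = shift() & 0xffffffff;
    ref_rotr32($x,$sigma0[0]) ^ ref_rotr32($x,$sigma0[1]) ^ ($x>>$sigma0[2]);
}
sub ref_sigma1_256 {
    my $x = shift() & 0xffffffff;
    ref_rotr32($x,$sigma1[0]) ^ ref_rotr32($x,$sigma1[1]) ^ ($x>>$sigma1[2]);
}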
  1556. sub AVX_256_00_47 () {
  1557. my $j = shift;
  1558. my $body = shift;
  1559. my @X = @_;
  1560. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1561. foreach (Xupdate_256_AVX()) { # 29 instructions
  1562. eval;
  1563. eval(shift(@insns));
  1564. eval(shift(@insns));
  1565. eval(shift(@insns));
  1566. }
  1567. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1568. foreach (@insns) { eval; } # remaining instructions
  1569. &vmovdqa (16*$j."(%rsp)",$t2);
  1570. }
  1571. for ($i=0,$j=0; $j<4; $j++) {
  1572. &AVX_256_00_47($j,\&body_00_15,@X);
  1573. push(@X,shift(@X)); # rotate(@X)
  1574. }
  1575. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1576. &jne (".Lavx_00_47");
  1577. for ($i=0; $i<16; ) {
  1578. foreach(body_00_15()) { eval; }
  1579. }
  1580. } else { # SHA512
  1581. my @X = map("%xmm$_",(0..7));
  1582. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  1583. $code.=<<___;
  1584. jmp .Lloop_avx
  1585. .align 16
  1586. .Lloop_avx:
  1587. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1588. vmovdqu 0x00($inp),@X[0]
  1589. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1590. vmovdqu 0x10($inp),@X[1]
  1591. vmovdqu 0x20($inp),@X[2]
  1592. vpshufb $t3,@X[0],@X[0]
  1593. vmovdqu 0x30($inp),@X[3]
  1594. vpshufb $t3,@X[1],@X[1]
  1595. vmovdqu 0x40($inp),@X[4]
  1596. vpshufb $t3,@X[2],@X[2]
  1597. vmovdqu 0x50($inp),@X[5]
  1598. vpshufb $t3,@X[3],@X[3]
  1599. vmovdqu 0x60($inp),@X[6]
  1600. vpshufb $t3,@X[4],@X[4]
  1601. vmovdqu 0x70($inp),@X[7]
  1602. vpshufb $t3,@X[5],@X[5]
  1603. vpaddq -0x80($Tbl),@X[0],$t0
  1604. vpshufb $t3,@X[6],@X[6]
  1605. vpaddq -0x60($Tbl),@X[1],$t1
  1606. vpshufb $t3,@X[7],@X[7]
  1607. vpaddq -0x40($Tbl),@X[2],$t2
  1608. vpaddq -0x20($Tbl),@X[3],$t3
  1609. vmovdqa $t0,0x00(%rsp)
  1610. vpaddq 0x00($Tbl),@X[4],$t0
  1611. vmovdqa $t1,0x10(%rsp)
  1612. vpaddq 0x20($Tbl),@X[5],$t1
  1613. vmovdqa $t2,0x20(%rsp)
  1614. vpaddq 0x40($Tbl),@X[6],$t2
  1615. vmovdqa $t3,0x30(%rsp)
  1616. vpaddq 0x60($Tbl),@X[7],$t3
  1617. vmovdqa $t0,0x40(%rsp)
  1618. mov $A,$a1
  1619. vmovdqa $t1,0x50(%rsp)
  1620. mov $B,$a3
  1621. vmovdqa $t2,0x60(%rsp)
  1622. xor $C,$a3 # magic
  1623. vmovdqa $t3,0x70(%rsp)
  1624. mov $E,$a0
  1625. jmp .Lavx_00_47
  1626. .align 16
  1627. .Lavx_00_47:
  1628. add \$`16*2*$SZ`,$Tbl
  1629. ___
  1630. sub Xupdate_512_AVX () {
  1631. (
  1632. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
  1633. '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
  1634. '&vpsrlq ($t2,$t0,$sigma0[0])',
  1635. '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
  1636. '&vpsrlq ($t3,$t0,$sigma0[2])',
  1637. '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
  1638. '&vpxor ($t0,$t3,$t2)',
  1639. '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1640. '&vpxor ($t0,$t0,$t1)',
  1641. '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1642. '&vpxor ($t0,$t0,$t2)',
  1643. '&vpsrlq ($t3,@X[7],$sigma1[2]);',
  1644. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
  1645. '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
  1646. '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
  1647. '&vpsrlq ($t1,@X[7],$sigma1[0]);',
  1648. '&vpxor ($t3,$t3,$t2)',
  1649. '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
  1650. '&vpxor ($t3,$t3,$t1)',
  1651. '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
  1652. '&vpxor ($t3,$t3,$t2)',
  1653. '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
  1654. '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  1655. );
  1656. }
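# Same idea for the 64-bit schedule: AVX has no per-lane 64-bit rotate, so
# every ROTR above is built from a vpsrlq/vpsllq pair merged with vpxor.
# Hypothetical scalar reference (never called by the generator; assumes a
# 64-bit perl and is only meaningful for the SHA-512 flavour):
sub ref_rotr64 {
    my ($x,$n) = @_;
    (($x>>$n) | ($x<<(64-$n))) & 0xffffffffffffffff;    # rotate right by $n, 0<$n<64
}
sub ref_sigma0_512 {
    my $x = shift;
    ref_rotr64($x,$sigma0[0]) ^ ref_rotr64($x,$sigma0[1]) ^ ($x>>$sigma0[2]);
}
sub ref_sigma1_512 {
    my $x = shift;
    ref_rotr64($x,$sigma1[0]) ^ ref_rotr64($x,$sigma1[1]) ^ ($x>>$sigma1[2]);
}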
  1657. sub AVX_512_00_47 () {
  1658. my $j = shift;
  1659. my $body = shift;
  1660. my @X = @_;
  1661. my @insns = (&$body,&$body); # 52 instructions
  1662. foreach (Xupdate_512_AVX()) { # 23 instructions
  1663. eval;
  1664. eval(shift(@insns));
  1665. eval(shift(@insns));
  1666. }
  1667. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
  1668. foreach (@insns) { eval; } # remaining instructions
  1669. &vmovdqa (16*$j."(%rsp)",$t2);
  1670. }
  1671. for ($i=0,$j=0; $j<8; $j++) {
  1672. &AVX_512_00_47($j,\&body_00_15,@X);
  1673. push(@X,shift(@X)); # rotate(@X)
  1674. }
  1675. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1676. &jne (".Lavx_00_47");
  1677. for ($i=0; $i<16; ) {
  1678. foreach(body_00_15()) { eval; }
  1679. }
  1680. }
  1681. $code.=<<___;
  1682. mov $_ctx,$ctx
  1683. mov $a1,$A
  1684. add $SZ*0($ctx),$A
  1685. lea 16*$SZ($inp),$inp
  1686. add $SZ*1($ctx),$B
  1687. add $SZ*2($ctx),$C
  1688. add $SZ*3($ctx),$D
  1689. add $SZ*4($ctx),$E
  1690. add $SZ*5($ctx),$F
  1691. add $SZ*6($ctx),$G
  1692. add $SZ*7($ctx),$H
  1693. cmp $_end,$inp
  1694. mov $A,$SZ*0($ctx)
  1695. mov $B,$SZ*1($ctx)
  1696. mov $C,$SZ*2($ctx)
  1697. mov $D,$SZ*3($ctx)
  1698. mov $E,$SZ*4($ctx)
  1699. mov $F,$SZ*5($ctx)
  1700. mov $G,$SZ*6($ctx)
  1701. mov $H,$SZ*7($ctx)
  1702. jb .Lloop_avx
  1703. mov $_rsp,%rsi
  1704. .cfi_def_cfa %rsi,8
  1705. vzeroupper
  1706. ___
  1707. $code.=<<___ if ($win64);
  1708. movaps 16*$SZ+32(%rsp),%xmm6
  1709. movaps 16*$SZ+48(%rsp),%xmm7
  1710. movaps 16*$SZ+64(%rsp),%xmm8
  1711. movaps 16*$SZ+80(%rsp),%xmm9
  1712. ___
  1713. $code.=<<___ if ($win64 && $SZ>4);
  1714. movaps 16*$SZ+96(%rsp),%xmm10
  1715. movaps 16*$SZ+112(%rsp),%xmm11
  1716. ___
  1717. $code.=<<___;
  1718. mov -48(%rsi),%r15
  1719. .cfi_restore %r15
  1720. mov -40(%rsi),%r14
  1721. .cfi_restore %r14
  1722. mov -32(%rsi),%r13
  1723. .cfi_restore %r13
  1724. mov -24(%rsi),%r12
  1725. .cfi_restore %r12
  1726. mov -16(%rsi),%rbp
  1727. .cfi_restore %rbp
  1728. mov -8(%rsi),%rbx
  1729. .cfi_restore %rbx
  1730. lea (%rsi),%rsp
  1731. .cfi_def_cfa_register %rsp
  1732. .Lepilogue_avx:
  1733. ret
  1734. .cfi_endproc
  1735. .size ${func}_avx,.-${func}_avx
  1736. ___
  1737. if ($avx>1) {{
  1738. ######################################################################
  1739. # AVX2+BMI code path
  1740. #
  1741. my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
  1742. my $PUSH8=8*2*$SZ;
  1743. use integer;
  1744. sub bodyx_00_15 () {
1745. # at start $a1 should be zero, $a3 - $b^$c and $a4 - copy of $f
  1746. (
  1747. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  1748. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
  1749. '&and ($a4,$e)', # f&e
  1750. '&rorx ($a0,$e,$Sigma1[2])',
  1751. '&rorx ($a2,$e,$Sigma1[1])',
  1752. '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
  1753. '&lea ($h,"($h,$a4)")',
  1754. '&andn ($a4,$e,$g)', # ~e&g
  1755. '&xor ($a0,$a2)',
  1756. '&rorx ($a1,$e,$Sigma1[0])',
  1757. '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
  1758. '&xor ($a0,$a1)', # Sigma1(e)
  1759. '&mov ($a2,$a)',
  1760. '&rorx ($a4,$a,$Sigma0[2])',
  1761. '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
  1762. '&xor ($a2,$b)', # a^b, b^c in next round
  1763. '&rorx ($a1,$a,$Sigma0[1])',
  1764. '&rorx ($a0,$a,$Sigma0[0])',
  1765. '&lea ($d,"($d,$h)")', # d+=h
  1766. '&and ($a3,$a2)', # (b^c)&(a^b)
  1767. '&xor ($a1,$a4)',
  1768. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  1769. '&xor ($a1,$a0)', # Sigma0(a)
  1770. '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
  1771. '&mov ($a4,$e)', # copy of f in future
  1772. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  1773. );
  1774. # and at the finish one has to $a+=$a1
  1775. }
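# Two boolean identities carry the round above (ref_* helpers below are a
# hypothetical cross-checking sketch, never called by the generator):
#   Ch(e,f,g)  = (e&f)^(~e&g)    - the two terms share no set bits, so they
#                                  can be accumulated with lea/add;
#   Maj(a,b,c) = ((a^b)&(b^c))^b - hence the "magic" b^c value recycled in
#                                  $a3 from one round to the next.
# Sigma0(a) is only folded into its h one round late (the "from the past"
# comment), which is why the caller finishes with $a+=$a1 as noted above.
sub ref_ch  { my ($e,$f,$g) = @_; ($e&$f)^(~$e&$g); }
sub ref_maj { my ($a,$b,$c) = @_; ($a&$b)^($a&$c)^($b&$c); }    # == ((a^b)&(b^c))^b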
  1776. $code.=<<___;
  1777. .type ${func}_avx2,\@function,3
  1778. .align 64
  1779. ${func}_avx2:
  1780. .cfi_startproc
  1781. .Lavx2_shortcut:
  1782. mov %rsp,%rax # copy %rsp
  1783. .cfi_def_cfa_register %rax
  1784. push %rbx
  1785. .cfi_push %rbx
  1786. push %rbp
  1787. .cfi_push %rbp
  1788. push %r12
  1789. .cfi_push %r12
  1790. push %r13
  1791. .cfi_push %r13
  1792. push %r14
  1793. .cfi_push %r14
  1794. push %r15
  1795. .cfi_push %r15
  1796. sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
  1797. shl \$4,%rdx # num*16
  1798. and \$-256*$SZ,%rsp # align stack frame
  1799. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1800. add \$`2*$SZ*($rounds-8)`,%rsp
  1801. mov $ctx,$_ctx # save ctx, 1st arg
1802. mov $inp,$_inp # save inp, 2nd arg
  1803. mov %rdx,$_end # save end pointer, "3rd" arg
  1804. mov %rax,$_rsp # save copy of %rsp
  1805. .cfi_cfa_expression $_rsp,deref,+8
  1806. ___
  1807. $code.=<<___ if ($win64);
  1808. movaps %xmm6,16*$SZ+32(%rsp)
  1809. movaps %xmm7,16*$SZ+48(%rsp)
  1810. movaps %xmm8,16*$SZ+64(%rsp)
  1811. movaps %xmm9,16*$SZ+80(%rsp)
  1812. ___
  1813. $code.=<<___ if ($win64 && $SZ>4);
  1814. movaps %xmm10,16*$SZ+96(%rsp)
  1815. movaps %xmm11,16*$SZ+112(%rsp)
  1816. ___
  1817. $code.=<<___;
  1818. .Lprologue_avx2:
  1819. vzeroupper
  1820. sub \$-16*$SZ,$inp # inp++, size optimization
  1821. mov $SZ*0($ctx),$A
  1822. mov $inp,%r12 # borrow $T1
  1823. mov $SZ*1($ctx),$B
  1824. cmp %rdx,$inp # $_end
  1825. mov $SZ*2($ctx),$C
  1826. cmove %rsp,%r12 # next block or random data
  1827. mov $SZ*3($ctx),$D
  1828. mov $SZ*4($ctx),$E
  1829. mov $SZ*5($ctx),$F
  1830. mov $SZ*6($ctx),$G
  1831. mov $SZ*7($ctx),$H
  1832. ___
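# The AVX2 flavour works on two input blocks per .Loop_avx2 iteration: every
# 16-byte load from $inp is paired, via vinserti128 through %r12, with the
# matching 16 bytes of the following block, so the low 128-bit lane of each
# ymm register carries block n and the high lane block n+1.  When there is no
# following block, %r12 is cmove'd to %rsp and the high lane holds don't-care
# data.  The main rounds consume the low-lane words; the high-lane words stay
# on the stack and are consumed afterwards by the .Lower_avx2 pass.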
  1833. if ($SZ==4) { # SHA256
  1834. my @X = map("%ymm$_",(0..3));
  1835. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
  1836. $code.=<<___;
  1837. vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  1838. vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  1839. jmp .Loop_avx2
  1840. .align 16
  1841. .Loop_avx2:
  1842. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1843. vmovdqu -16*$SZ+0($inp),%xmm0
  1844. vmovdqu -16*$SZ+16($inp),%xmm1
  1845. vmovdqu -16*$SZ+32($inp),%xmm2
  1846. vmovdqu -16*$SZ+48($inp),%xmm3
  1847. #mov $inp,$_inp # offload $inp
  1848. vinserti128 \$1,(%r12),@X[0],@X[0]
  1849. vinserti128 \$1,16(%r12),@X[1],@X[1]
  1850. vpshufb $t3,@X[0],@X[0]
  1851. vinserti128 \$1,32(%r12),@X[2],@X[2]
  1852. vpshufb $t3,@X[1],@X[1]
  1853. vinserti128 \$1,48(%r12),@X[3],@X[3]
  1854. lea $TABLE(%rip),$Tbl
  1855. vpshufb $t3,@X[2],@X[2]
  1856. vpaddd 0x00($Tbl),@X[0],$t0
  1857. vpshufb $t3,@X[3],@X[3]
  1858. vpaddd 0x20($Tbl),@X[1],$t1
  1859. vpaddd 0x40($Tbl),@X[2],$t2
  1860. vpaddd 0x60($Tbl),@X[3],$t3
  1861. vmovdqa $t0,0x00(%rsp)
  1862. xor $a1,$a1
  1863. vmovdqa $t1,0x20(%rsp)
  1864. ___
  1865. $code.=<<___ if (!$win64);
  1866. # temporarily use %rdi as frame pointer
  1867. mov $_rsp,%rdi
  1868. .cfi_def_cfa %rdi,8
  1869. ___
  1870. $code.=<<___;
  1871. lea -$PUSH8(%rsp),%rsp
  1872. ___
  1873. $code.=<<___ if (!$win64);
  1874. # the frame info is at $_rsp, but the stack is moving...
  1875. # so a second frame pointer is saved at -8(%rsp)
  1876. # that is in the red zone
  1877. mov %rdi,-8(%rsp)
  1878. .cfi_cfa_expression %rsp-8,deref,+8
  1879. ___
  1880. $code.=<<___;
  1881. mov $B,$a3
  1882. vmovdqa $t2,0x00(%rsp)
  1883. xor $C,$a3 # magic
  1884. vmovdqa $t3,0x20(%rsp)
  1885. mov $F,$a4
  1886. sub \$-16*2*$SZ,$Tbl # size optimization
  1887. jmp .Lavx2_00_47
  1888. .align 16
  1889. .Lavx2_00_47:
  1890. ___
  1891. sub AVX2_256_00_47 () {
  1892. my $j = shift;
  1893. my $body = shift;
  1894. my @X = @_;
  1895. my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
  1896. my $base = "+2*$PUSH8(%rsp)";
  1897. if (($j%2)==0) {
  1898. &lea ("%rsp","-$PUSH8(%rsp)");
  1899. $code.=<<___ if (!$win64);
  1900. .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
  1901. # copy secondary frame pointer to new location again at -8(%rsp)
  1902. pushq $PUSH8-8(%rsp)
  1903. .cfi_cfa_expression %rsp,deref,+8
  1904. lea 8(%rsp),%rsp
  1905. .cfi_cfa_expression %rsp-8,deref,+8
  1906. ___
  1907. }
  1908. foreach (Xupdate_256_AVX()) { # 29 instructions
  1909. eval;
  1910. eval(shift(@insns));
  1911. eval(shift(@insns));
  1912. eval(shift(@insns));
  1913. }
  1914. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1915. foreach (@insns) { eval; } # remaining instructions
  1916. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  1917. }
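# Note the moving frame: before every second Xupdate (every fourth for
# SHA-512) the stack is grown downwards by $PUSH8 bytes so that freshly
# computed X[i]+K[i] vectors land below the ones the in-flight rounds are
# still reading, and the copy of the real frame pointer kept at -8(%rsp) is
# pushed along so the .cfi_cfa_expression keeps resolving to $_rsp.  The
# round bodies address this area through $base, which is why $base is
# redefined for each group of rounds.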
  1918. for ($i=0,$j=0; $j<4; $j++) {
  1919. &AVX2_256_00_47($j,\&bodyx_00_15,@X);
  1920. push(@X,shift(@X)); # rotate(@X)
  1921. }
  1922. &lea ($Tbl,16*2*$SZ."($Tbl)");
  1923. &cmpb (($SZ-1)."($Tbl)",0);
  1924. &jne (".Lavx2_00_47");
  1925. for ($i=0; $i<16; ) {
  1926. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  1927. foreach(bodyx_00_15()) { eval; }
  1928. }
  1929. } else { # SHA512
  1930. my @X = map("%ymm$_",(0..7));
  1931. my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
  1932. $code.=<<___;
  1933. jmp .Loop_avx2
  1934. .align 16
  1935. .Loop_avx2:
  1936. vmovdqu -16*$SZ($inp),%xmm0
  1937. vmovdqu -16*$SZ+16($inp),%xmm1
  1938. vmovdqu -16*$SZ+32($inp),%xmm2
  1939. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1940. vmovdqu -16*$SZ+48($inp),%xmm3
  1941. vmovdqu -16*$SZ+64($inp),%xmm4
  1942. vmovdqu -16*$SZ+80($inp),%xmm5
  1943. vmovdqu -16*$SZ+96($inp),%xmm6
  1944. vmovdqu -16*$SZ+112($inp),%xmm7
  1945. #mov $inp,$_inp # offload $inp
  1946. vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
  1947. vinserti128 \$1,(%r12),@X[0],@X[0]
  1948. vinserti128 \$1,16(%r12),@X[1],@X[1]
  1949. vpshufb $t2,@X[0],@X[0]
  1950. vinserti128 \$1,32(%r12),@X[2],@X[2]
  1951. vpshufb $t2,@X[1],@X[1]
  1952. vinserti128 \$1,48(%r12),@X[3],@X[3]
  1953. vpshufb $t2,@X[2],@X[2]
  1954. vinserti128 \$1,64(%r12),@X[4],@X[4]
  1955. vpshufb $t2,@X[3],@X[3]
  1956. vinserti128 \$1,80(%r12),@X[5],@X[5]
  1957. vpshufb $t2,@X[4],@X[4]
  1958. vinserti128 \$1,96(%r12),@X[6],@X[6]
  1959. vpshufb $t2,@X[5],@X[5]
  1960. vinserti128 \$1,112(%r12),@X[7],@X[7]
  1961. vpaddq -0x80($Tbl),@X[0],$t0
  1962. vpshufb $t2,@X[6],@X[6]
  1963. vpaddq -0x60($Tbl),@X[1],$t1
  1964. vpshufb $t2,@X[7],@X[7]
  1965. vpaddq -0x40($Tbl),@X[2],$t2
  1966. vpaddq -0x20($Tbl),@X[3],$t3
  1967. vmovdqa $t0,0x00(%rsp)
  1968. vpaddq 0x00($Tbl),@X[4],$t0
  1969. vmovdqa $t1,0x20(%rsp)
  1970. vpaddq 0x20($Tbl),@X[5],$t1
  1971. vmovdqa $t2,0x40(%rsp)
  1972. vpaddq 0x40($Tbl),@X[6],$t2
  1973. vmovdqa $t3,0x60(%rsp)
  1974. ___
  1975. $code.=<<___ if (!$win64);
  1976. # temporarily use %rdi as frame pointer
  1977. mov $_rsp,%rdi
  1978. .cfi_def_cfa %rdi,8
  1979. ___
  1980. $code.=<<___;
  1981. lea -$PUSH8(%rsp),%rsp
  1982. ___
  1983. $code.=<<___ if (!$win64);
  1984. # the frame info is at $_rsp, but the stack is moving...
  1985. # so a second frame pointer is saved at -8(%rsp)
  1986. # that is in the red zone
  1987. mov %rdi,-8(%rsp)
  1988. .cfi_cfa_expression %rsp-8,deref,+8
  1989. ___
  1990. $code.=<<___;
  1991. vpaddq 0x60($Tbl),@X[7],$t3
  1992. vmovdqa $t0,0x00(%rsp)
  1993. xor $a1,$a1
  1994. vmovdqa $t1,0x20(%rsp)
  1995. mov $B,$a3
  1996. vmovdqa $t2,0x40(%rsp)
  1997. xor $C,$a3 # magic
  1998. vmovdqa $t3,0x60(%rsp)
  1999. mov $F,$a4
  2000. add \$16*2*$SZ,$Tbl
  2001. jmp .Lavx2_00_47
  2002. .align 16
  2003. .Lavx2_00_47:
  2004. ___
  2005. sub AVX2_512_00_47 () {
  2006. my $j = shift;
  2007. my $body = shift;
  2008. my @X = @_;
  2009. my @insns = (&$body,&$body); # 48 instructions
  2010. my $base = "+2*$PUSH8(%rsp)";
  2011. if (($j%4)==0) {
  2012. &lea ("%rsp","-$PUSH8(%rsp)");
  2013. $code.=<<___ if (!$win64);
  2014. .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
  2015. # copy secondary frame pointer to new location again at -8(%rsp)
  2016. pushq $PUSH8-8(%rsp)
  2017. .cfi_cfa_expression %rsp,deref,+8
  2018. lea 8(%rsp),%rsp
  2019. .cfi_cfa_expression %rsp-8,deref,+8
  2020. ___
  2021. }
  2022. foreach (Xupdate_512_AVX()) { # 23 instructions
  2023. eval;
  2024. if ($_ !~ /\;$/) {
  2025. eval(shift(@insns));
  2026. eval(shift(@insns));
  2027. eval(shift(@insns));
  2028. }
  2029. }
  2030. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
  2031. foreach (@insns) { eval; } # remaining instructions
  2032. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  2033. }
  2034. for ($i=0,$j=0; $j<8; $j++) {
  2035. &AVX2_512_00_47($j,\&bodyx_00_15,@X);
  2036. push(@X,shift(@X)); # rotate(@X)
  2037. }
  2038. &lea ($Tbl,16*2*$SZ."($Tbl)");
  2039. &cmpb (($SZ-1-0x80)."($Tbl)",0);
  2040. &jne (".Lavx2_00_47");
  2041. for ($i=0; $i<16; ) {
  2042. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  2043. foreach(bodyx_00_15()) { eval; }
  2044. }
  2045. }
  2046. $code.=<<___;
  2047. mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
  2048. add $a1,$A
  2049. #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
  2050. lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
  2051. add $SZ*0($ctx),$A
  2052. add $SZ*1($ctx),$B
  2053. add $SZ*2($ctx),$C
  2054. add $SZ*3($ctx),$D
  2055. add $SZ*4($ctx),$E
  2056. add $SZ*5($ctx),$F
  2057. add $SZ*6($ctx),$G
  2058. add $SZ*7($ctx),$H
  2059. mov $A,$SZ*0($ctx)
  2060. mov $B,$SZ*1($ctx)
  2061. mov $C,$SZ*2($ctx)
  2062. mov $D,$SZ*3($ctx)
  2063. mov $E,$SZ*4($ctx)
  2064. mov $F,$SZ*5($ctx)
  2065. mov $G,$SZ*6($ctx)
  2066. mov $H,$SZ*7($ctx)
  2067. cmp `$PUSH8+2*8`($Tbl),$inp # $_end
  2068. je .Ldone_avx2
  2069. xor $a1,$a1
  2070. mov $B,$a3
  2071. xor $C,$a3 # magic
  2072. mov $F,$a4
  2073. jmp .Lower_avx2
  2074. .align 16
  2075. .Lower_avx2:
  2076. ___
  2077. for ($i=0; $i<8; ) {
  2078. my $base="+16($Tbl)";
  2079. foreach(bodyx_00_15()) { eval; }
  2080. }
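# Each pass through .Lower_avx2 runs eight rounds against the high-lane
# (second block) words that the main loop left on the stack; $Tbl is reused
# as a cursor into those frames, addressed via "+16($Tbl)", and is stepped
# back by one $PUSH8 frame per pass until it meets %rsp.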
  2081. $code.=<<___;
  2082. lea -$PUSH8($Tbl),$Tbl
  2083. cmp %rsp,$Tbl
  2084. jae .Lower_avx2
  2085. mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
  2086. add $a1,$A
  2087. #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
  2088. lea `2*$SZ*($rounds-8)`(%rsp),%rsp
  2089. # restore frame pointer to original location at $_rsp
  2090. .cfi_cfa_expression $_rsp,deref,+8
  2091. add $SZ*0($ctx),$A
  2092. add $SZ*1($ctx),$B
  2093. add $SZ*2($ctx),$C
  2094. add $SZ*3($ctx),$D
  2095. add $SZ*4($ctx),$E
  2096. add $SZ*5($ctx),$F
  2097. lea `2*16*$SZ`($inp),$inp # inp+=2
  2098. add $SZ*6($ctx),$G
  2099. mov $inp,%r12
  2100. add $SZ*7($ctx),$H
  2101. cmp $_end,$inp
  2102. mov $A,$SZ*0($ctx)
  2103. cmove %rsp,%r12 # next block or stale data
  2104. mov $B,$SZ*1($ctx)
  2105. mov $C,$SZ*2($ctx)
  2106. mov $D,$SZ*3($ctx)
  2107. mov $E,$SZ*4($ctx)
  2108. mov $F,$SZ*5($ctx)
  2109. mov $G,$SZ*6($ctx)
  2110. mov $H,$SZ*7($ctx)
  2111. jbe .Loop_avx2
  2112. lea (%rsp),$Tbl
  2113. # temporarily use $Tbl as index to $_rsp
  2114. # this avoids the need to save a secondary frame pointer at -8(%rsp)
  2115. .cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8
  2116. .Ldone_avx2:
  2117. mov `16*$SZ+3*8`($Tbl),%rsi
  2118. .cfi_def_cfa %rsi,8
  2119. vzeroupper
  2120. ___
  2121. $code.=<<___ if ($win64);
  2122. movaps 16*$SZ+32($Tbl),%xmm6
  2123. movaps 16*$SZ+48($Tbl),%xmm7
  2124. movaps 16*$SZ+64($Tbl),%xmm8
  2125. movaps 16*$SZ+80($Tbl),%xmm9
  2126. ___
  2127. $code.=<<___ if ($win64 && $SZ>4);
  2128. movaps 16*$SZ+96($Tbl),%xmm10
  2129. movaps 16*$SZ+112($Tbl),%xmm11
  2130. ___
  2131. $code.=<<___;
  2132. mov -48(%rsi),%r15
  2133. .cfi_restore %r15
  2134. mov -40(%rsi),%r14
  2135. .cfi_restore %r14
  2136. mov -32(%rsi),%r13
  2137. .cfi_restore %r13
  2138. mov -24(%rsi),%r12
  2139. .cfi_restore %r12
  2140. mov -16(%rsi),%rbp
  2141. .cfi_restore %rbp
  2142. mov -8(%rsi),%rbx
  2143. .cfi_restore %rbx
  2144. lea (%rsi),%rsp
  2145. .cfi_def_cfa_register %rsp
  2146. .Lepilogue_avx2:
  2147. ret
  2148. .cfi_endproc
  2149. .size ${func}_avx2,.-${func}_avx2
  2150. ___
  2151. }}
  2152. }}}}}
  2153. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  2154. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  2155. if ($win64) {
  2156. $rec="%rcx";
  2157. $frame="%rdx";
  2158. $context="%r8";
  2159. $disp="%r9";
  2160. $code.=<<___;
  2161. .extern __imp_RtlVirtualUnwind
  2162. .type se_handler,\@abi-omnipotent
  2163. .align 16
  2164. se_handler:
  2165. push %rsi
  2166. push %rdi
  2167. push %rbx
  2168. push %rbp
  2169. push %r12
  2170. push %r13
  2171. push %r14
  2172. push %r15
  2173. pushfq
  2174. sub \$64,%rsp
  2175. mov 120($context),%rax # pull context->Rax
  2176. mov 248($context),%rbx # pull context->Rip
  2177. mov 8($disp),%rsi # disp->ImageBase
2178. mov 56($disp),%r11 # disp->HandlerData
  2179. mov 0(%r11),%r10d # HandlerData[0]
  2180. lea (%rsi,%r10),%r10 # prologue label
  2181. cmp %r10,%rbx # context->Rip<prologue label
  2182. jb .Lin_prologue
  2183. mov 152($context),%rax # pull context->Rsp
  2184. mov 4(%r11),%r10d # HandlerData[1]
  2185. lea (%rsi,%r10),%r10 # epilogue label
  2186. cmp %r10,%rbx # context->Rip>=epilogue label
  2187. jae .Lin_prologue
  2188. ___
  2189. $code.=<<___ if ($avx>1);
  2190. lea .Lavx2_shortcut(%rip),%r10
  2191. cmp %r10,%rbx # context->Rip<avx2_shortcut
  2192. jb .Lnot_in_avx2
  2193. and \$-256*$SZ,%rax
  2194. add \$`2*$SZ*($rounds-8)`,%rax
  2195. .Lnot_in_avx2:
  2196. ___
  2197. $code.=<<___;
  2198. mov %rax,%rsi # put aside Rsp
  2199. mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
  2200. mov -8(%rax),%rbx
  2201. mov -16(%rax),%rbp
  2202. mov -24(%rax),%r12
  2203. mov -32(%rax),%r13
  2204. mov -40(%rax),%r14
  2205. mov -48(%rax),%r15
  2206. mov %rbx,144($context) # restore context->Rbx
  2207. mov %rbp,160($context) # restore context->Rbp
  2208. mov %r12,216($context) # restore context->R12
  2209. mov %r13,224($context) # restore context->R13
  2210. mov %r14,232($context) # restore context->R14
  2211. mov %r15,240($context) # restore context->R15
  2212. lea .Lepilogue(%rip),%r10
  2213. cmp %r10,%rbx
  2214. jb .Lin_prologue # non-AVX code
2215. lea 16*$SZ+4*8(%rsi),%rsi # Xmm6 save area
  2216. lea 512($context),%rdi # &context.Xmm6
  2217. mov \$`$SZ==4?8:12`,%ecx
  2218. .long 0xa548f3fc # cld; rep movsq
  2219. .Lin_prologue:
  2220. mov 8(%rax),%rdi
  2221. mov 16(%rax),%rsi
  2222. mov %rax,152($context) # restore context->Rsp
  2223. mov %rsi,168($context) # restore context->Rsi
  2224. mov %rdi,176($context) # restore context->Rdi
  2225. mov 40($disp),%rdi # disp->ContextRecord
  2226. mov $context,%rsi # context
  2227. mov \$154,%ecx # sizeof(CONTEXT)
  2228. .long 0xa548f3fc # cld; rep movsq
  2229. mov $disp,%rsi
  2230. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  2231. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  2232. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  2233. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  2234. mov 40(%rsi),%r10 # disp->ContextRecord
  2235. lea 56(%rsi),%r11 # &disp->HandlerData
  2236. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  2237. mov %r10,32(%rsp) # arg5
  2238. mov %r11,40(%rsp) # arg6
  2239. mov %r12,48(%rsp) # arg7
  2240. mov %rcx,56(%rsp) # arg8, (NULL)
  2241. call *__imp_RtlVirtualUnwind(%rip)
  2242. mov \$1,%eax # ExceptionContinueSearch
  2243. add \$64,%rsp
  2244. popfq
  2245. pop %r15
  2246. pop %r14
  2247. pop %r13
  2248. pop %r12
  2249. pop %rbp
  2250. pop %rbx
  2251. pop %rdi
  2252. pop %rsi
  2253. ret
  2254. .size se_handler,.-se_handler
  2255. ___
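# se_handler tells RtlVirtualUnwind how to unwind a frame inside one of the
# routines above: HandlerData[0]/[1] (filled in with the .Lprologue*/
# .Lepilogue* RVAs in the .xdata section below) bracket the region where the
# frame is fully set up; inside it the saved GPRs are recovered through the
# $_rsp slot at 16*$SZ+3*8 above the frame, with an extra fix-up for the
# AVX2 path whose %rsp was re-aligned, and the SIMD flavours additionally
# copy the stashed non-volatile %xmm registers back into the CONTEXT record.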
  2256. $code.=<<___ if ($SZ==4 && $shaext);
  2257. .type shaext_handler,\@abi-omnipotent
  2258. .align 16
  2259. shaext_handler:
  2260. push %rsi
  2261. push %rdi
  2262. push %rbx
  2263. push %rbp
  2264. push %r12
  2265. push %r13
  2266. push %r14
  2267. push %r15
  2268. pushfq
  2269. sub \$64,%rsp
  2270. mov 120($context),%rax # pull context->Rax
  2271. mov 248($context),%rbx # pull context->Rip
  2272. lea .Lprologue_shaext(%rip),%r10
2273. cmp %r10,%rbx # context->Rip<.Lprologue_shaext
  2274. jb .Lin_prologue
  2275. lea .Lepilogue_shaext(%rip),%r10
2276. cmp %r10,%rbx # context->Rip>=.Lepilogue_shaext
  2277. jae .Lin_prologue
  2278. lea -8-5*16(%rax),%rsi
  2279. lea 512($context),%rdi # &context.Xmm6
  2280. mov \$10,%ecx
  2281. .long 0xa548f3fc # cld; rep movsq
  2282. jmp .Lin_prologue
  2283. .size shaext_handler,.-shaext_handler
  2284. ___
  2285. $code.=<<___;
  2286. .section .pdata
  2287. .align 4
  2288. .rva .LSEH_begin_$func
  2289. .rva .LSEH_end_$func
  2290. .rva .LSEH_info_$func
  2291. ___
  2292. $code.=<<___ if ($SZ==4 && $shaext);
  2293. .rva .LSEH_begin_${func}_shaext
  2294. .rva .LSEH_end_${func}_shaext
  2295. .rva .LSEH_info_${func}_shaext
  2296. ___
  2297. $code.=<<___ if ($SZ==4);
  2298. .rva .LSEH_begin_${func}_ssse3
  2299. .rva .LSEH_end_${func}_ssse3
  2300. .rva .LSEH_info_${func}_ssse3
  2301. ___
  2302. $code.=<<___ if ($avx && $SZ==8);
  2303. .rva .LSEH_begin_${func}_xop
  2304. .rva .LSEH_end_${func}_xop
  2305. .rva .LSEH_info_${func}_xop
  2306. ___
  2307. $code.=<<___ if ($avx);
  2308. .rva .LSEH_begin_${func}_avx
  2309. .rva .LSEH_end_${func}_avx
  2310. .rva .LSEH_info_${func}_avx
  2311. ___
  2312. $code.=<<___ if ($avx>1);
  2313. .rva .LSEH_begin_${func}_avx2
  2314. .rva .LSEH_end_${func}_avx2
  2315. .rva .LSEH_info_${func}_avx2
  2316. ___
  2317. $code.=<<___;
  2318. .section .xdata
  2319. .align 8
  2320. .LSEH_info_$func:
  2321. .byte 9,0,0,0
  2322. .rva se_handler
  2323. .rva .Lprologue,.Lepilogue # HandlerData[]
  2324. ___
  2325. $code.=<<___ if ($SZ==4 && $shaext);
  2326. .LSEH_info_${func}_shaext:
  2327. .byte 9,0,0,0
  2328. .rva shaext_handler
  2329. ___
  2330. $code.=<<___ if ($SZ==4);
  2331. .LSEH_info_${func}_ssse3:
  2332. .byte 9,0,0,0
  2333. .rva se_handler
  2334. .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
  2335. ___
  2336. $code.=<<___ if ($avx && $SZ==8);
  2337. .LSEH_info_${func}_xop:
  2338. .byte 9,0,0,0
  2339. .rva se_handler
  2340. .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
  2341. ___
  2342. $code.=<<___ if ($avx);
  2343. .LSEH_info_${func}_avx:
  2344. .byte 9,0,0,0
  2345. .rva se_handler
  2346. .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
  2347. ___
  2348. $code.=<<___ if ($avx>1);
  2349. .LSEH_info_${func}_avx2:
  2350. .byte 9,0,0,0
  2351. .rva se_handler
  2352. .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
  2353. ___
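# Each UNWIND_INFO above starts with .byte 9,0,0,0: version 1 with
# UNW_FLAG_EHANDLER set ((1<<3)|1), a zero-length prologue and no unwind
# codes, so unwinding is delegated entirely to the language handler whose
# RVA follows; the trailing .rva pairs (where present) are the HandlerData[]
# that se_handler reads to locate the prologue and epilogue labels.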
  2354. }
  2355. sub sha256op38 {
  2356. my $instr = shift;
  2357. my %opcodelet = (
  2358. "sha256rnds2" => 0xcb,
  2359. "sha256msg1" => 0xcc,
  2360. "sha256msg2" => 0xcd );
2361. if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
  2362. my @opcode=(0x0f,0x38);
  2363. push @opcode,$opcodelet{$instr};
  2364. push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
  2365. return ".byte\t".join(',',@opcode);
  2366. } else {
2367. return $instr."\t".$_[0];
  2368. }
  2369. }
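# sha256op38() lets the output assemble with tools that predate the SHA
# extension mnemonics by emitting raw opcode bytes.  For example,
# "sha256rnds2 %xmm0,%xmm1" becomes ".byte 15,56,203,200", i.e.
# 0x0f,0x38,0xcb plus ModR/M 0xc8 (mod=11, reg=destination xmm1,
# r/m=source xmm0).  Operands outside %xmm0-7, or memory operands, fall
# through and are passed to the assembler verbatim.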
  2370. foreach (split("\n",$code)) {
  2371. s/\`([^\`]*)\`/eval $1/geo;
  2372. s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
  2373. print $_,"\n";
  2374. }
2375. close STDOUT or die "error closing STDOUT: $!";