#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just straight implementation... I really wonder why
# gcc [being armed with inline assembler] fails to generate as fast
# code. The only thing which is cool about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, in
# the latter on 64-bit ones. All I had to do was get one flavor right;
# the other one passed the test right away :-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you an
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is a very good result for the
# 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
# there is a way to improve it, *then* the only way would be to try to
# offload the X[16] updates to the SSE unit, but that would require a
# "deeper" loop unroll, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And once again, only
# *if* it's actually possible to noticeably improve overall ILP,
# instruction level parallelism, on a given CPU implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block :-( This is presumably because 64-bit
# shifts/rotates are not atomic instructions, but are implemented in
# microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in a >=5% improvement on most CPUs: +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
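#
# For reference, the "alternative Maj" below relies on the identity
#
#	Maj(a,b,c) = (a&b) ^ (a&c) ^ (b&c)
#	           = Ch(a^b, c, b) = ((a^b) & (b^c)) ^ b
#
# (an informal sketch, easily checked case by case), which lets each
# round reuse the a^b value left over from the previous round as its
# b^c, effectively saving an instruction per round.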
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. The SSSE3
# code path was not attempted for SHA512, because the improvement is not
# estimated to be high enough, noticeably less than 9%, to justify the
# effort, at least not on pre-AVX processors. [The obvious exclusion is
# VIA Nano, but it has a SHA512 instruction that is faster and should be
# used instead.] For reference, the corresponding estimated upper limit
# for improvement for SSSE3 SHA256 is 28%. The fact that higher
# coefficients are observed on VIA Nano and Bulldozer has more to do
# with the specifics of their architecture [which is a topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD instruction
# sequence as for AVX, but with %ymm registers as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256 and
# 1152 in SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance in cycles per processed byte (less is better):
#
#               SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8        14.9    -           -               9.57    -
# P4            17.3    -           -               30.8    -
# Core 2        15.6    13.8(+13%)  -               9.97    -
# Westmere      14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake       11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen         11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano      23.0    16.5(+39%)  -               14.7    -
# Atom          23.0    18.9(+22%)  -               14.7    -
# Silvermont    27.4    20.6(+33%)  -               17.5    -
# Knights L     27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont      18.9    14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)   whichever best applicable, including SHAEXT;
# (**)  switch from ror to shrd stands for fair share of improvement;
# (***) execution time is fully determined by remaining integer-only
#       part, body_00_15; reducing the amount of SIMD instructions
#       below certain limit makes no difference/sense; to conserve
#       space SHA256 XOP code path is therefore omitted;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
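
# Typical invocation (illustrative only, not taken from the build
# system): the flavour selects the perlasm output dialect and the final
# argument names the generated file, e.g.
#
#	perl sha512-x86_64.pl elf sha512-x86_64.s
#	perl sha512-x86_64.pl nasm sha256-x86_64.asm
#
# Whether SHA-256 or SHA-512 code is generated is decided from the
# output file name, see the $output =~ /512/ test below.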

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1

$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}
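
# The @Sigma*/@sigma* arrays hold the rotate/shift amounts of the
# FIPS 180-4 functions; informally, with the SHA-256 values above:
#
#	Sigma0(x) = ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)
#	Sigma1(x) = ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)
#	sigma0(x) = ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3)
#	sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)
#
# The code below applies the rotates as differences (e.g. 22-13, then
# 13-2, then 2) so that a single register can be rotated in place and
# xor'ed at each step.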

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";
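
# Roughly, the integer-only code keeps the following on the stack
# (offsets from the aligned %rsp):
#
#	0 .. 16*$SZ-1	X[0..15], the rolling message schedule window
#	16*$SZ + 0*8	saved ctx pointer    ($_ctx)
#	16*$SZ + 1*8	saved inp pointer    ($_inp)
#	16*$SZ + 2*8	end-of-input pointer ($_end)
#	16*$SZ + 3*8	caller's %rsp        ($_rsp, also used for CFI)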

sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2
	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g
	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e
	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)
	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1
	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h
	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)
	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1
	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}
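
# In FIPS 180-4 terms the code above computes, per round i (sketch):
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	h  = Sigma0(a) + Maj(a,b,c)
#	d += T1
#	h += T1
#
# with the caller then renaming (a,b,...,h) to (h,a,...,g) via
# unshift(@ROT,pop(@ROT)). The final "h += Sigma0(a)" is modulo-scheduled
# into the following round (or added explicitly after the loop), which is
# why it is only emitted here for $i<15.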

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1
	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}
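
# Message schedule expansion for rounds 16 and up, kept in a 16-entry
# circular buffer on the stack; in outline:
#
#	X[i&15] += sigma0(X[(i+1)&15]) + X[(i+9)&15] + sigma1(X[(i+14)&15])
#
# which is the usual W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] +
# sigma1(W[i-2]) recurrence with all indices reduced modulo 16.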

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
	jnz	.Lxop_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
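
# For orientation (matching the comments above): the dispatcher reads the
# three OPENSSL_ia32cap_P words and tests, among others, bit 30 of word 0
# ("Intel CPU"), bits 9 (SSSE3), 11 (XOP) and 28 (AVX) of word 1, and
# bits 3/5/8 (BMI1/AVX2/BMI2) plus bit 29 (SHA extensions) of word 2.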
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
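
# A note on loop control: the .Lrounds_16_xx block above spells out only
# 16 rounds and is then re-entered; $Tbl advances by one K entry per
# round. The cmpb against byte $SZ-1 of ($Tbl) below keeps looping while
# the most significant byte of the next table entry is non-zero; all K[]
# constants happen to have a non-zero top byte, while the shuffle masks
# appended after them start with a zero byte there, so the loop ends
# after exactly $rounds rounds.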
$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	$func,.-$func
___

if ($SZ==4) {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
} else {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
}
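
# Layout note: every K constant above appears twice in a row so that the
# AVX2 code path, which keeps two input blocks in the two 128-bit halves
# of a %ymm register, can fetch the same constants into both halves with
# a single 256-bit load; the scalar and 128-bit paths simply step over
# the duplicates. The trailing non-K words are the byte-swap and shuffle
# masks used by the SIMD code paths.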

######################################################################
# SIMD code paths
#
if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	sha256_block_data_order_shaext,\@function,3
.align	64
sha256_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
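
# Rough register convention for the code below: the eight state words are
# kept as two halves, $ABEF = {A,B,E,F} and $CDGH = {C,D,G,H}, which is
# the layout sha256rnds2 works on. Each sha256rnds2 performs two rounds
# and implicitly takes its two message+constant words from the low 64
# bits of %xmm0 ($Wi); the pshufd \$0x0e shuffles move the next pair of
# words down before the second sha256rnds2 of each four-round group.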
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$TMP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$TMP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	nop
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	pshufb	$TMP,@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	lea	0x40($inp),$inp
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	pshufb	$TMP,@MSG[3]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[3],$TMP
	palignr	\$4,@MSG[2],$TMP
	nop
	paddd	$TMP,@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	paddd	@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[0],$TMP
	palignr	\$4,@MSG[3],$TMP
	nop
	paddd	$TMP,@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$i*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	nop
	paddd	$TMP,@MSG[2]
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa	13*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	paddd	$TMP,@MSG[2]

	movdqa	14*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa	$BSWAP,$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	nop
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	dec	$num
	nop
	sha256rnds2	$CDGH,$ABEF

	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	jnz	.Loop_shaext

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
}}}

{{{
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
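
# With this thunk, any otherwise undefined call such as &ror($a0,14) or
# &mov($a,$a1) simply appends the corresponding instruction text to
# $code (numeric arguments get a "$" immediate prefix), which is what
# lets the round bodies below be written as lists of small Perl snippets
# and emitted one instruction at a time.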

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a0,$e)',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e
	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
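
# body_00_15() expresses the same round as ROUND_00_15 above, but as a
# list of strings that are eval'ed one at a time; the SIMD code paths
# below pull instructions off this list in between their own vector
# instructions, so the integer round work and the vector message
# schedule update overlap.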

######################################################################
# SSSE3 code path
#
if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

$code.=<<___;
.type	${func}_ssse3,\@function,3
.align	64
${func}_ssse3:
.cfi_startproc
.Lssse3_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___;
.Lprologue_ssse3:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___

$code.=<<___;
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Lloop_ssse3
.align	16
.Lloop_ssse3:
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	pshufb	$t3,@X[0]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	pshufb	$t3,@X[1]
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	pshufb	$t3,@X[2]
	paddd	@X[0],$t0
	movdqa	0x40($Tbl),$t2
	pshufb	$t3,@X[3]
	movdqa	0x60($Tbl),$t3
	paddd	@X[1],$t1
	paddd	@X[2],$t2
	paddd	@X[3],$t3
	movdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	movdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	movdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	movdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lssse3_00_47

.align	16
.Lssse3_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
sub Xupdate_256_SSSE3 () {
	(
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t1,$t0)',
	'&movdqa	($t2,$t0);',
	'&psrld		($t0,$sigma0[2])',
	'&paddd		(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld		($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
	'&pxor		($t0,$t2)',
	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t1)',
	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t2);',
	'&movdqa	($t2,$t3)',
	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
	'&psrld		($t3,$sigma1[2])',
	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq		($t2,$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pxor		($t3,$t2)',
	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld		($t3,$sigma1[2])',
	'&psrlq		($t2,$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&movdqa	($t2,16*2*$j."($Tbl)")',
	'&pshufb	($t3,$t5)',
	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
	);
}
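
# Xupdate_256_SSSE3 advances the message schedule four words at a time.
# SSE has no packed rotate, so each ROTR in sigma0/sigma1 is synthesised
# from a shift-left/shift-right/pxor pair; in outline (mirroring the
# comments above):
#
#	X[0..3] += sigma0(X[1..4]) + X[9..12]
#	X[0..1] += sigma1(X[14..15]);  X[2..3] += sigma1(X[16..17])
#
# i.e. the scalar recurrence from ROUND_16_XX applied to four counters
# at once, with the last two lanes using the words just computed.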

sub SSSE3_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

    if (0) {
	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
	  eval;
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	}
    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	  eval(shift(@insns));	#@
	&movdqa		($t0,@X[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t3,@X[3]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t1,$t0);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,$t0);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t2,$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pshufd		($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&pslld		($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	&pxor		($t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pslld		($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t1);	# sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	#&pshufb	($t3,$t4);	# sigma1(X[14..15])
	&pshufd		($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrldq		($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pshufd		($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&movdqa		($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	#&pshufb	($t3,$t5);
	&pshufd		($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&pslldq		($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
    }
	&paddd		($t2,@X[0]);
	foreach (@insns) { eval; }	# remaining instructions
	&movdqa		(16*$j."(%rsp)",$t2);
}

for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
}
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
}

$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_ssse3

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	${func}_ssse3,.-${func}_ssse3
___
}

if ($avx) {{
######################################################################
# XOP code path
#
if ($SZ==8) {	# SHA512 only
$code.=<<___;
.type	${func}_xop,\@function,3
.align	64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_xop:

	vzeroupper
	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop_xop
___
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
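
# Note: thanks to XOP's vprotd/vprotq (true packed rotates), the rotates
# inside sigma0/sigma1 are performed directly here instead of being
# synthesised from shift-left/shift-right/pxor pairs as in the SSSE3
# path above, which is what makes this variant shorter.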

for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
}
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
}

} else {	# SHA512
    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x50(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x60(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x70(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	add	\$`16*2*$SZ`,$Tbl
___
sub XOP_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 52 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrlq		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t3,@X[7],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrlq		($t2,@X[7],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

for ($i=0,$j=0; $j<8; $j++) {
	&XOP_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
}
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lxop_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
}
}
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_xop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_xop:
	ret
.cfi_endproc
.size	${func}_xop,.-${func}_xop
___
}
  1433. ######################################################################
  1434. # AVX+shrd code path
  1435. #
  1436. local *ror = sub { &shrd(@_[0],@_) };
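# Note: body_00_15 is reused here with ror aliased to shrd; when both
# register operands are the same, "shrd $n,%reg,%reg" performs the same
# right-rotate as "ror $n,%reg" (presumably the preferred encoding on the
# CPUs this path targets).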
$code.=<<___;
.type ${func}_avx,\@function,3
.align 64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
and \$-64,%rsp # align stack frame
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %rax,$_rsp # save copy of %rsp
.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
movaps %xmm7,16*$SZ+48(%rsp)
movaps %xmm8,16*$SZ+64(%rsp)
movaps %xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
movaps %xmm10,16*$SZ+96(%rsp)
movaps %xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx:
vzeroupper
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
if ($SZ==4) { # SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___;
vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Lloop_avx
.align 16
.Lloop_avx:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vmovdqu 0x30($inp),@X[3]
vpshufb $t3,@X[0],@X[0]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
mov $A,$a1
vmovdqa $t1,0x10(%rsp)
mov $B,$a3
vmovdqa $t2,0x20(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x30(%rsp)
mov $E,$a0
jmp .Lavx_00_47
.align 16
.Lavx_00_47:
sub \$`-16*2*$SZ`,$Tbl # size optimization
___
sub Xupdate_256_AVX () {
(
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
'&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
'&vpsrld ($t2,$t0,$sigma0[0]);',
'&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
'&vpsrld ($t3,$t0,$sigma0[2])',
'&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
'&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrld ($t2,$t3,$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
'&vpsrlq ($t3,$t3,$sigma1[0]);',
'&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
'&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
'&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
'&vpsrld ($t2,$t3,$sigma1[2])',
'&vpsrlq ($t3,$t3,$sigma1[0])',
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufb ($t2,$t2,$t5)',
'&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
);
}
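# Reference note: assuming @sigma0/@sigma1 hold the FIPS 180-4 SHA-256
# constants (7,18,3) and (17,19,10), the sequence above computes
#   sigma0(x) = ROTR^7(x)  ^ ROTR^18(x) ^ SHR^3(x)
#   sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
# AVX has no 32-bit vector rotate, so sigma0 is assembled from
# vpslld/vpsrld/vpxor pairs, while sigma1 is evaluated two lanes at a time
# with 64-bit shifts (vpsrlq) and compacted back with vpshufb ($t4/$t5).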
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
foreach (Xupdate_256_AVX()) { # 29 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
&AVX_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lavx_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
} else { # SHA512
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
$code.=<<___;
jmp .Lloop_avx
.align 16
.Lloop_avx:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0]
vmovdqu 0x30($inp),@X[3]
vpshufb $t3,@X[1],@X[1]
vmovdqu 0x40($inp),@X[4]
vpshufb $t3,@X[2],@X[2]
vmovdqu 0x50($inp),@X[5]
vpshufb $t3,@X[3],@X[3]
vmovdqu 0x60($inp),@X[6]
vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp)
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp)
mov $A,$a1
vmovdqa $t1,0x50(%rsp)
mov $B,$a3
vmovdqa $t2,0x60(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x70(%rsp)
mov $E,$a0
jmp .Lavx_00_47
.align 16
.Lavx_00_47:
add \$`16*2*$SZ`,$Tbl
___
sub Xupdate_512_AVX () {
(
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
'&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
'&vpsrlq ($t2,$t0,$sigma0[0])',
'&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
'&vpsrlq ($t3,$t0,$sigma0[2])',
'&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrlq ($t3,@X[7],$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
'&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
'&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
'&vpsrlq ($t1,@X[7],$sigma1[0]);',
'&vpxor ($t3,$t3,$t2)',
'&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
'&vpxor ($t3,$t3,$t1)',
'&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
'&vpxor ($t3,$t3,$t2)',
'&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
'&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
);
}
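# Reference note: same sigma0/sigma1 as the XOP path (ROTR 1/8, SHR 7 and
# ROTR 19/61, SHR 6 for SHA-512), but without vprotq each rotate is
# synthesized as (x >> r) ^ (x << (64-r)) using vpsrlq/vpsllq/vpxor.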
sub AVX_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body); # 52 instructions
foreach (Xupdate_512_AVX()) { # 23 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<8; $j++) {
&AVX_512_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
&jne (".Lavx_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
}
$code.=<<___;
mov $_ctx,$ctx
mov $a1,$A
add $SZ*0($ctx),$A
lea 16*$SZ($inp),$inp
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop_avx
mov $_rsp,%rsi
.cfi_def_cfa %rsi,8
vzeroupper
___
$code.=<<___ if ($win64);
movaps 16*$SZ+32(%rsp),%xmm6
movaps 16*$SZ+48(%rsp),%xmm7
movaps 16*$SZ+64(%rsp),%xmm8
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+96(%rsp),%xmm10
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
.cfi_endproc
.size ${func}_avx,.-${func}_avx
___
if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
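# The AVX2 path processes two consecutive blocks per iteration: the second
# block (%r12, which points either at the next input block or, for the last
# block, back at the stack as harmless dummy data) is folded into the upper
# 128-bit lanes with vinserti128, so every ymm register carries X[i] for
# both blocks and each group of 8 rounds consumes $PUSH8 (= 8*2*$SZ) bytes
# of pre-added X[]+K[] from the moving stack frame.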
sub bodyx_00_15 () {
# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
'&and ($a4,$e)', # f&e
'&rorx ($a0,$e,$Sigma1[2])',
'&rorx ($a2,$e,$Sigma1[1])',
'&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
'&lea ($h,"($h,$a4)")',
'&andn ($a4,$e,$g)', # ~e&g
'&xor ($a0,$a2)',
'&rorx ($a1,$e,$Sigma1[0])',
'&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
'&xor ($a0,$a1)', # Sigma1(e)
'&mov ($a2,$a)',
'&rorx ($a4,$a,$Sigma0[2])',
'&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
'&xor ($a2,$b)', # a^b, b^c in next round
'&rorx ($a1,$a,$Sigma0[1])',
'&rorx ($a0,$a,$Sigma0[0])',
'&lea ($d,"($d,$h)")', # d+=h
'&and ($a3,$a2)', # (b^c)&(a^b)
'&xor ($a1,$a4)',
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&xor ($a1,$a0)', # Sigma0(a)
'&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
'&mov ($a4,$e)', # copy of f in future
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
);
# and at the finish one has to $a+=$a1
}
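# Reference note: bodyx_00_15 is one SHA round in BMI1/BMI2 form,
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#   h  = T1 + Sigma0(a) + Maj(a,b,c),   d += T1
# rorx supplies non-destructive rotates for Sigma0/Sigma1, andn supplies
# ~e&g for Ch(e,f,g) = (e&f)^(~e&g), and Maj(a,b,c) is computed as
# Ch(a^b,c,b) from the b^c value carried over in $a3.  Sigma0(a) is left
# in $a1 and only folded in at the start of the next round, which is why
# the caller must do the final $a+=$a1 noted above.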
$code.=<<___;
.type ${func}_avx2,\@function,3
.align 64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
shl \$4,%rdx # num*16
and \$-256*$SZ,%rsp # align stack frame
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
add \$`2*$SZ*($rounds-8)`,%rsp
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %rax,$_rsp # save copy of %rsp
.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
movaps %xmm7,16*$SZ+48(%rsp)
movaps %xmm8,16*$SZ+64(%rsp)
movaps %xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
movaps %xmm10,16*$SZ+96(%rsp)
movaps %xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
vzeroupper
sub \$-16*$SZ,$inp # inp++, size optimization
mov $SZ*0($ctx),$A
mov $inp,%r12 # borrow $T1
mov $SZ*1($ctx),$B
cmp %rdx,$inp # $_end
mov $SZ*2($ctx),$C
cmove %rsp,%r12 # next block or random data
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
if ($SZ==4) { # SHA256
my @X = map("%ymm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
$code.=<<___;
vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Loop_avx2
.align 16
.Loop_avx2:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu -16*$SZ+0($inp),%xmm0
vmovdqu -16*$SZ+16($inp),%xmm1
vmovdqu -16*$SZ+32($inp),%xmm2
vmovdqu -16*$SZ+48($inp),%xmm3
#mov $inp,$_inp # offload $inp
vinserti128 \$1,(%r12),@X[0],@X[0]
vinserti128 \$1,16(%r12),@X[1],@X[1]
vpshufb $t3,@X[0],@X[0]
vinserti128 \$1,32(%r12),@X[2],@X[2]
vpshufb $t3,@X[1],@X[1]
vinserti128 \$1,48(%r12),@X[3],@X[3]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
mov $_rsp,%rdi
.cfi_def_cfa %rdi,8
___
$code.=<<___;
lea -$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
mov %rdi,-8(%rsp)
.cfi_cfa_expression %rsp-8,deref,+8
___
$code.=<<___;
mov $B,$a3
vmovdqa $t2,0x00(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x20(%rsp)
mov $F,$a4
sub \$-16*2*$SZ,$Tbl # size optimization
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
my $base = "+2*$PUSH8(%rsp)";
if (($j%2)==0) {
&lea ("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
pushq $PUSH8-8(%rsp)
.cfi_cfa_expression %rsp,deref,+8
lea 8(%rsp),%rsp
.cfi_cfa_expression %rsp-8,deref,+8
___
}
foreach (Xupdate_256_AVX()) { # 29 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
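# Note on the moving frame: every second call (every fourth in the SHA-512
# variant below) slides %rsp down by $PUSH8, so fresh X[]+K[] values are
# always written near %rsp while the previous groups stay reachable at
# +$PUSH8/+2*$PUSH8 via $base; on non-Windows builds the secondary frame
# pointer is re-saved at -8(%rsp) each time so the CFI keeps tracking the
# original %rsp copy.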
for ($i=0,$j=0; $j<4; $j++) {
&AVX2_256_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1)."($Tbl)",0);
&jne (".Lavx2_00_47");
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
}
} else { # SHA512
my @X = map("%ymm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
$code.=<<___;
jmp .Loop_avx2
.align 16
.Loop_avx2:
vmovdqu -16*$SZ($inp),%xmm0
vmovdqu -16*$SZ+16($inp),%xmm1
vmovdqu -16*$SZ+32($inp),%xmm2
lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu -16*$SZ+48($inp),%xmm3
vmovdqu -16*$SZ+64($inp),%xmm4
vmovdqu -16*$SZ+80($inp),%xmm5
vmovdqu -16*$SZ+96($inp),%xmm6
vmovdqu -16*$SZ+112($inp),%xmm7
#mov $inp,$_inp # offload $inp
vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
vinserti128 \$1,(%r12),@X[0],@X[0]
vinserti128 \$1,16(%r12),@X[1],@X[1]
vpshufb $t2,@X[0],@X[0]
vinserti128 \$1,32(%r12),@X[2],@X[2]
vpshufb $t2,@X[1],@X[1]
vinserti128 \$1,48(%r12),@X[3],@X[3]
vpshufb $t2,@X[2],@X[2]
vinserti128 \$1,64(%r12),@X[4],@X[4]
vpshufb $t2,@X[3],@X[3]
vinserti128 \$1,80(%r12),@X[5],@X[5]
vpshufb $t2,@X[4],@X[4]
vinserti128 \$1,96(%r12),@X[6],@X[6]
vpshufb $t2,@X[5],@X[5]
vinserti128 \$1,112(%r12),@X[7],@X[7]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t2,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t2,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x20(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x40(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x60(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
mov $_rsp,%rdi
.cfi_def_cfa %rdi,8
___
$code.=<<___;
lea -$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
mov %rdi,-8(%rsp)
.cfi_cfa_expression %rsp-8,deref,+8
___
$code.=<<___;
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
mov $B,$a3
vmovdqa $t2,0x40(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x60(%rsp)
mov $F,$a4
add \$16*2*$SZ,$Tbl
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body); # 48 instructions
my $base = "+2*$PUSH8(%rsp)";
if (($j%4)==0) {
&lea ("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
pushq $PUSH8-8(%rsp)
.cfi_cfa_expression %rsp,deref,+8
lea 8(%rsp),%rsp
.cfi_cfa_expression %rsp-8,deref,+8
___
}
foreach (Xupdate_512_AVX()) { # 23 instructions
eval;
if ($_ !~ /\;$/) {
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
}
&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<8; $j++) {
&AVX2_512_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1-0x80)."($Tbl)",0);
&jne (".Lavx2_00_47");
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
}
}
$code.=<<___;
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
cmp `$PUSH8+2*8`($Tbl),$inp # $_end
je .Ldone_avx2
xor $a1,$a1
mov $B,$a3
xor $C,$a3 # magic
mov $F,$a4
jmp .Lower_avx2
.align 16
.Lower_avx2:
___
for ($i=0; $i<8; ) {
my $base="+16($Tbl)";
foreach(bodyx_00_15()) { eval; }
}
$code.=<<___;
lea -$PUSH8($Tbl),$Tbl
cmp %rsp,$Tbl
jae .Lower_avx2
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),%rsp
# restore frame pointer to original location at $_rsp
.cfi_cfa_expression $_rsp,deref,+8
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
lea `2*16*$SZ`($inp),$inp # inp+=2
add $SZ*6($ctx),$G
mov $inp,%r12
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
cmove %rsp,%r12 # next block or stale data
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jbe .Loop_avx2
lea (%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8
.Ldone_avx2:
mov `16*$SZ+3*8`($Tbl),%rsi
.cfi_def_cfa %rsi,8
vzeroupper
___
$code.=<<___ if ($win64);
movaps 16*$SZ+32($Tbl),%xmm6
movaps 16*$SZ+48($Tbl),%xmm7
movaps 16*$SZ+64($Tbl),%xmm8
movaps 16*$SZ+80($Tbl),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+96($Tbl),%xmm10
movaps 16*$SZ+112($Tbl),%xmm11
___
$code.=<<___;
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
.cfi_endproc
.size ${func}_avx2,.-${func}_avx2
___
}}
}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
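# Rough outline (for reference): se_handler recovers the frame (for the
# AVX2 path it first re-derives the aligned %rsp, since that frame floats),
# loads the saved %rsp copy from 16*$SZ+3*8 off it, restores
# %rbx/%rbp/%r12-%r15 into the CONTEXT record, copies the xmm6-xmm9
# (plus xmm10/xmm11 for SHA-512) save area back when the fault is in one
# of the SIMD paths, and finally defers to RtlVirtualUnwind.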
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
___
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
jb .Lnot_in_avx2
and \$-256*$SZ,%rax
add \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx
jb .Lin_prologue # non-AVX code
lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
lea 512($context),%rdi # &context.Xmm6
mov \$`$SZ==4?8:12`,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
___
$code.=<<___ if ($SZ==4 && $shaext);
.type shaext_handler,\@abi-omnipotent
.align 16
shaext_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lin_prologue
lea .Lepilogue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lin_prologue
lea -8-5*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$10,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lin_prologue
.size shaext_handler,.-shaext_handler
___
$code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_$func
.rva .LSEH_end_$func
.rva .LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
.rva .LSEH_begin_${func}_shaext
.rva .LSEH_end_${func}_shaext
.rva .LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
.rva .LSEH_begin_${func}_ssse3
.rva .LSEH_end_${func}_ssse3
.rva .LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
.rva .LSEH_begin_${func}_xop
.rva .LSEH_end_${func}_xop
.rva .LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_${func}_avx
.rva .LSEH_end_${func}_avx
.rva .LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_${func}_avx2
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_$func:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue,.Lepilogue # HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
.byte 9,0,0,0
.rva shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
___
}
sub sha256op38 {
my $instr = shift;
my %opcodelet = (
"sha256rnds2" => 0xcb,
"sha256msg1" => 0xcc,
"sha256msg2" => 0xcd );
if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
my @opcode=(0x0f,0x38);
push @opcode,$opcodelet{$instr};
push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
} else {
return $instr."\t".@_[0];
}
}
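# Worked example (derived from the table above): a generated line such as
# "sha256rnds2 %xmm3,%xmm4" is rewritten by the loop below into
# ".byte 15,56,203,227", i.e. opcode 0F 38 CB plus ModR/M 0xe3
# (mod=11, reg/dst=4, rm/src=3), so the module still assembles with
# toolchains that predate the SHA extensions; anything not in %opcodelet,
# or not register-to-register, is passed through unchanged.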
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";