x86_64-mont5.pl

  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # August 2011.
  9. #
  10. # Companion to x86_64-mont.pl that optimizes cache-timing attack
  11. # countermeasures. The subroutines are produced by replacing bp[i]
  12. # references in their x86_64-mont.pl counterparts with cache-neutral
  13. # references to powers table computed in BN_mod_exp_mont_consttime.
  14. # In addition, a subroutine that scatters elements of the powers table
  15. # is implemented, so that scatter-/gathering can be tuned without
  16. # bn_exp.c modifications.
  17. # August 2013.
  18. #
  19. # Add MULX/AD*X code paths and additional interfaces to optimize for
  20. # branch prediction unit. For input lengths that are multiples of 8
  21. # the np argument is not just modulus value, but one interleaved
  22. # with 0. This is to optimize post-condition...
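#
# For orientation, a rough sketch of what bn_mul_mont_gather5() computes
# (standard word-by-word Montgomery multiplication; the gather of bp[i]
# from the powers table is described above, so this is only a sketch, not
# the exact control flow):
#
#	t = 0;				# num+1 word accumulator
#	for (i=0; i<num; i++) {
#		t += ap[] * bp[i];	# bp[i] gathered, not loaded directly
#		m  = (t[0]*n0) mod 2^64;
#		t += m * np[];		# forces t[0] to zero
#		t >>= 64;
#	}
#	rp[] = (t >= np) ? t - np : t;	# rp = ap*bp*2^(-64*num) mod np
#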
  23. $flavour = shift;
  24. $output = shift;
  25. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  26. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  27. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  28. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  29. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  30. die "can't locate x86_64-xlate.pl";
  31. open OUT,"| \"$^X\" $xlate $flavour $output";
  32. *STDOUT=*OUT;
  33. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  34. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  35. $addx = ($1>=2.23);
  36. }
  37. if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  38. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  39. $addx = ($1>=2.10);
  40. }
  41. if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  42. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  43. $addx = ($1>=12);
  44. }
  45. if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
  46. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  47. $addx = ($ver>=3.03);
  48. }
  49. # int bn_mul_mont_gather5(
  50. $rp="%rdi"; # BN_ULONG *rp,
  51. $ap="%rsi"; # const BN_ULONG *ap,
  52. $bp="%rdx"; # const BN_ULONG *bp,
  53. $np="%rcx"; # const BN_ULONG *np,
  54. $n0="%r8"; # const BN_ULONG *n0,
  55. $num="%r9"; # int num,
  56. # int idx); # 0 to 2^5-1, "index" in $bp holding
  57. # pre-computed powers of a', interlaced
  58. # in such manner that b[0] is $bp[idx],
  59. # b[1] is [2^5+idx], etc.
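# Layout sketch (window size 5, as used below): word j of the selected
# power lives at $bp[j*2^5 + idx], i.e. 32 BN_ULONGs (256 bytes) away from
# word j+1. Every gather therefore touches the same set of cache lines
# regardless of idx; only the mask decides which loaded word is kept.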
  60. $lo0="%r10";
  61. $hi0="%r11";
  62. $hi1="%r13";
  63. $i="%r14";
  64. $j="%r15";
  65. $m0="%rbx";
  66. $m1="%rbp";
  67. $code=<<___;
  68. .text
  69. .extern OPENSSL_ia32cap_P
  70. .globl bn_mul_mont_gather5
  71. .type bn_mul_mont_gather5,\@function,6
  72. .align 64
  73. bn_mul_mont_gather5:
  74. test \$7,${num}d
  75. jnz .Lmul_enter
  76. ___
  77. $code.=<<___ if ($addx);
  78. mov OPENSSL_ia32cap_P+8(%rip),%r11d
  79. ___
  80. $code.=<<___;
  81. jmp .Lmul4x_enter
  82. .align 16
  83. .Lmul_enter:
  84. mov ${num}d,${num}d
  85. mov %rsp,%rax
  86. mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
  87. push %rbx
  88. push %rbp
  89. push %r12
  90. push %r13
  91. push %r14
  92. push %r15
  93. ___
  94. $code.=<<___ if ($win64);
  95. lea -0x28(%rsp),%rsp
  96. movaps %xmm6,(%rsp)
  97. movaps %xmm7,0x10(%rsp)
  98. ___
  99. $code.=<<___;
  100. lea 2($num),%r11
  101. neg %r11
  102. lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
  103. and \$-1024,%rsp # minimize TLB usage
  104. mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
  105. .Lmul_body:
  106. mov $bp,%r12 # reassign $bp
  107. ___
  108. $bp="%r12";
  109. $STRIDE=2**5*8; # 5 is "window size"
  110. $N=$STRIDE/4; # should match cache line size
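# With the window size of 5, one "row" of the table (word j of each of the
# 2^5 powers) spans $STRIDE = 256 bytes, i.e. 4 cache lines of $N/8 = 8
# quadwords each (64-byte lines assumed, matching $N above). The gather
# below always loads one quadword from each of those 4 lines and uses the
# .Lmagic_masks entries to keep only the line that actually holds the word
# selected by idx, so the memory access pattern at cache-line granularity
# does not depend on the secret index.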
  111. $code.=<<___;
  112. mov %r10,%r11
  113. shr \$`log($N/8)/log(2)`,%r10
  114. and \$`$N/8-1`,%r11
  115. not %r10
  116. lea .Lmagic_masks(%rip),%rax
  117. and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
  118. lea 96($bp,%r11,8),$bp # pointer within 1st cache line
  119. movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
  120. movq 8(%rax,%r10,8),%xmm5 # cache line contains element
  121. movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
  122. movq 24(%rax,%r10,8),%xmm7
  123. movq `0*$STRIDE/4-96`($bp),%xmm0
  124. movq `1*$STRIDE/4-96`($bp),%xmm1
  125. pand %xmm4,%xmm0
  126. movq `2*$STRIDE/4-96`($bp),%xmm2
  127. pand %xmm5,%xmm1
  128. movq `3*$STRIDE/4-96`($bp),%xmm3
  129. pand %xmm6,%xmm2
  130. por %xmm1,%xmm0
  131. pand %xmm7,%xmm3
  132. por %xmm2,%xmm0
  133. lea $STRIDE($bp),$bp
  134. por %xmm3,%xmm0
  135. movq %xmm0,$m0 # m0=bp[0]
  136. mov ($n0),$n0 # pull n0[0] value
  137. mov ($ap),%rax
  138. xor $i,$i # i=0
  139. xor $j,$j # j=0
  140. movq `0*$STRIDE/4-96`($bp),%xmm0
  141. movq `1*$STRIDE/4-96`($bp),%xmm1
  142. pand %xmm4,%xmm0
  143. movq `2*$STRIDE/4-96`($bp),%xmm2
  144. pand %xmm5,%xmm1
  145. mov $n0,$m1
  146. mulq $m0 # ap[0]*bp[0]
  147. mov %rax,$lo0
  148. mov ($np),%rax
  149. movq `3*$STRIDE/4-96`($bp),%xmm3
  150. pand %xmm6,%xmm2
  151. por %xmm1,%xmm0
  152. pand %xmm7,%xmm3
  153. imulq $lo0,$m1 # "tp[0]"*n0
  154. mov %rdx,$hi0
  155. por %xmm2,%xmm0
  156. lea $STRIDE($bp),$bp
  157. por %xmm3,%xmm0
  158. mulq $m1 # np[0]*m1
  159. add %rax,$lo0 # discarded
  160. mov 8($ap),%rax
  161. adc \$0,%rdx
  162. mov %rdx,$hi1
  163. lea 1($j),$j # j++
  164. jmp .L1st_enter
  165. .align 16
  166. .L1st:
  167. add %rax,$hi1
  168. mov ($ap,$j,8),%rax
  169. adc \$0,%rdx
  170. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  171. mov $lo0,$hi0
  172. adc \$0,%rdx
  173. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  174. mov %rdx,$hi1
  175. .L1st_enter:
  176. mulq $m0 # ap[j]*bp[0]
  177. add %rax,$hi0
  178. mov ($np,$j,8),%rax
  179. adc \$0,%rdx
  180. lea 1($j),$j # j++
  181. mov %rdx,$lo0
  182. mulq $m1 # np[j]*m1
  183. cmp $num,$j
  184. jne .L1st
  185. movq %xmm0,$m0 # bp[1]
  186. add %rax,$hi1
  187. mov ($ap),%rax # ap[0]
  188. adc \$0,%rdx
  189. add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
  190. adc \$0,%rdx
  191. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  192. mov %rdx,$hi1
  193. mov $lo0,$hi0
  194. xor %rdx,%rdx
  195. add $hi0,$hi1
  196. adc \$0,%rdx
  197. mov $hi1,-8(%rsp,$num,8)
  198. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  199. lea 1($i),$i # i++
  200. jmp .Louter
  201. .align 16
  202. .Louter:
  203. xor $j,$j # j=0
  204. mov $n0,$m1
  205. mov (%rsp),$lo0
  206. movq `0*$STRIDE/4-96`($bp),%xmm0
  207. movq `1*$STRIDE/4-96`($bp),%xmm1
  208. pand %xmm4,%xmm0
  209. movq `2*$STRIDE/4-96`($bp),%xmm2
  210. pand %xmm5,%xmm1
  211. mulq $m0 # ap[0]*bp[i]
  212. add %rax,$lo0 # ap[0]*bp[i]+tp[0]
  213. mov ($np),%rax
  214. adc \$0,%rdx
  215. movq `3*$STRIDE/4-96`($bp),%xmm3
  216. pand %xmm6,%xmm2
  217. por %xmm1,%xmm0
  218. pand %xmm7,%xmm3
  219. imulq $lo0,$m1 # tp[0]*n0
  220. mov %rdx,$hi0
  221. por %xmm2,%xmm0
  222. lea $STRIDE($bp),$bp
  223. por %xmm3,%xmm0
  224. mulq $m1 # np[0]*m1
  225. add %rax,$lo0 # discarded
  226. mov 8($ap),%rax
  227. adc \$0,%rdx
  228. mov 8(%rsp),$lo0 # tp[1]
  229. mov %rdx,$hi1
  230. lea 1($j),$j # j++
  231. jmp .Linner_enter
  232. .align 16
  233. .Linner:
  234. add %rax,$hi1
  235. mov ($ap,$j,8),%rax
  236. adc \$0,%rdx
  237. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  238. mov (%rsp,$j,8),$lo0
  239. adc \$0,%rdx
  240. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  241. mov %rdx,$hi1
  242. .Linner_enter:
  243. mulq $m0 # ap[j]*bp[i]
  244. add %rax,$hi0
  245. mov ($np,$j,8),%rax
  246. adc \$0,%rdx
  247. add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
  248. mov %rdx,$hi0
  249. adc \$0,$hi0
  250. lea 1($j),$j # j++
  251. mulq $m1 # np[j]*m1
  252. cmp $num,$j
  253. jne .Linner
  254. movq %xmm0,$m0 # bp[i+1]
  255. add %rax,$hi1
  256. mov ($ap),%rax # ap[0]
  257. adc \$0,%rdx
  258. add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
  259. mov (%rsp,$j,8),$lo0
  260. adc \$0,%rdx
  261. mov $hi1,-16(%rsp,$j,8) # tp[j-1]
  262. mov %rdx,$hi1
  263. xor %rdx,%rdx
  264. add $hi0,$hi1
  265. adc \$0,%rdx
  266. add $lo0,$hi1 # pull upmost overflow bit
  267. adc \$0,%rdx
  268. mov $hi1,-8(%rsp,$num,8)
  269. mov %rdx,(%rsp,$num,8) # store upmost overflow bit
  270. lea 1($i),$i # i++
  271. cmp $num,$i
  272. jb .Louter
  273. xor $i,$i # i=0 and clear CF!
  274. mov (%rsp),%rax # tp[0]
  275. lea (%rsp),$ap # borrow ap for tp
  276. mov $num,$j # j=num
  277. jmp .Lsub
  278. .align 16
  279. .Lsub: sbb ($np,$i,8),%rax
  280. mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
  281. mov 8($ap,$i,8),%rax # tp[i+1]
  282. lea 1($i),$i # i++
  283. dec $j # doesn't affect CF!
  284. jnz .Lsub
  285. sbb \$0,%rax # handle upmost overflow bit
  286. xor $i,$i
  287. and %rax,$ap
  288. not %rax
  289. mov $rp,$np
  290. and %rax,$np
  291. mov $num,$j # j=num
  292. or $np,$ap # ap=borrow?tp:rp
  293. .align 16
  294. .Lcopy: # copy or in-place refresh
  295. mov ($ap,$i,8),%rax
  296. mov $i,(%rsp,$i,8) # zap temporary vector
  297. mov %rax,($rp,$i,8) # rp[i]=tp[i]
  298. lea 1($i),$i
  299. sub \$1,$j
  300. jnz .Lcopy
  301. mov 8(%rsp,$num,8),%rsi # restore %rsp
  302. mov \$1,%rax
  303. ___
  304. $code.=<<___ if ($win64);
  305. movaps -88(%rsi),%xmm6
  306. movaps -72(%rsi),%xmm7
  307. ___
  308. $code.=<<___;
  309. mov -48(%rsi),%r15
  310. mov -40(%rsi),%r14
  311. mov -32(%rsi),%r13
  312. mov -24(%rsi),%r12
  313. mov -16(%rsi),%rbp
  314. mov -8(%rsi),%rbx
  315. lea (%rsi),%rsp
  316. .Lmul_epilogue:
  317. ret
  318. .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
  319. ___
  320. {{{
  321. my @A=("%r10","%r11");
  322. my @N=("%r13","%rdi");
  323. $code.=<<___;
  324. .type bn_mul4x_mont_gather5,\@function,6
  325. .align 32
  326. bn_mul4x_mont_gather5:
  327. .Lmul4x_enter:
  328. ___
  329. $code.=<<___ if ($addx);
  330. and \$0x80100,%r11d
  331. cmp \$0x80100,%r11d
  332. je .Lmulx4x_enter
  333. ___
  334. $code.=<<___;
  335. .byte 0x67
  336. mov %rsp,%rax
  337. push %rbx
  338. push %rbp
  339. push %r12
  340. push %r13
  341. push %r14
  342. push %r15
  343. ___
  344. $code.=<<___ if ($win64);
  345. lea -0x28(%rsp),%rsp
  346. movaps %xmm6,(%rsp)
  347. movaps %xmm7,0x10(%rsp)
  348. ___
  349. $code.=<<___;
  350. .byte 0x67
  351. mov ${num}d,%r10d
  352. shl \$3,${num}d
  353. shl \$3+2,%r10d # 4*$num
  354. neg $num # -$num
  355. ##############################################################
  356. # ensure that stack frame doesn't alias with $aptr+4*$num
  357. # modulo 4096, which covers ret[num], am[num] and n[2*num]
  358. # (see bn_exp.c). this is done to allow memory disambiguation
  359. # logic to do its magic. [excessive frame is allocated in order
  360. # to allow bn_from_mont8x to clear it.]
  361. #
  362. lea -64(%rsp,$num,2),%r11
  363. sub $ap,%r11
  364. and \$4095,%r11
  365. cmp %r11,%r10
  366. jb .Lmul4xsp_alt
  367. sub %r11,%rsp # align with $ap
  368. lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
  369. jmp .Lmul4xsp_done
  370. .align 32
  371. .Lmul4xsp_alt:
  372. lea 4096-64(,$num,2),%r10
  373. lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
  374. sub %r10,%r11
  375. mov \$0,%r10
  376. cmovc %r10,%r11
  377. sub %r11,%rsp
  378. .Lmul4xsp_done:
  379. and \$-64,%rsp
  380. neg $num
  381. mov %rax,40(%rsp)
  382. .Lmul4x_body:
  383. call mul4x_internal
  384. mov 40(%rsp),%rsi # restore %rsp
  385. mov \$1,%rax
  386. ___
  387. $code.=<<___ if ($win64);
  388. movaps -88(%rsi),%xmm6
  389. movaps -72(%rsi),%xmm7
  390. ___
  391. $code.=<<___;
  392. mov -48(%rsi),%r15
  393. mov -40(%rsi),%r14
  394. mov -32(%rsi),%r13
  395. mov -24(%rsi),%r12
  396. mov -16(%rsi),%rbp
  397. mov -8(%rsi),%rbx
  398. lea (%rsi),%rsp
  399. .Lmul4x_epilogue:
  400. ret
  401. .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
  402. .type mul4x_internal,\@abi-omnipotent
  403. .align 32
  404. mul4x_internal:
  405. shl \$5,$num
  406. mov `($win64?56:8)`(%rax),%r10d # load 7th argument
  407. lea 256(%rdx,$num),%r13
  408. shr \$5,$num # restore $num
  409. ___
  410. $bp="%r12";
  411. $STRIDE=2**5*8; # 5 is "window size"
  412. $N=$STRIDE/4; # should match cache line size
  413. $tp=$i;
  414. $code.=<<___;
  415. mov %r10,%r11
  416. shr \$`log($N/8)/log(2)`,%r10
  417. and \$`$N/8-1`,%r11
  418. not %r10
  419. lea .Lmagic_masks(%rip),%rax
  420. and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
  421. lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
  422. movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
  423. movq 8(%rax,%r10,8),%xmm5 # cache line contains element
  424. add \$7,%r11
  425. movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
  426. movq 24(%rax,%r10,8),%xmm7
  427. and \$7,%r11
  428. movq `0*$STRIDE/4-96`($bp),%xmm0
  429. lea $STRIDE($bp),$tp # borrow $tp
  430. movq `1*$STRIDE/4-96`($bp),%xmm1
  431. pand %xmm4,%xmm0
  432. movq `2*$STRIDE/4-96`($bp),%xmm2
  433. pand %xmm5,%xmm1
  434. movq `3*$STRIDE/4-96`($bp),%xmm3
  435. pand %xmm6,%xmm2
  436. .byte 0x67
  437. por %xmm1,%xmm0
  438. movq `0*$STRIDE/4-96`($tp),%xmm1
  439. .byte 0x67
  440. pand %xmm7,%xmm3
  441. .byte 0x67
  442. por %xmm2,%xmm0
  443. movq `1*$STRIDE/4-96`($tp),%xmm2
  444. .byte 0x67
  445. pand %xmm4,%xmm1
  446. .byte 0x67
  447. por %xmm3,%xmm0
  448. movq `2*$STRIDE/4-96`($tp),%xmm3
  449. movq %xmm0,$m0 # m0=bp[0]
  450. movq `3*$STRIDE/4-96`($tp),%xmm0
  451. mov %r13,16+8(%rsp) # save end of b[num]
  452. mov $rp, 56+8(%rsp) # save $rp
  453. mov ($n0),$n0 # pull n0[0] value
  454. mov ($ap),%rax
  455. lea ($ap,$num),$ap # end of a[num]
  456. neg $num
  457. mov $n0,$m1
  458. mulq $m0 # ap[0]*bp[0]
  459. mov %rax,$A[0]
  460. mov ($np),%rax
  461. pand %xmm5,%xmm2
  462. pand %xmm6,%xmm3
  463. por %xmm2,%xmm1
  464. imulq $A[0],$m1 # "tp[0]"*n0
  465. ##############################################################
  466. # $tp is chosen so that writing to top-most element of the
  467. # vector occurs just "above" references to powers table,
  468. # "above" modulo cache-line size, which effectively precludes
  469. # possibility of memory disambiguation logic failure when
  470. # accessing the table.
  471. #
  472. lea 64+8(%rsp,%r11,8),$tp
  473. mov %rdx,$A[1]
  474. pand %xmm7,%xmm0
  475. por %xmm3,%xmm1
  476. lea 2*$STRIDE($bp),$bp
  477. por %xmm1,%xmm0
  478. mulq $m1 # np[0]*m1
  479. add %rax,$A[0] # discarded
  480. mov 8($ap,$num),%rax
  481. adc \$0,%rdx
  482. mov %rdx,$N[1]
  483. mulq $m0
  484. add %rax,$A[1]
  485. mov 16*1($np),%rax # interleaved with 0, therefore 16*n
  486. adc \$0,%rdx
  487. mov %rdx,$A[0]
  488. mulq $m1
  489. add %rax,$N[1]
  490. mov 16($ap,$num),%rax
  491. adc \$0,%rdx
  492. add $A[1],$N[1]
  493. lea 4*8($num),$j # j=4
  494. lea 16*4($np),$np
  495. adc \$0,%rdx
  496. mov $N[1],($tp)
  497. mov %rdx,$N[0]
  498. jmp .L1st4x
  499. .align 32
  500. .L1st4x:
  501. mulq $m0 # ap[j]*bp[0]
  502. add %rax,$A[0]
  503. mov -16*2($np),%rax
  504. lea 32($tp),$tp
  505. adc \$0,%rdx
  506. mov %rdx,$A[1]
  507. mulq $m1 # np[j]*m1
  508. add %rax,$N[0]
  509. mov -8($ap,$j),%rax
  510. adc \$0,%rdx
  511. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  512. adc \$0,%rdx
  513. mov $N[0],-24($tp) # tp[j-1]
  514. mov %rdx,$N[1]
  515. mulq $m0 # ap[j]*bp[0]
  516. add %rax,$A[1]
  517. mov -16*1($np),%rax
  518. adc \$0,%rdx
  519. mov %rdx,$A[0]
  520. mulq $m1 # np[j]*m1
  521. add %rax,$N[1]
  522. mov ($ap,$j),%rax
  523. adc \$0,%rdx
  524. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  525. adc \$0,%rdx
  526. mov $N[1],-16($tp) # tp[j-1]
  527. mov %rdx,$N[0]
  528. mulq $m0 # ap[j]*bp[0]
  529. add %rax,$A[0]
  530. mov 16*0($np),%rax
  531. adc \$0,%rdx
  532. mov %rdx,$A[1]
  533. mulq $m1 # np[j]*m1
  534. add %rax,$N[0]
  535. mov 8($ap,$j),%rax
  536. adc \$0,%rdx
  537. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  538. adc \$0,%rdx
  539. mov $N[0],-8($tp) # tp[j-1]
  540. mov %rdx,$N[1]
  541. mulq $m0 # ap[j]*bp[0]
  542. add %rax,$A[1]
  543. mov 16*1($np),%rax
  544. adc \$0,%rdx
  545. mov %rdx,$A[0]
  546. mulq $m1 # np[j]*m1
  547. add %rax,$N[1]
  548. mov 16($ap,$j),%rax
  549. adc \$0,%rdx
  550. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  551. lea 16*4($np),$np
  552. adc \$0,%rdx
  553. mov $N[1],($tp) # tp[j-1]
  554. mov %rdx,$N[0]
  555. add \$32,$j # j+=4
  556. jnz .L1st4x
  557. mulq $m0 # ap[j]*bp[0]
  558. add %rax,$A[0]
  559. mov -16*2($np),%rax
  560. lea 32($tp),$tp
  561. adc \$0,%rdx
  562. mov %rdx,$A[1]
  563. mulq $m1 # np[j]*m1
  564. add %rax,$N[0]
  565. mov -8($ap),%rax
  566. adc \$0,%rdx
  567. add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
  568. adc \$0,%rdx
  569. mov $N[0],-24($tp) # tp[j-1]
  570. mov %rdx,$N[1]
  571. mulq $m0 # ap[j]*bp[0]
  572. add %rax,$A[1]
  573. mov -16*1($np),%rax
  574. adc \$0,%rdx
  575. mov %rdx,$A[0]
  576. mulq $m1 # np[j]*m1
  577. add %rax,$N[1]
  578. mov ($ap,$num),%rax # ap[0]
  579. adc \$0,%rdx
  580. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
  581. adc \$0,%rdx
  582. mov $N[1],-16($tp) # tp[j-1]
  583. mov %rdx,$N[0]
  584. movq %xmm0,$m0 # bp[1]
  585. lea ($np,$num,2),$np # rewind $np
  586. xor $N[1],$N[1]
  587. add $A[0],$N[0]
  588. adc \$0,$N[1]
  589. mov $N[0],-8($tp)
  590. jmp .Louter4x
  591. .align 32
  592. .Louter4x:
  593. mov ($tp,$num),$A[0]
  594. mov $n0,$m1
  595. mulq $m0 # ap[0]*bp[i]
  596. add %rax,$A[0] # ap[0]*bp[i]+tp[0]
  597. mov ($np),%rax
  598. adc \$0,%rdx
  599. movq `0*$STRIDE/4-96`($bp),%xmm0
  600. movq `1*$STRIDE/4-96`($bp),%xmm1
  601. pand %xmm4,%xmm0
  602. movq `2*$STRIDE/4-96`($bp),%xmm2
  603. pand %xmm5,%xmm1
  604. movq `3*$STRIDE/4-96`($bp),%xmm3
  605. imulq $A[0],$m1 # tp[0]*n0
  606. .byte 0x67
  607. mov %rdx,$A[1]
  608. mov $N[1],($tp) # store upmost overflow bit
  609. pand %xmm6,%xmm2
  610. por %xmm1,%xmm0
  611. pand %xmm7,%xmm3
  612. por %xmm2,%xmm0
  613. lea ($tp,$num),$tp # rewind $tp
  614. lea $STRIDE($bp),$bp
  615. por %xmm3,%xmm0
  616. mulq $m1 # np[0]*m1
  617. add %rax,$A[0] # "$N[0]", discarded
  618. mov 8($ap,$num),%rax
  619. adc \$0,%rdx
  620. mov %rdx,$N[1]
  621. mulq $m0 # ap[j]*bp[i]
  622. add %rax,$A[1]
  623. mov 16*1($np),%rax # interleaved with 0, therefore 16*n
  624. adc \$0,%rdx
  625. add 8($tp),$A[1] # +tp[1]
  626. adc \$0,%rdx
  627. mov %rdx,$A[0]
  628. mulq $m1 # np[j]*m1
  629. add %rax,$N[1]
  630. mov 16($ap,$num),%rax
  631. adc \$0,%rdx
  632. add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
  633. lea 4*8($num),$j # j=4
  634. lea 16*4($np),$np
  635. adc \$0,%rdx
  636. mov %rdx,$N[0]
  637. jmp .Linner4x
  638. .align 32
  639. .Linner4x:
  640. mulq $m0 # ap[j]*bp[i]
  641. add %rax,$A[0]
  642. mov -16*2($np),%rax
  643. adc \$0,%rdx
  644. add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
  645. lea 32($tp),$tp
  646. adc \$0,%rdx
  647. mov %rdx,$A[1]
  648. mulq $m1 # np[j]*m1
  649. add %rax,$N[0]
  650. mov -8($ap,$j),%rax
  651. adc \$0,%rdx
  652. add $A[0],$N[0]
  653. adc \$0,%rdx
  654. mov $N[1],-32($tp) # tp[j-1]
  655. mov %rdx,$N[1]
  656. mulq $m0 # ap[j]*bp[i]
  657. add %rax,$A[1]
  658. mov -16*1($np),%rax
  659. adc \$0,%rdx
  660. add -8($tp),$A[1]
  661. adc \$0,%rdx
  662. mov %rdx,$A[0]
  663. mulq $m1 # np[j]*m1
  664. add %rax,$N[1]
  665. mov ($ap,$j),%rax
  666. adc \$0,%rdx
  667. add $A[1],$N[1]
  668. adc \$0,%rdx
  669. mov $N[0],-24($tp) # tp[j-1]
  670. mov %rdx,$N[0]
  671. mulq $m0 # ap[j]*bp[i]
  672. add %rax,$A[0]
  673. mov 16*0($np),%rax
  674. adc \$0,%rdx
  675. add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
  676. adc \$0,%rdx
  677. mov %rdx,$A[1]
  678. mulq $m1 # np[j]*m1
  679. add %rax,$N[0]
  680. mov 8($ap,$j),%rax
  681. adc \$0,%rdx
  682. add $A[0],$N[0]
  683. adc \$0,%rdx
  684. mov $N[1],-16($tp) # tp[j-1]
  685. mov %rdx,$N[1]
  686. mulq $m0 # ap[j]*bp[i]
  687. add %rax,$A[1]
  688. mov 16*1($np),%rax
  689. adc \$0,%rdx
  690. add 8($tp),$A[1]
  691. adc \$0,%rdx
  692. mov %rdx,$A[0]
  693. mulq $m1 # np[j]*m1
  694. add %rax,$N[1]
  695. mov 16($ap,$j),%rax
  696. adc \$0,%rdx
  697. add $A[1],$N[1]
  698. lea 16*4($np),$np
  699. adc \$0,%rdx
  700. mov $N[0],-8($tp) # tp[j-1]
  701. mov %rdx,$N[0]
  702. add \$32,$j # j+=4
  703. jnz .Linner4x
  704. mulq $m0 # ap[j]*bp[i]
  705. add %rax,$A[0]
  706. mov -16*2($np),%rax
  707. adc \$0,%rdx
  708. add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
  709. lea 32($tp),$tp
  710. adc \$0,%rdx
  711. mov %rdx,$A[1]
  712. mulq $m1 # np[j]*m1
  713. add %rax,$N[0]
  714. mov -8($ap),%rax
  715. adc \$0,%rdx
  716. add $A[0],$N[0]
  717. adc \$0,%rdx
  718. mov $N[1],-32($tp) # tp[j-1]
  719. mov %rdx,$N[1]
  720. mulq $m0 # ap[j]*bp[i]
  721. add %rax,$A[1]
  722. mov $m1,%rax
  723. mov -16*1($np),$m1
  724. adc \$0,%rdx
  725. add -8($tp),$A[1]
  726. adc \$0,%rdx
  727. mov %rdx,$A[0]
  728. mulq $m1 # np[j]*m1
  729. add %rax,$N[1]
  730. mov ($ap,$num),%rax # ap[0]
  731. adc \$0,%rdx
  732. add $A[1],$N[1]
  733. adc \$0,%rdx
  734. mov $N[0],-24($tp) # tp[j-1]
  735. mov %rdx,$N[0]
  736. movq %xmm0,$m0 # bp[i+1]
  737. mov $N[1],-16($tp) # tp[j-1]
  738. lea ($np,$num,2),$np # rewind $np
  739. xor $N[1],$N[1]
  740. add $A[0],$N[0]
  741. adc \$0,$N[1]
  742. add ($tp),$N[0] # pull upmost overflow bit
  743. adc \$0,$N[1] # upmost overflow bit
  744. mov $N[0],-8($tp)
  745. cmp 16+8(%rsp),$bp
  746. jb .Louter4x
  747. ___
  748. if (1) {
  749. $code.=<<___;
  750. sub $N[0],$m1 # compare top-most words
  751. adc $j,$j # $j is zero
  752. or $j,$N[1]
  753. xor \$1,$N[1]
  754. lea ($tp,$num),%rbx # tptr in .sqr4x_sub
  755. lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
  756. mov %r9,%rcx
  757. sar \$3+2,%rcx # cf=0
  758. mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
  759. jmp .Lsqr4x_sub
  760. ___
  761. } else {
  762. my @ri=("%rax",$bp,$m0,$m1);
  763. my $rp="%rdx";
  764. $code.=<<___
  765. xor \$1,$N[1]
  766. lea ($tp,$num),$tp # rewind $tp
  767. sar \$5,$num # cf=0
  768. lea ($np,$N[1],8),$np
  769. mov 56+8(%rsp),$rp # restore $rp
  770. jmp .Lsub4x
  771. .align 32
  772. .Lsub4x:
  773. .byte 0x66
  774. mov 8*0($tp),@ri[0]
  775. mov 8*1($tp),@ri[1]
  776. .byte 0x66
  777. sbb 16*0($np),@ri[0]
  778. mov 8*2($tp),@ri[2]
  779. sbb 16*1($np),@ri[1]
  780. mov 3*8($tp),@ri[3]
  781. lea 4*8($tp),$tp
  782. sbb 16*2($np),@ri[2]
  783. mov @ri[0],8*0($rp)
  784. sbb 16*3($np),@ri[3]
  785. lea 16*4($np),$np
  786. mov @ri[1],8*1($rp)
  787. mov @ri[2],8*2($rp)
  788. mov @ri[3],8*3($rp)
  789. lea 8*4($rp),$rp
  790. inc $num
  791. jnz .Lsub4x
  792. ret
  793. ___
  794. }
  795. $code.=<<___;
  796. .size mul4x_internal,.-mul4x_internal
  797. ___
  798. }}}
  799. {{{
  800. ######################################################################
  801. # void bn_power5(
  802. my $rptr="%rdi"; # BN_ULONG *rptr,
  803. my $aptr="%rsi"; # const BN_ULONG *aptr,
  804. my $bptr="%rdx"; # const void *table,
  805. my $nptr="%rcx"; # const BN_ULONG *nptr,
  806. my $n0 ="%r8"; # const BN_ULONG *n0);
  807. my $num ="%r9"; # int num, has to be divisible by 8
  808. # int pwr
  809. my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
  810. my @A0=("%r10","%r11");
  811. my @A1=("%r12","%r13");
  812. my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
  813. $code.=<<___;
  814. .globl bn_power5
  815. .type bn_power5,\@function,6
  816. .align 32
  817. bn_power5:
  818. ___
  819. $code.=<<___ if ($addx);
  820. mov OPENSSL_ia32cap_P+8(%rip),%r11d
  821. and \$0x80100,%r11d
  822. cmp \$0x80100,%r11d
  823. je .Lpowerx5_enter
  824. ___
  825. $code.=<<___;
  826. mov %rsp,%rax
  827. push %rbx
  828. push %rbp
  829. push %r12
  830. push %r13
  831. push %r14
  832. push %r15
  833. ___
  834. $code.=<<___ if ($win64);
  835. lea -0x28(%rsp),%rsp
  836. movaps %xmm6,(%rsp)
  837. movaps %xmm7,0x10(%rsp)
  838. ___
  839. $code.=<<___;
  840. mov ${num}d,%r10d
  841. shl \$3,${num}d # convert $num to bytes
  842. shl \$3+2,%r10d # 4*$num
  843. neg $num
  844. mov ($n0),$n0 # *n0
  845. ##############################################################
  846. # ensure that stack frame doesn't alias with $aptr+4*$num
  847. # modulo 4096, which covers ret[num], am[num] and n[2*num]
  848. # (see bn_exp.c). this is done to allow memory disambiguation
  849. # logic to do its magic.
  850. #
  851. lea -64(%rsp,$num,2),%r11
  852. sub $aptr,%r11
  853. and \$4095,%r11
  854. cmp %r11,%r10
  855. jb .Lpwr_sp_alt
  856. sub %r11,%rsp # align with $aptr
  857. lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
  858. jmp .Lpwr_sp_done
  859. .align 32
  860. .Lpwr_sp_alt:
  861. lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
  862. lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
  863. sub %r10,%r11
  864. mov \$0,%r10
  865. cmovc %r10,%r11
  866. sub %r11,%rsp
  867. .Lpwr_sp_done:
  868. and \$-64,%rsp
  869. mov $num,%r10
  870. neg $num
  871. ##############################################################
  872. # Stack layout
  873. #
  874. # +0 saved $num, used in reduction section
  875. # +8 &t[2*$num], used in reduction section
  876. # +32 saved *n0
  877. # +40 saved %rsp
  878. # +48 t[2*$num]
  879. #
  880. mov $n0, 32(%rsp)
  881. mov %rax, 40(%rsp) # save original %rsp
  882. .Lpower5_body:
  883. movq $rptr,%xmm1 # save $rptr
  884. movq $nptr,%xmm2 # save $nptr
  885. movq %r10, %xmm3 # -$num
  886. movq $bptr,%xmm4
  887. call __bn_sqr8x_internal
  888. call __bn_sqr8x_internal
  889. call __bn_sqr8x_internal
  890. call __bn_sqr8x_internal
  891. call __bn_sqr8x_internal
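	# Sketch: the five back-to-back squarings raise the accumulator to
	# its 2^5-th power, one squaring per bit of the 5-bit window; the
	# mul4x_internal call below then multiplies in the power of a
	# gathered from the table (the fixed-window step as driven by
	# bn_exp.c).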
  892. movq %xmm2,$nptr
  893. movq %xmm4,$bptr
  894. mov $aptr,$rptr
  895. mov 40(%rsp),%rax
  896. lea 32(%rsp),$n0
  897. call mul4x_internal
  898. mov 40(%rsp),%rsi # restore %rsp
  899. mov \$1,%rax
  900. mov -48(%rsi),%r15
  901. mov -40(%rsi),%r14
  902. mov -32(%rsi),%r13
  903. mov -24(%rsi),%r12
  904. mov -16(%rsi),%rbp
  905. mov -8(%rsi),%rbx
  906. lea (%rsi),%rsp
  907. .Lpower5_epilogue:
  908. ret
  909. .size bn_power5,.-bn_power5
  910. .globl bn_sqr8x_internal
  911. .hidden bn_sqr8x_internal
  912. .type bn_sqr8x_internal,\@abi-omnipotent
  913. .align 32
  914. bn_sqr8x_internal:
  915. __bn_sqr8x_internal:
  916. ##############################################################
  917. # Squaring part:
  918. #
  919. # a) multiply-n-add everything but a[i]*a[i];
  920. # b) shift result of a) by 1 to the left and accumulate
  921. # a[i]*a[i] products;
  922. #
  923. ##############################################################
  924. # a[1]a[0]
  925. # a[2]a[0]
  926. # a[3]a[0]
  927. # a[2]a[1]
  928. # a[4]a[0]
  929. # a[3]a[1]
  930. # a[5]a[0]
  931. # a[4]a[1]
  932. # a[3]a[2]
  933. # a[6]a[0]
  934. # a[5]a[1]
  935. # a[4]a[2]
  936. # a[7]a[0]
  937. # a[6]a[1]
  938. # a[5]a[2]
  939. # a[4]a[3]
  940. # a[7]a[1]
  941. # a[6]a[2]
  942. # a[5]a[3]
  943. # a[7]a[2]
  944. # a[6]a[3]
  945. # a[5]a[4]
  946. # a[7]a[3]
  947. # a[6]a[4]
  948. # a[7]a[4]
  949. # a[6]a[5]
  950. # a[7]a[5]
  951. # a[7]a[6]
  952. # a[1]a[0]
  953. # a[2]a[0]
  954. # a[3]a[0]
  955. # a[4]a[0]
  956. # a[5]a[0]
  957. # a[6]a[0]
  958. # a[7]a[0]
  959. # a[2]a[1]
  960. # a[3]a[1]
  961. # a[4]a[1]
  962. # a[5]a[1]
  963. # a[6]a[1]
  964. # a[7]a[1]
  965. # a[3]a[2]
  966. # a[4]a[2]
  967. # a[5]a[2]
  968. # a[6]a[2]
  969. # a[7]a[2]
  970. # a[4]a[3]
  971. # a[5]a[3]
  972. # a[6]a[3]
  973. # a[7]a[3]
  974. # a[5]a[4]
  975. # a[6]a[4]
  976. # a[7]a[4]
  977. # a[6]a[5]
  978. # a[7]a[5]
  979. # a[7]a[6]
  980. # a[0]a[0]
  981. # a[1]a[1]
  982. # a[2]a[2]
  983. # a[3]a[3]
  984. # a[4]a[4]
  985. # a[5]a[5]
  986. # a[6]a[6]
  987. # a[7]a[7]
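# Put differently, the two passes implement (w = 64 bits per word):
#
#   a^2 = 2 * sum_{i<j} a[i]*a[j]*2^(w*(i+j))  +  sum_i a[i]^2 * 2^(2*w*i)
#
# pass a) accumulates the cross products in the order listed above, and the
# .Lsqr4x_shift_n_add pass doubles that result while folding in the
# diagonal a[i]*a[i] terms.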
  988. lea 32(%r10),$i # $i=-($num-32)
  989. lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
  990. mov $num,$j # $j=$num
  991. # comments apply to $num==8 case
  992. mov -32($aptr,$i),$a0 # a[0]
  993. lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
  994. mov -24($aptr,$i),%rax # a[1]
  995. lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
  996. mov -16($aptr,$i),$ai # a[2]
  997. mov %rax,$a1
  998. mul $a0 # a[1]*a[0]
  999. mov %rax,$A0[0] # a[1]*a[0]
  1000. mov $ai,%rax # a[2]
  1001. mov %rdx,$A0[1]
  1002. mov $A0[0],-24($tptr,$i) # t[1]
  1003. mul $a0 # a[2]*a[0]
  1004. add %rax,$A0[1]
  1005. mov $ai,%rax
  1006. adc \$0,%rdx
  1007. mov $A0[1],-16($tptr,$i) # t[2]
  1008. mov %rdx,$A0[0]
  1009. mov -8($aptr,$i),$ai # a[3]
  1010. mul $a1 # a[2]*a[1]
  1011. mov %rax,$A1[0] # a[2]*a[1]+t[3]
  1012. mov $ai,%rax
  1013. mov %rdx,$A1[1]
  1014. lea ($i),$j
  1015. mul $a0 # a[3]*a[0]
  1016. add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
  1017. mov $ai,%rax
  1018. mov %rdx,$A0[1]
  1019. adc \$0,$A0[1]
  1020. add $A1[0],$A0[0]
  1021. adc \$0,$A0[1]
  1022. mov $A0[0],-8($tptr,$j) # t[3]
  1023. jmp .Lsqr4x_1st
  1024. .align 32
  1025. .Lsqr4x_1st:
  1026. mov ($aptr,$j),$ai # a[4]
  1027. mul $a1 # a[3]*a[1]
  1028. add %rax,$A1[1] # a[3]*a[1]+t[4]
  1029. mov $ai,%rax
  1030. mov %rdx,$A1[0]
  1031. adc \$0,$A1[0]
  1032. mul $a0 # a[4]*a[0]
  1033. add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
  1034. mov $ai,%rax # a[3]
  1035. mov 8($aptr,$j),$ai # a[5]
  1036. mov %rdx,$A0[0]
  1037. adc \$0,$A0[0]
  1038. add $A1[1],$A0[1]
  1039. adc \$0,$A0[0]
  1040. mul $a1 # a[4]*a[3]
  1041. add %rax,$A1[0] # a[4]*a[3]+t[5]
  1042. mov $ai,%rax
  1043. mov $A0[1],($tptr,$j) # t[4]
  1044. mov %rdx,$A1[1]
  1045. adc \$0,$A1[1]
  1046. mul $a0 # a[5]*a[2]
  1047. add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
  1048. mov $ai,%rax
  1049. mov 16($aptr,$j),$ai # a[6]
  1050. mov %rdx,$A0[1]
  1051. adc \$0,$A0[1]
  1052. add $A1[0],$A0[0]
  1053. adc \$0,$A0[1]
  1054. mul $a1 # a[5]*a[3]
  1055. add %rax,$A1[1] # a[5]*a[3]+t[6]
  1056. mov $ai,%rax
  1057. mov $A0[0],8($tptr,$j) # t[5]
  1058. mov %rdx,$A1[0]
  1059. adc \$0,$A1[0]
  1060. mul $a0 # a[6]*a[2]
  1061. add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
  1062. mov $ai,%rax # a[3]
  1063. mov 24($aptr,$j),$ai # a[7]
  1064. mov %rdx,$A0[0]
  1065. adc \$0,$A0[0]
  1066. add $A1[1],$A0[1]
  1067. adc \$0,$A0[0]
  1068. mul $a1 # a[6]*a[5]
  1069. add %rax,$A1[0] # a[6]*a[5]+t[7]
  1070. mov $ai,%rax
  1071. mov $A0[1],16($tptr,$j) # t[6]
  1072. mov %rdx,$A1[1]
  1073. adc \$0,$A1[1]
  1074. lea 32($j),$j
  1075. mul $a0 # a[7]*a[4]
  1076. add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
  1077. mov $ai,%rax
  1078. mov %rdx,$A0[1]
  1079. adc \$0,$A0[1]
  1080. add $A1[0],$A0[0]
  1081. adc \$0,$A0[1]
  1082. mov $A0[0],-8($tptr,$j) # t[7]
  1083. cmp \$0,$j
  1084. jne .Lsqr4x_1st
  1085. mul $a1 # a[7]*a[5]
  1086. add %rax,$A1[1]
  1087. lea 16($i),$i
  1088. adc \$0,%rdx
  1089. add $A0[1],$A1[1]
  1090. adc \$0,%rdx
  1091. mov $A1[1],($tptr) # t[8]
  1092. mov %rdx,$A1[0]
  1093. mov %rdx,8($tptr) # t[9]
  1094. jmp .Lsqr4x_outer
  1095. .align 32
  1096. .Lsqr4x_outer: # comments apply to $num==6 case
  1097. mov -32($aptr,$i),$a0 # a[0]
  1098. lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
  1099. mov -24($aptr,$i),%rax # a[1]
  1100. lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
  1101. mov -16($aptr,$i),$ai # a[2]
  1102. mov %rax,$a1
  1103. mul $a0 # a[1]*a[0]
  1104. mov -24($tptr,$i),$A0[0] # t[1]
  1105. add %rax,$A0[0] # a[1]*a[0]+t[1]
  1106. mov $ai,%rax # a[2]
  1107. adc \$0,%rdx
  1108. mov $A0[0],-24($tptr,$i) # t[1]
  1109. mov %rdx,$A0[1]
  1110. mul $a0 # a[2]*a[0]
  1111. add %rax,$A0[1]
  1112. mov $ai,%rax
  1113. adc \$0,%rdx
  1114. add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
  1115. mov %rdx,$A0[0]
  1116. adc \$0,$A0[0]
  1117. mov $A0[1],-16($tptr,$i) # t[2]
  1118. xor $A1[0],$A1[0]
  1119. mov -8($aptr,$i),$ai # a[3]
  1120. mul $a1 # a[2]*a[1]
  1121. add %rax,$A1[0] # a[2]*a[1]+t[3]
  1122. mov $ai,%rax
  1123. adc \$0,%rdx
  1124. add -8($tptr,$i),$A1[0]
  1125. mov %rdx,$A1[1]
  1126. adc \$0,$A1[1]
  1127. mul $a0 # a[3]*a[0]
  1128. add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
  1129. mov $ai,%rax
  1130. adc \$0,%rdx
  1131. add $A1[0],$A0[0]
  1132. mov %rdx,$A0[1]
  1133. adc \$0,$A0[1]
  1134. mov $A0[0],-8($tptr,$i) # t[3]
  1135. lea ($i),$j
  1136. jmp .Lsqr4x_inner
  1137. .align 32
  1138. .Lsqr4x_inner:
  1139. mov ($aptr,$j),$ai # a[4]
  1140. mul $a1 # a[3]*a[1]
  1141. add %rax,$A1[1] # a[3]*a[1]+t[4]
  1142. mov $ai,%rax
  1143. mov %rdx,$A1[0]
  1144. adc \$0,$A1[0]
  1145. add ($tptr,$j),$A1[1]
  1146. adc \$0,$A1[0]
  1147. .byte 0x67
  1148. mul $a0 # a[4]*a[0]
  1149. add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
  1150. mov $ai,%rax # a[3]
  1151. mov 8($aptr,$j),$ai # a[5]
  1152. mov %rdx,$A0[0]
  1153. adc \$0,$A0[0]
  1154. add $A1[1],$A0[1]
  1155. adc \$0,$A0[0]
  1156. mul $a1 # a[4]*a[3]
  1157. add %rax,$A1[0] # a[4]*a[3]+t[5]
  1158. mov $A0[1],($tptr,$j) # t[4]
  1159. mov $ai,%rax
  1160. mov %rdx,$A1[1]
  1161. adc \$0,$A1[1]
  1162. add 8($tptr,$j),$A1[0]
  1163. lea 16($j),$j # j++
  1164. adc \$0,$A1[1]
  1165. mul $a0 # a[5]*a[2]
  1166. add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
  1167. mov $ai,%rax
  1168. adc \$0,%rdx
  1169. add $A1[0],$A0[0]
  1170. mov %rdx,$A0[1]
  1171. adc \$0,$A0[1]
  1172. mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
  1173. cmp \$0,$j
  1174. jne .Lsqr4x_inner
  1175. .byte 0x67
  1176. mul $a1 # a[5]*a[3]
  1177. add %rax,$A1[1]
  1178. adc \$0,%rdx
  1179. add $A0[1],$A1[1]
  1180. adc \$0,%rdx
  1181. mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
  1182. mov %rdx,$A1[0]
  1183. mov %rdx,8($tptr) # t[7], "preloaded t[3]" below
  1184. add \$16,$i
  1185. jnz .Lsqr4x_outer
  1186. # comments apply to $num==4 case
  1187. mov -32($aptr),$a0 # a[0]
  1188. lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
  1189. mov -24($aptr),%rax # a[1]
  1190. lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
  1191. mov -16($aptr),$ai # a[2]
  1192. mov %rax,$a1
  1193. mul $a0 # a[1]*a[0]
  1194. add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
  1195. mov $ai,%rax # a[2]
  1196. mov %rdx,$A0[1]
  1197. adc \$0,$A0[1]
  1198. mul $a0 # a[2]*a[0]
  1199. add %rax,$A0[1]
  1200. mov $ai,%rax
  1201. mov $A0[0],-24($tptr) # t[1]
  1202. mov %rdx,$A0[0]
  1203. adc \$0,$A0[0]
  1204. add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
  1205. mov -8($aptr),$ai # a[3]
  1206. adc \$0,$A0[0]
  1207. mul $a1 # a[2]*a[1]
  1208. add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
  1209. mov $ai,%rax
  1210. mov $A0[1],-16($tptr) # t[2]
  1211. mov %rdx,$A1[1]
  1212. adc \$0,$A1[1]
  1213. mul $a0 # a[3]*a[0]
  1214. add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
  1215. mov $ai,%rax
  1216. mov %rdx,$A0[1]
  1217. adc \$0,$A0[1]
  1218. add $A1[0],$A0[0]
  1219. adc \$0,$A0[1]
  1220. mov $A0[0],-8($tptr) # t[3]
  1221. mul $a1 # a[3]*a[1]
  1222. add %rax,$A1[1]
  1223. mov -16($aptr),%rax # a[2]
  1224. adc \$0,%rdx
  1225. add $A0[1],$A1[1]
  1226. adc \$0,%rdx
  1227. mov $A1[1],($tptr) # t[4]
  1228. mov %rdx,$A1[0]
  1229. mov %rdx,8($tptr) # t[5]
  1230. mul $ai # a[2]*a[3]
  1231. ___
  1232. {
  1233. my ($shift,$carry)=($a0,$a1);
  1234. my @S=(@A1,$ai,$n0);
  1235. $code.=<<___;
  1236. add \$16,$i
  1237. xor $shift,$shift
  1238. sub $num,$i # $i=16-$num
  1239. xor $carry,$carry
  1240. add $A1[0],%rax # t[5]
  1241. adc \$0,%rdx
  1242. mov %rax,8($tptr) # t[5]
  1243. mov %rdx,16($tptr) # t[6]
  1244. mov $carry,24($tptr) # t[7]
  1245. mov -16($aptr,$i),%rax # a[0]
  1246. lea 48+8(%rsp),$tptr
  1247. xor $A0[0],$A0[0] # t[0]
  1248. mov 8($tptr),$A0[1] # t[1]
  1249. lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
  1250. shr \$63,$A0[0]
  1251. lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
  1252. shr \$63,$A0[1]
  1253. or $A0[0],$S[1] # | t[2*i]>>63
  1254. mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
  1255. mov $A0[1],$shift # shift=t[2*i+1]>>63
  1256. mul %rax # a[i]*a[i]
  1257. neg $carry # mov $carry,cf
  1258. mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
  1259. adc %rax,$S[0]
  1260. mov -8($aptr,$i),%rax # a[i+1] # prefetch
  1261. mov $S[0],($tptr)
  1262. adc %rdx,$S[1]
  1263. lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
  1264. mov $S[1],8($tptr)
  1265. sbb $carry,$carry # mov cf,$carry
  1266. shr \$63,$A0[0]
  1267. lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
  1268. shr \$63,$A0[1]
  1269. or $A0[0],$S[3] # | t[2*i]>>63
  1270. mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
  1271. mov $A0[1],$shift # shift=t[2*i+1]>>63
  1272. mul %rax # a[i]*a[i]
  1273. neg $carry # mov $carry,cf
  1274. mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
  1275. adc %rax,$S[2]
  1276. mov 0($aptr,$i),%rax # a[i+1] # prefetch
  1277. mov $S[2],16($tptr)
  1278. adc %rdx,$S[3]
  1279. lea 16($i),$i
  1280. mov $S[3],24($tptr)
  1281. sbb $carry,$carry # mov cf,$carry
  1282. lea 64($tptr),$tptr
  1283. jmp .Lsqr4x_shift_n_add
  1284. .align 32
  1285. .Lsqr4x_shift_n_add:
  1286. lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
  1287. shr \$63,$A0[0]
  1288. lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
  1289. shr \$63,$A0[1]
  1290. or $A0[0],$S[1] # | t[2*i]>>63
  1291. mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
  1292. mov $A0[1],$shift # shift=t[2*i+1]>>63
  1293. mul %rax # a[i]*a[i]
  1294. neg $carry # mov $carry,cf
  1295. mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
  1296. adc %rax,$S[0]
  1297. mov -8($aptr,$i),%rax # a[i+1] # prefetch
  1298. mov $S[0],-32($tptr)
  1299. adc %rdx,$S[1]
  1300. lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
  1301. mov $S[1],-24($tptr)
  1302. sbb $carry,$carry # mov cf,$carry
  1303. shr \$63,$A0[0]
  1304. lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
  1305. shr \$63,$A0[1]
  1306. or $A0[0],$S[3] # | t[2*i]>>63
  1307. mov 0($tptr),$A0[0] # t[2*i+2] # prefetch
  1308. mov $A0[1],$shift # shift=t[2*i+1]>>63
  1309. mul %rax # a[i]*a[i]
  1310. neg $carry # mov $carry,cf
  1311. mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch
  1312. adc %rax,$S[2]
  1313. mov 0($aptr,$i),%rax # a[i+1] # prefetch
  1314. mov $S[2],-16($tptr)
  1315. adc %rdx,$S[3]
  1316. lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
  1317. mov $S[3],-8($tptr)
  1318. sbb $carry,$carry # mov cf,$carry
  1319. shr \$63,$A0[0]
  1320. lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
  1321. shr \$63,$A0[1]
  1322. or $A0[0],$S[1] # | t[2*i]>>63
  1323. mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
  1324. mov $A0[1],$shift # shift=t[2*i+1]>>63
  1325. mul %rax # a[i]*a[i]
  1326. neg $carry # mov $carry,cf
  1327. mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
  1328. adc %rax,$S[0]
  1329. mov 8($aptr,$i),%rax # a[i+1] # prefetch
  1330. mov $S[0],0($tptr)
  1331. adc %rdx,$S[1]
  1332. lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
  1333. mov $S[1],8($tptr)
  1334. sbb $carry,$carry # mov cf,$carry
  1335. shr \$63,$A0[0]
  1336. lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
  1337. shr \$63,$A0[1]
  1338. or $A0[0],$S[3] # | t[2*i]>>63
  1339. mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
  1340. mov $A0[1],$shift # shift=t[2*i+1]>>63
  1341. mul %rax # a[i]*a[i]
  1342. neg $carry # mov $carry,cf
  1343. mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
  1344. adc %rax,$S[2]
  1345. mov 16($aptr,$i),%rax # a[i+1] # prefetch
  1346. mov $S[2],16($tptr)
  1347. adc %rdx,$S[3]
  1348. mov $S[3],24($tptr)
  1349. sbb $carry,$carry # mov cf,$carry
  1350. lea 64($tptr),$tptr
  1351. add \$32,$i
  1352. jnz .Lsqr4x_shift_n_add
  1353. lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
  1354. .byte 0x67
  1355. shr \$63,$A0[0]
  1356. lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
  1357. shr \$63,$A0[1]
  1358. or $A0[0],$S[1] # | t[2*i]>>63
  1359. mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
  1360. mov $A0[1],$shift # shift=t[2*i+1]>>63
  1361. mul %rax # a[i]*a[i]
  1362. neg $carry # mov $carry,cf
  1363. mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
  1364. adc %rax,$S[0]
  1365. mov -8($aptr),%rax # a[i+1] # prefetch
  1366. mov $S[0],-32($tptr)
  1367. adc %rdx,$S[1]
  1368. lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
  1369. mov $S[1],-24($tptr)
  1370. sbb $carry,$carry # mov cf,$carry
  1371. shr \$63,$A0[0]
  1372. lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
  1373. shr \$63,$A0[1]
  1374. or $A0[0],$S[3] # | t[2*i]>>63
  1375. mul %rax # a[i]*a[i]
  1376. neg $carry # mov $carry,cf
  1377. adc %rax,$S[2]
  1378. adc %rdx,$S[3]
  1379. mov $S[2],-16($tptr)
  1380. mov $S[3],-8($tptr)
  1381. ___
  1382. }
  1383. ######################################################################
  1384. # Montgomery reduction part, "word-by-word" algorithm.
  1385. #
  1386. # This new path is inspired by multiple submissions from Intel, by
  1387. # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
  1388. # Vinodh Gopal...
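#
# Sketch of the word-by-word reduction performed below, with
# n0 = -n[0]^(-1) mod 2^64 kept at 32+8(%rsp); for each word of t[]:
#
#   m   = t[0]*n0 mod 2^64
#   t  += m*n[]			# low word becomes zero
#   t >>= 64			# and is dropped
#
# Eight such steps are fused per .L8x_reduce pass, and the n0*a[i] factors
# are stashed on the stack ("put aside n0*a[i]") so that .L8x_tail can
# replay them against the upper half of t[].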
  1389. {
  1390. my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
  1391. $code.=<<___;
  1392. movq %xmm2,$nptr
  1393. sqr8x_reduction:
  1394. xor %rax,%rax
  1395. lea ($nptr,$num,2),%rcx # end of n[]
  1396. lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
  1397. mov %rcx,0+8(%rsp)
  1398. lea 48+8(%rsp,$num),$tptr # end of initial t[] window
  1399. mov %rdx,8+8(%rsp)
  1400. neg $num
  1401. jmp .L8x_reduction_loop
  1402. .align 32
  1403. .L8x_reduction_loop:
  1404. lea ($tptr,$num),$tptr # start of current t[] window
  1405. .byte 0x66
  1406. mov 8*0($tptr),$m0
  1407. mov 8*1($tptr),%r9
  1408. mov 8*2($tptr),%r10
  1409. mov 8*3($tptr),%r11
  1410. mov 8*4($tptr),%r12
  1411. mov 8*5($tptr),%r13
  1412. mov 8*6($tptr),%r14
  1413. mov 8*7($tptr),%r15
  1414. mov %rax,(%rdx) # store top-most carry bit
  1415. lea 8*8($tptr),$tptr
  1416. .byte 0x67
  1417. mov $m0,%r8
  1418. imulq 32+8(%rsp),$m0 # n0*a[0]
  1419. mov 16*0($nptr),%rax # n[0]
  1420. mov \$8,%ecx
  1421. jmp .L8x_reduce
  1422. .align 32
  1423. .L8x_reduce:
  1424. mulq $m0
  1425. mov 16*1($nptr),%rax # n[1]
  1426. neg %r8
  1427. mov %rdx,%r8
  1428. adc \$0,%r8
  1429. mulq $m0
  1430. add %rax,%r9
  1431. mov 16*2($nptr),%rax
  1432. adc \$0,%rdx
  1433. add %r9,%r8
  1434. mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
  1435. mov %rdx,%r9
  1436. adc \$0,%r9
  1437. mulq $m0
  1438. add %rax,%r10
  1439. mov 16*3($nptr),%rax
  1440. adc \$0,%rdx
  1441. add %r10,%r9
  1442. mov 32+8(%rsp),$carry # pull n0, borrow $carry
  1443. mov %rdx,%r10
  1444. adc \$0,%r10
  1445. mulq $m0
  1446. add %rax,%r11
  1447. mov 16*4($nptr),%rax
  1448. adc \$0,%rdx
  1449. imulq %r8,$carry # modulo-scheduled
  1450. add %r11,%r10
  1451. mov %rdx,%r11
  1452. adc \$0,%r11
  1453. mulq $m0
  1454. add %rax,%r12
  1455. mov 16*5($nptr),%rax
  1456. adc \$0,%rdx
  1457. add %r12,%r11
  1458. mov %rdx,%r12
  1459. adc \$0,%r12
  1460. mulq $m0
  1461. add %rax,%r13
  1462. mov 16*6($nptr),%rax
  1463. adc \$0,%rdx
  1464. add %r13,%r12
  1465. mov %rdx,%r13
  1466. adc \$0,%r13
  1467. mulq $m0
  1468. add %rax,%r14
  1469. mov 16*7($nptr),%rax
  1470. adc \$0,%rdx
  1471. add %r14,%r13
  1472. mov %rdx,%r14
  1473. adc \$0,%r14
  1474. mulq $m0
  1475. mov $carry,$m0 # n0*a[i]
  1476. add %rax,%r15
  1477. mov 16*0($nptr),%rax # n[0]
  1478. adc \$0,%rdx
  1479. add %r15,%r14
  1480. mov %rdx,%r15
  1481. adc \$0,%r15
  1482. dec %ecx
  1483. jnz .L8x_reduce
  1484. lea 16*8($nptr),$nptr
  1485. xor %rax,%rax
  1486. mov 8+8(%rsp),%rdx # pull end of t[]
  1487. cmp 0+8(%rsp),$nptr # end of n[]?
  1488. jae .L8x_no_tail
  1489. .byte 0x66
  1490. add 8*0($tptr),%r8
  1491. adc 8*1($tptr),%r9
  1492. adc 8*2($tptr),%r10
  1493. adc 8*3($tptr),%r11
  1494. adc 8*4($tptr),%r12
  1495. adc 8*5($tptr),%r13
  1496. adc 8*6($tptr),%r14
  1497. adc 8*7($tptr),%r15
  1498. sbb $carry,$carry # top carry
  1499. mov 48+56+8(%rsp),$m0 # pull n0*a[0]
  1500. mov \$8,%ecx
  1501. mov 16*0($nptr),%rax
  1502. jmp .L8x_tail
  1503. .align 32
  1504. .L8x_tail:
  1505. mulq $m0
  1506. add %rax,%r8
  1507. mov 16*1($nptr),%rax
  1508. mov %r8,($tptr) # save result
  1509. mov %rdx,%r8
  1510. adc \$0,%r8
  1511. mulq $m0
  1512. add %rax,%r9
  1513. mov 16*2($nptr),%rax
  1514. adc \$0,%rdx
  1515. add %r9,%r8
  1516. lea 8($tptr),$tptr # $tptr++
  1517. mov %rdx,%r9
  1518. adc \$0,%r9
  1519. mulq $m0
  1520. add %rax,%r10
  1521. mov 16*3($nptr),%rax
  1522. adc \$0,%rdx
  1523. add %r10,%r9
  1524. mov %rdx,%r10
  1525. adc \$0,%r10
  1526. mulq $m0
  1527. add %rax,%r11
  1528. mov 16*4($nptr),%rax
  1529. adc \$0,%rdx
  1530. add %r11,%r10
  1531. mov %rdx,%r11
  1532. adc \$0,%r11
  1533. mulq $m0
  1534. add %rax,%r12
  1535. mov 16*5($nptr),%rax
  1536. adc \$0,%rdx
  1537. add %r12,%r11
  1538. mov %rdx,%r12
  1539. adc \$0,%r12
  1540. mulq $m0
  1541. add %rax,%r13
  1542. mov 16*6($nptr),%rax
  1543. adc \$0,%rdx
  1544. add %r13,%r12
  1545. mov %rdx,%r13
  1546. adc \$0,%r13
  1547. mulq $m0
  1548. add %rax,%r14
  1549. mov 16*7($nptr),%rax
  1550. adc \$0,%rdx
  1551. add %r14,%r13
  1552. mov %rdx,%r14
  1553. adc \$0,%r14
  1554. mulq $m0
  1555. mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
  1556. add %rax,%r15
  1557. adc \$0,%rdx
  1558. add %r15,%r14
  1559. mov 16*0($nptr),%rax # pull n[0]
  1560. mov %rdx,%r15
  1561. adc \$0,%r15
  1562. dec %ecx
  1563. jnz .L8x_tail
  1564. lea 16*8($nptr),$nptr
  1565. mov 8+8(%rsp),%rdx # pull end of t[]
  1566. cmp 0+8(%rsp),$nptr # end of n[]?
  1567. jae .L8x_tail_done # break out of loop
  1568. mov 48+56+8(%rsp),$m0 # pull n0*a[0]
  1569. neg $carry
  1570. mov 8*0($nptr),%rax # pull n[0]
  1571. adc 8*0($tptr),%r8
  1572. adc 8*1($tptr),%r9
  1573. adc 8*2($tptr),%r10
  1574. adc 8*3($tptr),%r11
  1575. adc 8*4($tptr),%r12
  1576. adc 8*5($tptr),%r13
  1577. adc 8*6($tptr),%r14
  1578. adc 8*7($tptr),%r15
  1579. sbb $carry,$carry # top carry
  1580. mov \$8,%ecx
  1581. jmp .L8x_tail
  1582. .align 32
  1583. .L8x_tail_done:
  1584. add (%rdx),%r8 # can this overflow?
  1585. adc \$0,%r9
  1586. adc \$0,%r10
  1587. adc \$0,%r11
  1588. adc \$0,%r12
  1589. adc \$0,%r13
  1590. adc \$0,%r14
  1591. adc \$0,%r15 # can't overflow, because we
  1592. # started with "overhung" part
  1593. # of multiplication
  1594. xor %rax,%rax
  1595. neg $carry
  1596. .L8x_no_tail:
  1597. adc 8*0($tptr),%r8
  1598. adc 8*1($tptr),%r9
  1599. adc 8*2($tptr),%r10
  1600. adc 8*3($tptr),%r11
  1601. adc 8*4($tptr),%r12
  1602. adc 8*5($tptr),%r13
  1603. adc 8*6($tptr),%r14
  1604. adc 8*7($tptr),%r15
  1605. adc \$0,%rax # top-most carry
  1606. mov -16($nptr),%rcx # np[num-1]
  1607. xor $carry,$carry
  1608. movq %xmm2,$nptr # restore $nptr
  1609. mov %r8,8*0($tptr) # store top 512 bits
  1610. mov %r9,8*1($tptr)
  1611. movq %xmm3,$num # $num is %r9, can't be moved upwards
  1612. mov %r10,8*2($tptr)
  1613. mov %r11,8*3($tptr)
  1614. mov %r12,8*4($tptr)
  1615. mov %r13,8*5($tptr)
  1616. mov %r14,8*6($tptr)
  1617. mov %r15,8*7($tptr)
  1618. lea 8*8($tptr),$tptr
  1619. cmp %rdx,$tptr # end of t[]?
  1620. jb .L8x_reduction_loop
  1621. ___
  1622. }
  1623. ##############################################################
  1624. # Post-condition, 4x unrolled
  1625. #
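#
# The net effect of the 4x-unrolled loop below is the usual final step of
# Montgomery reduction: subtract the modulus once if the accumulated value
# is not smaller than it. A whole-number Perl model (illustrative only,
# never called; arguments assumed to be Math::BigInt):
sub _mont_final_sub_model {
    my ($t,$n)=@_;                              # 0 <= $t < 2*$n on entry
    return $t->bcmp($n)>=0 ? $t->copy()->bsub($n) : $t->copy();
}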
  1626. {
  1627. my ($tptr,$nptr)=("%rbx","%rbp");
  1628. $code.=<<___;
  1629. #xor %rsi,%rsi # %rsi was $carry above
  1630. sub %r15,%rcx # compare top-most words
  1631. lea (%rdi,$num),$tptr # %rdi was $tptr above
  1632. adc %rsi,%rsi
  1633. mov $num,%rcx
  1634. or %rsi,%rax
  1635. movq %xmm1,$rptr # restore $rptr
  1636. xor \$1,%rax
  1637. movq %xmm1,$aptr # prepare for back-to-back call
  1638. lea ($nptr,%rax,8),$nptr
  1639. sar \$3+2,%rcx # cf=0
  1640. jmp .Lsqr4x_sub
  1641. .align 32
  1642. .Lsqr4x_sub:
  1643. .byte 0x66
  1644. mov 8*0($tptr),%r12
  1645. mov 8*1($tptr),%r13
  1646. sbb 16*0($nptr),%r12
  1647. mov 8*2($tptr),%r14
  1648. sbb 16*1($nptr),%r13
  1649. mov 8*3($tptr),%r15
  1650. lea 8*4($tptr),$tptr
  1651. sbb 16*2($nptr),%r14
  1652. mov %r12,8*0($rptr)
  1653. sbb 16*3($nptr),%r15
  1654. lea 16*4($nptr),$nptr
  1655. mov %r13,8*1($rptr)
  1656. mov %r14,8*2($rptr)
  1657. mov %r15,8*3($rptr)
  1658. lea 8*4($rptr),$rptr
  1659. inc %rcx # pass %cf
  1660. jnz .Lsqr4x_sub
  1661. ___
  1662. }
  1663. $code.=<<___;
  1664. mov $num,%r10 # prepare for back-to-back call
  1665. neg $num # restore $num
  1666. ret
  1667. .size bn_sqr8x_internal,.-bn_sqr8x_internal
  1668. ___
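#
# bn_from_montgomery below takes a value out of the Montgomery domain: the
# .Lmul_by_1 loop copies the input into t[] and zero-extends it to 2*num
# words, after which a single 8x reduction pass is run, i.e. the result is
# the input multiplied by R^-1 = 2^(-64*num) mod n. A hedged whole-number
# model (illustrative only, never called; Math::BigInt arguments assumed):
sub _from_montgomery_model {
    my ($a,$n,$num)=@_;                         # $a in Montgomery form
    my $Rinv=Math::BigInt->new(1)->blsft(64*$num)->bmodinv($n);
    return ($a*$Rinv)->bmod($n);
}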
  1669. {
  1670. $code.=<<___;
  1671. .globl bn_from_montgomery
  1672. .type bn_from_montgomery,\@abi-omnipotent
  1673. .align 32
  1674. bn_from_montgomery:
  1675. testl \$7,`($win64?"48(%rsp)":"%r9d")`
  1676. jz bn_from_mont8x
  1677. xor %eax,%eax
  1678. ret
  1679. .size bn_from_montgomery,.-bn_from_montgomery
  1680. .type bn_from_mont8x,\@function,6
  1681. .align 32
  1682. bn_from_mont8x:
  1683. .byte 0x67
  1684. mov %rsp,%rax
  1685. push %rbx
  1686. push %rbp
  1687. push %r12
  1688. push %r13
  1689. push %r14
  1690. push %r15
  1691. ___
  1692. $code.=<<___ if ($win64);
  1693. lea -0x28(%rsp),%rsp
  1694. movaps %xmm6,(%rsp)
  1695. movaps %xmm7,0x10(%rsp)
  1696. ___
  1697. $code.=<<___;
  1698. .byte 0x67
  1699. mov ${num}d,%r10d
  1700. shl \$3,${num}d # convert $num to bytes
  1701. shl \$3+2,%r10d # 4*$num
  1702. neg $num
  1703. mov ($n0),$n0 # *n0
  1704. ##############################################################
  1705. # ensure that stack frame doesn't alias with $aptr+4*$num
  1706. # modulo 4096, which covers ret[num], am[num] and n[2*num]
  1707. # (see bn_exp.c). this is done to allow memory disambiguation
# logic to do its magic.
  1709. #
  1710. lea -64(%rsp,$num,2),%r11
  1711. sub $aptr,%r11
  1712. and \$4095,%r11
  1713. cmp %r11,%r10
  1714. jb .Lfrom_sp_alt
  1715. sub %r11,%rsp # align with $aptr
  1716. lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
  1717. jmp .Lfrom_sp_done
  1718. .align 32
  1719. .Lfrom_sp_alt:
  1720. lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
  1721. lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
  1722. sub %r10,%r11
  1723. mov \$0,%r10
  1724. cmovc %r10,%r11
  1725. sub %r11,%rsp
  1726. .Lfrom_sp_done:
  1727. and \$-64,%rsp
  1728. mov $num,%r10
  1729. neg $num
  1730. ##############################################################
  1731. # Stack layout
  1732. #
  1733. # +0 saved $num, used in reduction section
  1734. # +8 &t[2*$num], used in reduction section
  1735. # +32 saved *n0
  1736. # +40 saved %rsp
  1737. # +48 t[2*$num]
  1738. #
  1739. mov $n0, 32(%rsp)
  1740. mov %rax, 40(%rsp) # save original %rsp
  1741. .Lfrom_body:
  1742. mov $num,%r11
  1743. lea 48(%rsp),%rax
  1744. pxor %xmm0,%xmm0
  1745. jmp .Lmul_by_1
  1746. .align 32
  1747. .Lmul_by_1:
  1748. movdqu ($aptr),%xmm1
  1749. movdqu 16($aptr),%xmm2
  1750. movdqu 32($aptr),%xmm3
  1751. movdqa %xmm0,(%rax,$num)
  1752. movdqu 48($aptr),%xmm4
  1753. movdqa %xmm0,16(%rax,$num)
  1754. .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
  1755. movdqa %xmm1,(%rax)
  1756. movdqa %xmm0,32(%rax,$num)
  1757. movdqa %xmm2,16(%rax)
  1758. movdqa %xmm0,48(%rax,$num)
  1759. movdqa %xmm3,32(%rax)
  1760. movdqa %xmm4,48(%rax)
  1761. lea 64(%rax),%rax
  1762. sub \$64,%r11
  1763. jnz .Lmul_by_1
  1764. movq $rptr,%xmm1
  1765. movq $nptr,%xmm2
  1766. .byte 0x67
  1767. mov $nptr,%rbp
  1768. movq %r10, %xmm3 # -num
  1769. ___
  1770. $code.=<<___ if ($addx);
  1771. mov OPENSSL_ia32cap_P+8(%rip),%r11d
  1772. and \$0x80100,%r11d
  1773. cmp \$0x80100,%r11d
  1774. jne .Lfrom_mont_nox
  1775. lea (%rax,$num),$rptr
  1776. call sqrx8x_reduction
  1777. pxor %xmm0,%xmm0
  1778. lea 48(%rsp),%rax
  1779. mov 40(%rsp),%rsi # restore %rsp
  1780. jmp .Lfrom_mont_zero
  1781. .align 32
  1782. .Lfrom_mont_nox:
  1783. ___
  1784. $code.=<<___;
  1785. call sqr8x_reduction
  1786. pxor %xmm0,%xmm0
  1787. lea 48(%rsp),%rax
  1788. mov 40(%rsp),%rsi # restore %rsp
  1789. jmp .Lfrom_mont_zero
  1790. .align 32
  1791. .Lfrom_mont_zero:
  1792. movdqa %xmm0,16*0(%rax)
  1793. movdqa %xmm0,16*1(%rax)
  1794. movdqa %xmm0,16*2(%rax)
  1795. movdqa %xmm0,16*3(%rax)
  1796. lea 16*4(%rax),%rax
  1797. sub \$32,$num
  1798. jnz .Lfrom_mont_zero
  1799. mov \$1,%rax
  1800. mov -48(%rsi),%r15
  1801. mov -40(%rsi),%r14
  1802. mov -32(%rsi),%r13
  1803. mov -24(%rsi),%r12
  1804. mov -16(%rsi),%rbp
  1805. mov -8(%rsi),%rbx
  1806. lea (%rsi),%rsp
  1807. .Lfrom_epilogue:
  1808. ret
  1809. .size bn_from_mont8x,.-bn_from_mont8x
  1810. ___
  1811. }
  1812. }}}
  1813. if ($addx) {{{
  1814. my $bp="%rdx"; # restore original value
  1815. $code.=<<___;
  1816. .type bn_mulx4x_mont_gather5,\@function,6
  1817. .align 32
  1818. bn_mulx4x_mont_gather5:
  1819. .Lmulx4x_enter:
  1820. .byte 0x67
  1821. mov %rsp,%rax
  1822. push %rbx
  1823. push %rbp
  1824. push %r12
  1825. push %r13
  1826. push %r14
  1827. push %r15
  1828. ___
  1829. $code.=<<___ if ($win64);
  1830. lea -0x28(%rsp),%rsp
  1831. movaps %xmm6,(%rsp)
  1832. movaps %xmm7,0x10(%rsp)
  1833. ___
  1834. $code.=<<___;
  1835. .byte 0x67
  1836. mov ${num}d,%r10d
  1837. shl \$3,${num}d # convert $num to bytes
  1838. shl \$3+2,%r10d # 4*$num
  1839. neg $num # -$num
  1840. mov ($n0),$n0 # *n0
  1841. ##############################################################
  1842. # ensure that stack frame doesn't alias with $aptr+4*$num
  1843. # modulo 4096, which covers a[num], ret[num] and n[2*num]
  1844. # (see bn_exp.c). this is done to allow memory disambiguation
# logic to do its magic. [excessive frame is allocated in order
  1846. # to allow bn_from_mont8x to clear it.]
  1847. #
  1848. lea -64(%rsp,$num,2),%r11
  1849. sub $ap,%r11
  1850. and \$4095,%r11
  1851. cmp %r11,%r10
  1852. jb .Lmulx4xsp_alt
  1853. sub %r11,%rsp # align with $aptr
  1854. lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
  1855. jmp .Lmulx4xsp_done
  1856. .align 32
  1857. .Lmulx4xsp_alt:
  1858. lea 4096-64(,$num,2),%r10 # 4096-frame-$num
  1859. lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
  1860. sub %r10,%r11
  1861. mov \$0,%r10
  1862. cmovc %r10,%r11
  1863. sub %r11,%rsp
  1864. .Lmulx4xsp_done:
  1865. and \$-64,%rsp # ensure alignment
  1866. ##############################################################
  1867. # Stack layout
  1868. # +0 -num
  1869. # +8 off-loaded &b[i]
  1870. # +16 end of b[num]
  1871. # +24 inner counter
  1872. # +32 saved n0
  1873. # +40 saved %rsp
  1874. # +48
  1875. # +56 saved rp
  1876. # +64 tmp[num+1]
  1877. #
  1878. mov $n0, 32(%rsp) # save *n0
  1879. mov %rax,40(%rsp) # save original %rsp
  1880. .Lmulx4x_body:
  1881. call mulx4x_internal
  1882. mov 40(%rsp),%rsi # restore %rsp
  1883. mov \$1,%rax
  1884. ___
  1885. $code.=<<___ if ($win64);
  1886. movaps -88(%rsi),%xmm6
  1887. movaps -72(%rsi),%xmm7
  1888. ___
  1889. $code.=<<___;
  1890. mov -48(%rsi),%r15
  1891. mov -40(%rsi),%r14
  1892. mov -32(%rsi),%r13
  1893. mov -24(%rsi),%r12
  1894. mov -16(%rsi),%rbp
  1895. mov -8(%rsi),%rbx
  1896. lea (%rsi),%rsp
  1897. .Lmulx4x_epilogue:
  1898. ret
  1899. .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
  1900. .type mulx4x_internal,\@abi-omnipotent
  1901. .align 32
  1902. mulx4x_internal:
  1903. .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
  1904. .byte 0x67
  1905. neg $num # restore $num
  1906. shl \$5,$num
  1907. lea 256($bp,$num),%r13
  1908. shr \$5+5,$num
  1909. mov `($win64?56:8)`(%rax),%r10d # load 7th argument
  1910. sub \$1,$num
  1911. mov %r13,16+8(%rsp) # end of b[num]
  1912. mov $num,24+8(%rsp) # inner counter
  1913. mov $rp, 56+8(%rsp) # save $rp
  1914. ___
  1915. my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
  1916. ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
  1917. my $rptr=$bptr;
  1918. my $STRIDE=2**5*8; # 5 is "window size"
  1919. my $N=$STRIDE/4; # should match cache line size
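#
# Layout recap: bn_scatter5 (further below) stores limb j of table entry
# idx at byte offset 256*j+8*idx, so a 64-byte cache line holds limb j of
# eight consecutive entries and the 32 entries span four cache lines per
# limb. The xmm4-xmm7 masks set up below select which of those four lines
# carries the wanted entry, while the low bits of the index pick the qword
# within the line; all four lines are loaded and OR-ed together on every
# access, so the set of cache lines touched does not depend on the index.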
  1920. $code.=<<___;
  1921. mov %r10,%r11
  1922. shr \$`log($N/8)/log(2)`,%r10
  1923. and \$`$N/8-1`,%r11
  1924. not %r10
  1925. lea .Lmagic_masks(%rip),%rax
  1926. and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
  1927. lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
  1928. movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
  1929. movq 8(%rax,%r10,8),%xmm5 # cache line contains element
  1930. add \$7,%r11
  1931. movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
  1932. movq 24(%rax,%r10,8),%xmm7
  1933. and \$7,%r11
  1934. movq `0*$STRIDE/4-96`($bptr),%xmm0
  1935. lea $STRIDE($bptr),$tptr # borrow $tptr
  1936. movq `1*$STRIDE/4-96`($bptr),%xmm1
  1937. pand %xmm4,%xmm0
  1938. movq `2*$STRIDE/4-96`($bptr),%xmm2
  1939. pand %xmm5,%xmm1
  1940. movq `3*$STRIDE/4-96`($bptr),%xmm3
  1941. pand %xmm6,%xmm2
  1942. por %xmm1,%xmm0
  1943. movq `0*$STRIDE/4-96`($tptr),%xmm1
  1944. pand %xmm7,%xmm3
  1945. por %xmm2,%xmm0
  1946. movq `1*$STRIDE/4-96`($tptr),%xmm2
  1947. por %xmm3,%xmm0
  1948. .byte 0x67,0x67
  1949. pand %xmm4,%xmm1
  1950. movq `2*$STRIDE/4-96`($tptr),%xmm3
  1951. movq %xmm0,%rdx # bp[0]
  1952. movq `3*$STRIDE/4-96`($tptr),%xmm0
  1953. lea 2*$STRIDE($bptr),$bptr # next &b[i]
  1954. pand %xmm5,%xmm2
  1955. .byte 0x67,0x67
  1956. pand %xmm6,%xmm3
  1957. ##############################################################
  1958. # $tptr is chosen so that writing to top-most element of the
  1959. # vector occurs just "above" references to powers table,
  1960. # "above" modulo cache-line size, which effectively precludes
  1961. # possibility of memory disambiguation logic failure when
  1962. # accessing the table.
  1963. #
  1964. lea 64+8*4+8(%rsp,%r11,8),$tptr
  1965. mov %rdx,$bi
  1966. mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
  1967. mulx 1*8($aptr),%r11,%r12 # a[1]*b[0]
  1968. add %rax,%r11
  1969. mulx 2*8($aptr),%rax,%r13 # ...
  1970. adc %rax,%r12
  1971. adc \$0,%r13
  1972. mulx 3*8($aptr),%rax,%r14
  1973. mov $mi,%r15
  1974. imulq 32+8(%rsp),$mi # "t[0]"*n0
  1975. xor $zero,$zero # cf=0, of=0
  1976. mov $mi,%rdx
  1977. por %xmm2,%xmm1
  1978. pand %xmm7,%xmm0
  1979. por %xmm3,%xmm1
  1980. mov $bptr,8+8(%rsp) # off-load &b[i]
  1981. por %xmm1,%xmm0
  1982. .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
  1983. adcx %rax,%r13
  1984. adcx $zero,%r14 # cf=0
  1985. mulx 0*16($nptr),%rax,%r10
  1986. adcx %rax,%r15 # discarded
  1987. adox %r11,%r10
  1988. mulx 1*16($nptr),%rax,%r11
  1989. adcx %rax,%r10
  1990. adox %r12,%r11
  1991. mulx 2*16($nptr),%rax,%r12
  1992. mov 24+8(%rsp),$bptr # counter value
  1993. .byte 0x66
  1994. mov %r10,-8*4($tptr)
  1995. adcx %rax,%r11
  1996. adox %r13,%r12
  1997. mulx 3*16($nptr),%rax,%r15
  1998. .byte 0x67,0x67
  1999. mov $bi,%rdx
  2000. mov %r11,-8*3($tptr)
  2001. adcx %rax,%r12
  2002. adox $zero,%r15 # of=0
  2003. .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
  2004. mov %r12,-8*2($tptr)
  2005. #jmp .Lmulx4x_1st
  2006. .align 32
  2007. .Lmulx4x_1st:
  2008. adcx $zero,%r15 # cf=0, modulo-scheduled
  2009. mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
  2010. adcx %r14,%r10
  2011. mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
  2012. adcx %rax,%r11
  2013. mulx 2*8($aptr),%r12,%rax # ...
  2014. adcx %r14,%r12
  2015. mulx 3*8($aptr),%r13,%r14
  2016. .byte 0x67,0x67
  2017. mov $mi,%rdx
  2018. adcx %rax,%r13
  2019. adcx $zero,%r14 # cf=0
  2020. lea 4*8($aptr),$aptr
  2021. lea 4*8($tptr),$tptr
  2022. adox %r15,%r10
  2023. mulx 0*16($nptr),%rax,%r15
  2024. adcx %rax,%r10
  2025. adox %r15,%r11
  2026. mulx 1*16($nptr),%rax,%r15
  2027. adcx %rax,%r11
  2028. adox %r15,%r12
  2029. mulx 2*16($nptr),%rax,%r15
  2030. mov %r10,-5*8($tptr)
  2031. adcx %rax,%r12
  2032. mov %r11,-4*8($tptr)
  2033. adox %r15,%r13
  2034. mulx 3*16($nptr),%rax,%r15
  2035. mov $bi,%rdx
  2036. mov %r12,-3*8($tptr)
  2037. adcx %rax,%r13
  2038. adox $zero,%r15
  2039. lea 4*16($nptr),$nptr
  2040. mov %r13,-2*8($tptr)
  2041. dec $bptr # of=0, pass cf
  2042. jnz .Lmulx4x_1st
  2043. mov 8(%rsp),$num # load -num
  2044. movq %xmm0,%rdx # bp[1]
  2045. adc $zero,%r15 # modulo-scheduled
  2046. lea ($aptr,$num),$aptr # rewind $aptr
  2047. add %r15,%r14
  2048. mov 8+8(%rsp),$bptr # re-load &b[i]
  2049. adc $zero,$zero # top-most carry
  2050. mov %r14,-1*8($tptr)
  2051. jmp .Lmulx4x_outer
  2052. .align 32
  2053. .Lmulx4x_outer:
  2054. mov $zero,($tptr) # save top-most carry
  2055. lea 4*8($tptr,$num),$tptr # rewind $tptr
  2056. mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
  2057. xor $zero,$zero # cf=0, of=0
  2058. mov %rdx,$bi
  2059. mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
  2060. adox -4*8($tptr),$mi # +t[0]
  2061. adcx %r14,%r11
  2062. mulx 2*8($aptr),%r15,%r13 # ...
  2063. adox -3*8($tptr),%r11
  2064. adcx %r15,%r12
  2065. mulx 3*8($aptr),%rdx,%r14
  2066. adox -2*8($tptr),%r12
  2067. adcx %rdx,%r13
  2068. lea ($nptr,$num,2),$nptr # rewind $nptr
  2069. lea 4*8($aptr),$aptr
  2070. adox -1*8($tptr),%r13
  2071. adcx $zero,%r14
  2072. adox $zero,%r14
  2073. .byte 0x67
  2074. mov $mi,%r15
  2075. imulq 32+8(%rsp),$mi # "t[0]"*n0
  2076. movq `0*$STRIDE/4-96`($bptr),%xmm0
  2077. .byte 0x67,0x67
  2078. mov $mi,%rdx
  2079. movq `1*$STRIDE/4-96`($bptr),%xmm1
  2080. .byte 0x67
  2081. pand %xmm4,%xmm0
  2082. movq `2*$STRIDE/4-96`($bptr),%xmm2
  2083. .byte 0x67
  2084. pand %xmm5,%xmm1
  2085. movq `3*$STRIDE/4-96`($bptr),%xmm3
  2086. add \$$STRIDE,$bptr # next &b[i]
  2087. .byte 0x67
  2088. pand %xmm6,%xmm2
  2089. por %xmm1,%xmm0
  2090. pand %xmm7,%xmm3
  2091. xor $zero,$zero # cf=0, of=0
  2092. mov $bptr,8+8(%rsp) # off-load &b[i]
  2093. mulx 0*16($nptr),%rax,%r10
  2094. adcx %rax,%r15 # discarded
  2095. adox %r11,%r10
  2096. mulx 1*16($nptr),%rax,%r11
  2097. adcx %rax,%r10
  2098. adox %r12,%r11
  2099. mulx 2*16($nptr),%rax,%r12
  2100. adcx %rax,%r11
  2101. adox %r13,%r12
  2102. mulx 3*16($nptr),%rax,%r15
  2103. mov $bi,%rdx
  2104. por %xmm2,%xmm0
  2105. mov 24+8(%rsp),$bptr # counter value
  2106. mov %r10,-8*4($tptr)
  2107. por %xmm3,%xmm0
  2108. adcx %rax,%r12
  2109. mov %r11,-8*3($tptr)
  2110. adox $zero,%r15 # of=0
  2111. mov %r12,-8*2($tptr)
  2112. lea 4*16($nptr),$nptr
  2113. jmp .Lmulx4x_inner
  2114. .align 32
  2115. .Lmulx4x_inner:
  2116. mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
  2117. adcx $zero,%r15 # cf=0, modulo-scheduled
  2118. adox %r14,%r10
  2119. mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
  2120. adcx 0*8($tptr),%r10
  2121. adox %rax,%r11
  2122. mulx 2*8($aptr),%r12,%rax # ...
  2123. adcx 1*8($tptr),%r11
  2124. adox %r14,%r12
  2125. mulx 3*8($aptr),%r13,%r14
  2126. mov $mi,%rdx
  2127. adcx 2*8($tptr),%r12
  2128. adox %rax,%r13
  2129. adcx 3*8($tptr),%r13
  2130. adox $zero,%r14 # of=0
  2131. lea 4*8($aptr),$aptr
  2132. lea 4*8($tptr),$tptr
  2133. adcx $zero,%r14 # cf=0
  2134. adox %r15,%r10
  2135. mulx 0*16($nptr),%rax,%r15
  2136. adcx %rax,%r10
  2137. adox %r15,%r11
  2138. mulx 1*16($nptr),%rax,%r15
  2139. adcx %rax,%r11
  2140. adox %r15,%r12
  2141. mulx 2*16($nptr),%rax,%r15
  2142. mov %r10,-5*8($tptr)
  2143. adcx %rax,%r12
  2144. adox %r15,%r13
  2145. mov %r11,-4*8($tptr)
  2146. mulx 3*16($nptr),%rax,%r15
  2147. mov $bi,%rdx
  2148. lea 4*16($nptr),$nptr
  2149. mov %r12,-3*8($tptr)
  2150. adcx %rax,%r13
  2151. adox $zero,%r15
  2152. mov %r13,-2*8($tptr)
  2153. dec $bptr # of=0, pass cf
  2154. jnz .Lmulx4x_inner
  2155. mov 0+8(%rsp),$num # load -num
  2156. movq %xmm0,%rdx # bp[i+1]
  2157. adc $zero,%r15 # modulo-scheduled
  2158. sub 0*8($tptr),$bptr # pull top-most carry to %cf
  2159. mov 8+8(%rsp),$bptr # re-load &b[i]
  2160. mov 16+8(%rsp),%r10
  2161. adc %r15,%r14
  2162. lea ($aptr,$num),$aptr # rewind $aptr
  2163. adc $zero,$zero # top-most carry
  2164. mov %r14,-1*8($tptr)
  2165. cmp %r10,$bptr
  2166. jb .Lmulx4x_outer
  2167. mov -16($nptr),%r10
  2168. xor %r15,%r15
  2169. sub %r14,%r10 # compare top-most words
  2170. adc %r15,%r15
  2171. or %r15,$zero
  2172. xor \$1,$zero
  2173. lea ($tptr,$num),%rdi # rewind $tptr
  2174. lea ($nptr,$num,2),$nptr # rewind $nptr
  2175. .byte 0x67,0x67
  2176. sar \$3+2,$num # cf=0
  2177. lea ($nptr,$zero,8),%rbp
  2178. mov 56+8(%rsp),%rdx # restore rp
  2179. mov $num,%rcx
  2180. jmp .Lsqrx4x_sub # common post-condition
  2181. .size mulx4x_internal,.-mulx4x_internal
  2182. ___
  2183. } {
  2184. ######################################################################
# void bn_powerx5(
  2186. my $rptr="%rdi"; # BN_ULONG *rptr,
  2187. my $aptr="%rsi"; # const BN_ULONG *aptr,
  2188. my $bptr="%rdx"; # const void *table,
  2189. my $nptr="%rcx"; # const BN_ULONG *nptr,
  2190. my $n0 ="%r8"; # const BN_ULONG *n0);
  2191. my $num ="%r9"; # int num, has to be divisible by 8
  2192. # int pwr);
  2193. my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
  2194. my @A0=("%r10","%r11");
  2195. my @A1=("%r12","%r13");
  2196. my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
  2197. $code.=<<___;
  2198. .type bn_powerx5,\@function,6
  2199. .align 32
  2200. bn_powerx5:
  2201. .Lpowerx5_enter:
  2202. .byte 0x67
  2203. mov %rsp,%rax
  2204. push %rbx
  2205. push %rbp
  2206. push %r12
  2207. push %r13
  2208. push %r14
  2209. push %r15
  2210. ___
  2211. $code.=<<___ if ($win64);
  2212. lea -0x28(%rsp),%rsp
  2213. movaps %xmm6,(%rsp)
  2214. movaps %xmm7,0x10(%rsp)
  2215. ___
  2216. $code.=<<___;
  2217. .byte 0x67
  2218. mov ${num}d,%r10d
  2219. shl \$3,${num}d # convert $num to bytes
  2220. shl \$3+2,%r10d # 4*$num
  2221. neg $num
  2222. mov ($n0),$n0 # *n0
  2223. ##############################################################
  2224. # ensure that stack frame doesn't alias with $aptr+4*$num
  2225. # modulo 4096, which covers ret[num], am[num] and n[2*num]
  2226. # (see bn_exp.c). this is done to allow memory disambiguation
# logic to do its magic.
  2228. #
  2229. lea -64(%rsp,$num,2),%r11
  2230. sub $aptr,%r11
  2231. and \$4095,%r11
  2232. cmp %r11,%r10
  2233. jb .Lpwrx_sp_alt
  2234. sub %r11,%rsp # align with $aptr
  2235. lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
  2236. jmp .Lpwrx_sp_done
  2237. .align 32
  2238. .Lpwrx_sp_alt:
  2239. lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
  2240. lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
  2241. sub %r10,%r11
  2242. mov \$0,%r10
  2243. cmovc %r10,%r11
  2244. sub %r11,%rsp
  2245. .Lpwrx_sp_done:
  2246. and \$-64,%rsp
  2247. mov $num,%r10
  2248. neg $num
  2249. ##############################################################
  2250. # Stack layout
  2251. #
  2252. # +0 saved $num, used in reduction section
  2253. # +8 &t[2*$num], used in reduction section
  2254. # +16 intermediate carry bit
  2255. # +24 top-most carry bit, used in reduction section
  2256. # +32 saved *n0
  2257. # +40 saved %rsp
  2258. # +48 t[2*$num]
  2259. #
  2260. pxor %xmm0,%xmm0
  2261. movq $rptr,%xmm1 # save $rptr
  2262. movq $nptr,%xmm2 # save $nptr
  2263. movq %r10, %xmm3 # -$num
  2264. movq $bptr,%xmm4
  2265. mov $n0, 32(%rsp)
  2266. mov %rax, 40(%rsp) # save original %rsp
  2267. .Lpowerx5_body:
  2268. call __bn_sqrx8x_internal
  2269. call __bn_sqrx8x_internal
  2270. call __bn_sqrx8x_internal
  2271. call __bn_sqrx8x_internal
  2272. call __bn_sqrx8x_internal
  2273. mov %r10,$num # -num
  2274. mov $aptr,$rptr
  2275. movq %xmm2,$nptr
  2276. movq %xmm4,$bptr
  2277. mov 40(%rsp),%rax
  2278. call mulx4x_internal
  2279. mov 40(%rsp),%rsi # restore %rsp
  2280. mov \$1,%rax
  2281. ___
  2282. $code.=<<___ if ($win64);
  2283. movaps -88(%rsi),%xmm6
  2284. movaps -72(%rsi),%xmm7
  2285. ___
  2286. $code.=<<___;
  2287. mov -48(%rsi),%r15
  2288. mov -40(%rsi),%r14
  2289. mov -32(%rsi),%r13
  2290. mov -24(%rsi),%r12
  2291. mov -16(%rsi),%rbp
  2292. mov -8(%rsi),%rbx
  2293. lea (%rsi),%rsp
  2294. .Lpowerx5_epilogue:
  2295. ret
  2296. .size bn_powerx5,.-bn_powerx5
  2297. .globl bn_sqrx8x_internal
  2298. .hidden bn_sqrx8x_internal
  2299. .type bn_sqrx8x_internal,\@abi-omnipotent
  2300. .align 32
  2301. bn_sqrx8x_internal:
  2302. __bn_sqrx8x_internal:
  2303. ##################################################################
  2304. # Squaring part:
  2305. #
  2306. # a) multiply-n-add everything but a[i]*a[i];
  2307. # b) shift result of a) by 1 to the left and accumulate
  2308. # a[i]*a[i] products;
  2309. #
  2310. ##################################################################
  2311. # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
  2312. # a[1]a[0]
  2313. # a[2]a[0]
  2314. # a[3]a[0]
  2315. # a[2]a[1]
  2316. # a[3]a[1]
  2317. # a[3]a[2]
  2318. #
  2319. # a[4]a[0]
  2320. # a[5]a[0]
  2321. # a[6]a[0]
  2322. # a[7]a[0]
  2323. # a[4]a[1]
  2324. # a[5]a[1]
  2325. # a[6]a[1]
  2326. # a[7]a[1]
  2327. # a[4]a[2]
  2328. # a[5]a[2]
  2329. # a[6]a[2]
  2330. # a[7]a[2]
  2331. # a[4]a[3]
  2332. # a[5]a[3]
  2333. # a[6]a[3]
  2334. # a[7]a[3]
  2335. #
  2336. # a[5]a[4]
  2337. # a[6]a[4]
  2338. # a[7]a[4]
  2339. # a[6]a[5]
  2340. # a[7]a[5]
  2341. # a[7]a[6]
  2342. # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
  2343. ___
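#
# The schedule above accumulates the cross products a[i]*a[j] for i<j,
# doubles them, and then folds in the squares a[i]*a[i]. A whole-number
# Perl model of that identity (illustrative only, never called; limbs are
# little-endian Math::BigInt values):
sub _sqr_via_cross_products_model {
    my ($a,$num)=@_;
    my ($cross,$diag)=(Math::BigInt->bzero(),Math::BigInt->bzero());
    for (my $i=0;$i<$num;$i++) {
        $diag+=$a->[$i]*$a->[$i]*Math::BigInt->new(2)->bpow(128*$i);
        for (my $j=$i+1;$j<$num;$j++) {
            $cross+=$a->[$i]*$a->[$j]*Math::BigInt->new(2)->bpow(64*($i+$j));
        }
    }
    return $cross->blsft(1)->badd($diag);       # == (sum a[i]*2^(64*i))^2
}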
  2344. {
  2345. my ($zero,$carry)=("%rbp","%rcx");
  2346. my $aaptr=$zero;
  2347. $code.=<<___;
  2348. lea 48+8(%rsp),$tptr
  2349. lea ($aptr,$num),$aaptr
  2350. mov $num,0+8(%rsp) # save $num
  2351. mov $aaptr,8+8(%rsp) # save end of $aptr
  2352. jmp .Lsqr8x_zero_start
  2353. .align 32
  2354. .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
  2355. .Lsqrx8x_zero:
  2356. .byte 0x3e
  2357. movdqa %xmm0,0*8($tptr)
  2358. movdqa %xmm0,2*8($tptr)
  2359. movdqa %xmm0,4*8($tptr)
  2360. movdqa %xmm0,6*8($tptr)
  2361. .Lsqr8x_zero_start: # aligned at 32
  2362. movdqa %xmm0,8*8($tptr)
  2363. movdqa %xmm0,10*8($tptr)
  2364. movdqa %xmm0,12*8($tptr)
  2365. movdqa %xmm0,14*8($tptr)
  2366. lea 16*8($tptr),$tptr
  2367. sub \$64,$num
  2368. jnz .Lsqrx8x_zero
  2369. mov 0*8($aptr),%rdx # a[0], modulo-scheduled
  2370. #xor %r9,%r9 # t[1], ex-$num, zero already
  2371. xor %r10,%r10
  2372. xor %r11,%r11
  2373. xor %r12,%r12
  2374. xor %r13,%r13
  2375. xor %r14,%r14
  2376. xor %r15,%r15
  2377. lea 48+8(%rsp),$tptr
xor $zero,$zero # cf=0, of=0
  2379. jmp .Lsqrx8x_outer_loop
  2380. .align 32
  2381. .Lsqrx8x_outer_loop:
  2382. mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
  2383. adcx %r9,%r8 # a[1]*a[0]+=t[1]
  2384. adox %rax,%r10
  2385. mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
  2386. adcx %r10,%r9
  2387. adox %rax,%r11
  2388. .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
  2389. adcx %r11,%r10
  2390. adox %rax,%r12
  2391. .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
  2392. adcx %r12,%r11
  2393. adox %rax,%r13
  2394. mulx 5*8($aptr),%r12,%rax
  2395. adcx %r13,%r12
  2396. adox %rax,%r14
  2397. mulx 6*8($aptr),%r13,%rax
  2398. adcx %r14,%r13
  2399. adox %r15,%rax
  2400. mulx 7*8($aptr),%r14,%r15
  2401. mov 1*8($aptr),%rdx # a[1]
  2402. adcx %rax,%r14
  2403. adox $zero,%r15
  2404. adc 8*8($tptr),%r15
  2405. mov %r8,1*8($tptr) # t[1]
  2406. mov %r9,2*8($tptr) # t[2]
  2407. sbb $carry,$carry # mov %cf,$carry
  2408. xor $zero,$zero # cf=0, of=0
  2409. mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
  2410. mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
  2411. adcx %r10,%r8
  2412. adox %rbx,%r9
  2413. mulx 4*8($aptr),%r10,%rbx # ...
  2414. adcx %r11,%r9
  2415. adox %rax,%r10
  2416. .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax
  2417. adcx %r12,%r10
  2418. adox %rbx,%r11
  2419. .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx
  2420. adcx %r13,%r11
  2421. adox %r14,%r12
  2422. .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14
  2423. mov 2*8($aptr),%rdx # a[2]
  2424. adcx %rax,%r12
  2425. adox %rbx,%r13
  2426. adcx %r15,%r13
  2427. adox $zero,%r14 # of=0
  2428. adcx $zero,%r14 # cf=0
  2429. mov %r8,3*8($tptr) # t[3]
  2430. mov %r9,4*8($tptr) # t[4]
  2431. mulx 3*8($aptr),%r8,%rbx # a[3]*a[2]
  2432. mulx 4*8($aptr),%r9,%rax # a[4]*a[2]
  2433. adcx %r10,%r8
  2434. adox %rbx,%r9
  2435. mulx 5*8($aptr),%r10,%rbx # ...
  2436. adcx %r11,%r9
  2437. adox %rax,%r10
  2438. .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax
  2439. adcx %r12,%r10
  2440. adox %r13,%r11
  2441. .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13
  2442. .byte 0x3e
  2443. mov 3*8($aptr),%rdx # a[3]
  2444. adcx %rbx,%r11
  2445. adox %rax,%r12
  2446. adcx %r14,%r12
  2447. mov %r8,5*8($tptr) # t[5]
  2448. mov %r9,6*8($tptr) # t[6]
  2449. mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
  2450. adox $zero,%r13 # of=0
  2451. adcx $zero,%r13 # cf=0
  2452. mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
  2453. adcx %r10,%r8
  2454. adox %rax,%r9
  2455. mulx 6*8($aptr),%r10,%rax # ...
  2456. adcx %r11,%r9
  2457. adox %r12,%r10
  2458. mulx 7*8($aptr),%r11,%r12
  2459. mov 4*8($aptr),%rdx # a[4]
  2460. mov 5*8($aptr),%r14 # a[5]
  2461. adcx %rbx,%r10
  2462. adox %rax,%r11
  2463. mov 6*8($aptr),%r15 # a[6]
  2464. adcx %r13,%r11
  2465. adox $zero,%r12 # of=0
  2466. adcx $zero,%r12 # cf=0
  2467. mov %r8,7*8($tptr) # t[7]
  2468. mov %r9,8*8($tptr) # t[8]
  2469. mulx %r14,%r9,%rax # a[5]*a[4]
  2470. mov 7*8($aptr),%r8 # a[7]
  2471. adcx %r10,%r9
  2472. mulx %r15,%r10,%rbx # a[6]*a[4]
  2473. adox %rax,%r10
  2474. adcx %r11,%r10
  2475. mulx %r8,%r11,%rax # a[7]*a[4]
  2476. mov %r14,%rdx # a[5]
  2477. adox %rbx,%r11
  2478. adcx %r12,%r11
  2479. #adox $zero,%rax # of=0
  2480. adcx $zero,%rax # cf=0
  2481. mulx %r15,%r14,%rbx # a[6]*a[5]
  2482. mulx %r8,%r12,%r13 # a[7]*a[5]
  2483. mov %r15,%rdx # a[6]
  2484. lea 8*8($aptr),$aptr
  2485. adcx %r14,%r11
  2486. adox %rbx,%r12
  2487. adcx %rax,%r12
  2488. adox $zero,%r13
  2489. .byte 0x67,0x67
  2490. mulx %r8,%r8,%r14 # a[7]*a[6]
  2491. adcx %r8,%r13
  2492. adcx $zero,%r14
  2493. cmp 8+8(%rsp),$aptr
  2494. je .Lsqrx8x_outer_break
  2495. neg $carry # mov $carry,%cf
  2496. mov \$-8,%rcx
  2497. mov $zero,%r15
  2498. mov 8*8($tptr),%r8
  2499. adcx 9*8($tptr),%r9 # +=t[9]
  2500. adcx 10*8($tptr),%r10 # ...
  2501. adcx 11*8($tptr),%r11
  2502. adc 12*8($tptr),%r12
  2503. adc 13*8($tptr),%r13
  2504. adc 14*8($tptr),%r14
  2505. adc 15*8($tptr),%r15
  2506. lea ($aptr),$aaptr
  2507. lea 2*64($tptr),$tptr
  2508. sbb %rax,%rax # mov %cf,$carry
  2509. mov -64($aptr),%rdx # a[0]
  2510. mov %rax,16+8(%rsp) # offload $carry
  2511. mov $tptr,24+8(%rsp)
  2512. #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
  2513. xor %eax,%eax # cf=0, of=0
  2514. jmp .Lsqrx8x_loop
  2515. .align 32
  2516. .Lsqrx8x_loop:
  2517. mov %r8,%rbx
  2518. mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i]
  2519. adcx %rax,%rbx # +=t[8]
  2520. adox %r9,%r8
  2521. mulx 1*8($aaptr),%rax,%r9 # ...
  2522. adcx %rax,%r8
  2523. adox %r10,%r9
  2524. mulx 2*8($aaptr),%rax,%r10
  2525. adcx %rax,%r9
  2526. adox %r11,%r10
  2527. mulx 3*8($aaptr),%rax,%r11
  2528. adcx %rax,%r10
  2529. adox %r12,%r11
  2530. .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12
  2531. adcx %rax,%r11
  2532. adox %r13,%r12
  2533. mulx 5*8($aaptr),%rax,%r13
  2534. adcx %rax,%r12
  2535. adox %r14,%r13
  2536. mulx 6*8($aaptr),%rax,%r14
  2537. mov %rbx,($tptr,%rcx,8) # store t[8+i]
  2538. mov \$0,%ebx
  2539. adcx %rax,%r13
  2540. adox %r15,%r14
  2541. .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15
  2542. mov 8($aptr,%rcx,8),%rdx # a[i]
  2543. adcx %rax,%r14
  2544. adox %rbx,%r15 # %rbx is 0, of=0
  2545. adcx %rbx,%r15 # cf=0
  2546. .byte 0x67
  2547. inc %rcx # of=0
  2548. jnz .Lsqrx8x_loop
  2549. lea 8*8($aaptr),$aaptr
  2550. mov \$-8,%rcx
  2551. cmp 8+8(%rsp),$aaptr # done?
  2552. je .Lsqrx8x_break
  2553. sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
  2554. .byte 0x66
  2555. mov -64($aptr),%rdx
  2556. adcx 0*8($tptr),%r8
  2557. adcx 1*8($tptr),%r9
  2558. adc 2*8($tptr),%r10
  2559. adc 3*8($tptr),%r11
  2560. adc 4*8($tptr),%r12
  2561. adc 5*8($tptr),%r13
  2562. adc 6*8($tptr),%r14
  2563. adc 7*8($tptr),%r15
  2564. lea 8*8($tptr),$tptr
  2565. .byte 0x67
  2566. sbb %rax,%rax # mov %cf,%rax
  2567. xor %ebx,%ebx # cf=0, of=0
  2568. mov %rax,16+8(%rsp) # offload carry
  2569. jmp .Lsqrx8x_loop
  2570. .align 32
  2571. .Lsqrx8x_break:
  2572. sub 16+8(%rsp),%r8 # consume last carry
  2573. mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
  2574. mov 0*8($aptr),%rdx # a[8], modulo-scheduled
  2575. xor %ebp,%ebp # xor $zero,$zero
  2576. mov %r8,0*8($tptr)
  2577. cmp $carry,$tptr # cf=0, of=0
  2578. je .Lsqrx8x_outer_loop
  2579. mov %r9,1*8($tptr)
  2580. mov 1*8($carry),%r9
  2581. mov %r10,2*8($tptr)
  2582. mov 2*8($carry),%r10
  2583. mov %r11,3*8($tptr)
  2584. mov 3*8($carry),%r11
  2585. mov %r12,4*8($tptr)
  2586. mov 4*8($carry),%r12
  2587. mov %r13,5*8($tptr)
  2588. mov 5*8($carry),%r13
  2589. mov %r14,6*8($tptr)
  2590. mov 6*8($carry),%r14
  2591. mov %r15,7*8($tptr)
  2592. mov 7*8($carry),%r15
  2593. mov $carry,$tptr
  2594. jmp .Lsqrx8x_outer_loop
  2595. .align 32
  2596. .Lsqrx8x_outer_break:
  2597. mov %r9,9*8($tptr) # t[9]
  2598. movq %xmm3,%rcx # -$num
  2599. mov %r10,10*8($tptr) # ...
  2600. mov %r11,11*8($tptr)
  2601. mov %r12,12*8($tptr)
  2602. mov %r13,13*8($tptr)
  2603. mov %r14,14*8($tptr)
  2604. ___
  2605. } {
  2606. my $i="%rcx";
  2607. $code.=<<___;
  2608. lea 48+8(%rsp),$tptr
  2609. mov ($aptr,$i),%rdx # a[0]
  2610. mov 8($tptr),$A0[1] # t[1]
  2611. xor $A0[0],$A0[0] # t[0], of=0, cf=0
  2612. mov 0+8(%rsp),$num # restore $num
  2613. adox $A0[1],$A0[1]
  2614. mov 16($tptr),$A1[0] # t[2] # prefetch
  2615. mov 24($tptr),$A1[1] # t[3] # prefetch
  2616. #jmp .Lsqrx4x_shift_n_add # happens to be aligned
  2617. .align 32
  2618. .Lsqrx4x_shift_n_add:
  2619. mulx %rdx,%rax,%rbx
  2620. adox $A1[0],$A1[0]
  2621. adcx $A0[0],%rax
  2622. .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch
  2623. .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch
  2624. adox $A1[1],$A1[1]
  2625. adcx $A0[1],%rbx
  2626. mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch
  2627. mov %rax,0($tptr)
  2628. mov %rbx,8($tptr)
  2629. mulx %rdx,%rax,%rbx
  2630. adox $A0[0],$A0[0]
  2631. adcx $A1[0],%rax
  2632. mov 16($aptr,$i),%rdx # a[i+2] # prefetch
  2633. mov 48($tptr),$A1[0] # t[2*i+6] # prefetch
  2634. adox $A0[1],$A0[1]
  2635. adcx $A1[1],%rbx
  2636. mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch
  2637. mov %rax,16($tptr)
  2638. mov %rbx,24($tptr)
  2639. mulx %rdx,%rax,%rbx
  2640. adox $A1[0],$A1[0]
  2641. adcx $A0[0],%rax
  2642. mov 24($aptr,$i),%rdx # a[i+3] # prefetch
  2643. lea 32($i),$i
  2644. mov 64($tptr),$A0[0] # t[2*i+8] # prefetch
  2645. adox $A1[1],$A1[1]
  2646. adcx $A0[1],%rbx
  2647. mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch
  2648. mov %rax,32($tptr)
  2649. mov %rbx,40($tptr)
  2650. mulx %rdx,%rax,%rbx
  2651. adox $A0[0],$A0[0]
  2652. adcx $A1[0],%rax
  2653. jrcxz .Lsqrx4x_shift_n_add_break
  2654. .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch
  2655. adox $A0[1],$A0[1]
  2656. adcx $A1[1],%rbx
  2657. mov 80($tptr),$A1[0] # t[2*i+10] # prefetch
  2658. mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch
  2659. mov %rax,48($tptr)
  2660. mov %rbx,56($tptr)
  2661. lea 64($tptr),$tptr
  2662. nop
  2663. jmp .Lsqrx4x_shift_n_add
  2664. .align 32
  2665. .Lsqrx4x_shift_n_add_break:
  2666. adcx $A1[1],%rbx
  2667. mov %rax,48($tptr)
  2668. mov %rbx,56($tptr)
  2669. lea 64($tptr),$tptr # end of t[] buffer
  2670. ___
  2671. }
  2672. ######################################################################
  2673. # Montgomery reduction part, "word-by-word" algorithm.
  2674. #
  2675. # This new path is inspired by multiple submissions from Intel, by
  2676. # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
  2677. # Vinodh Gopal...
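#
# Unlike the MULQ-based sqr8x_reduction above, this path keeps two
# independent carry chains in flight: MULX leaves the flags untouched,
# ADCX propagates a carry through CF only and ADOX through OF only, so
# the products of n0*a[i] with n[] can be folded into the running sums
# without serialising every addition on a single carry flag.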
  2678. {
  2679. my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
  2680. $code.=<<___;
  2681. movq %xmm2,$nptr
  2682. sqrx8x_reduction:
  2683. xor %eax,%eax # initial top-most carry bit
  2684. mov 32+8(%rsp),%rbx # n0
  2685. mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
  2686. lea -128($nptr,$num,2),%rcx # end of n[]
  2687. #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
  2688. mov %rcx, 0+8(%rsp) # save end of n[]
  2689. mov $tptr,8+8(%rsp) # save end of t[]
  2690. lea 48+8(%rsp),$tptr # initial t[] window
  2691. jmp .Lsqrx8x_reduction_loop
  2692. .align 32
  2693. .Lsqrx8x_reduction_loop:
  2694. mov 8*1($tptr),%r9
  2695. mov 8*2($tptr),%r10
  2696. mov 8*3($tptr),%r11
  2697. mov 8*4($tptr),%r12
  2698. mov %rdx,%r8
  2699. imulq %rbx,%rdx # n0*a[i]
  2700. mov 8*5($tptr),%r13
  2701. mov 8*6($tptr),%r14
  2702. mov 8*7($tptr),%r15
  2703. mov %rax,24+8(%rsp) # store top-most carry bit
  2704. lea 8*8($tptr),$tptr
  2705. xor $carry,$carry # cf=0,of=0
  2706. mov \$-8,%rcx
  2707. jmp .Lsqrx8x_reduce
  2708. .align 32
  2709. .Lsqrx8x_reduce:
  2710. mov %r8, %rbx
  2711. mulx 16*0($nptr),%rax,%r8 # n[0]
  2712. adcx %rbx,%rax # discarded
  2713. adox %r9,%r8
  2714. mulx 16*1($nptr),%rbx,%r9 # n[1]
  2715. adcx %rbx,%r8
  2716. adox %r10,%r9
  2717. mulx 16*2($nptr),%rbx,%r10
  2718. adcx %rbx,%r9
  2719. adox %r11,%r10
  2720. mulx 16*3($nptr),%rbx,%r11
  2721. adcx %rbx,%r10
  2722. adox %r12,%r11
  2723. .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
  2724. mov %rdx,%rax
  2725. mov %r8,%rdx
  2726. adcx %rbx,%r11
  2727. adox %r13,%r12
  2728. mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded
  2729. mov %rax,%rdx
  2730. mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
  2731. mulx 16*5($nptr),%rax,%r13
  2732. adcx %rax,%r12
  2733. adox %r14,%r13
  2734. mulx 16*6($nptr),%rax,%r14
  2735. adcx %rax,%r13
  2736. adox %r15,%r14
  2737. mulx 16*7($nptr),%rax,%r15
  2738. mov %rbx,%rdx
  2739. adcx %rax,%r14
  2740. adox $carry,%r15 # $carry is 0
  2741. adcx $carry,%r15 # cf=0
  2742. .byte 0x67,0x67,0x67
  2743. inc %rcx # of=0
  2744. jnz .Lsqrx8x_reduce
  2745. mov $carry,%rax # xor %rax,%rax
  2746. cmp 0+8(%rsp),$nptr # end of n[]?
  2747. jae .Lsqrx8x_no_tail
  2748. mov 48+8(%rsp),%rdx # pull n0*a[0]
  2749. add 8*0($tptr),%r8
  2750. lea 16*8($nptr),$nptr
  2751. mov \$-8,%rcx
  2752. adcx 8*1($tptr),%r9
  2753. adcx 8*2($tptr),%r10
  2754. adc 8*3($tptr),%r11
  2755. adc 8*4($tptr),%r12
  2756. adc 8*5($tptr),%r13
  2757. adc 8*6($tptr),%r14
  2758. adc 8*7($tptr),%r15
  2759. lea 8*8($tptr),$tptr
  2760. sbb %rax,%rax # top carry
  2761. xor $carry,$carry # of=0, cf=0
  2762. mov %rax,16+8(%rsp)
  2763. jmp .Lsqrx8x_tail
  2764. .align 32
  2765. .Lsqrx8x_tail:
  2766. mov %r8,%rbx
  2767. mulx 16*0($nptr),%rax,%r8
  2768. adcx %rax,%rbx
  2769. adox %r9,%r8
  2770. mulx 16*1($nptr),%rax,%r9
  2771. adcx %rax,%r8
  2772. adox %r10,%r9
  2773. mulx 16*2($nptr),%rax,%r10
  2774. adcx %rax,%r9
  2775. adox %r11,%r10
  2776. mulx 16*3($nptr),%rax,%r11
  2777. adcx %rax,%r10
  2778. adox %r12,%r11
  2779. .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
  2780. adcx %rax,%r11
  2781. adox %r13,%r12
  2782. mulx 16*5($nptr),%rax,%r13
  2783. adcx %rax,%r12
  2784. adox %r14,%r13
  2785. mulx 16*6($nptr),%rax,%r14
  2786. adcx %rax,%r13
  2787. adox %r15,%r14
  2788. mulx 16*7($nptr),%rax,%r15
  2789. mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
  2790. adcx %rax,%r14
  2791. adox $carry,%r15
  2792. mov %rbx,($tptr,%rcx,8) # save result
  2793. mov %r8,%rbx
  2794. adcx $carry,%r15 # cf=0
  2795. inc %rcx # of=0
  2796. jnz .Lsqrx8x_tail
  2797. cmp 0+8(%rsp),$nptr # end of n[]?
  2798. jae .Lsqrx8x_tail_done # break out of loop
  2799. sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
  2800. mov 48+8(%rsp),%rdx # pull n0*a[0]
  2801. lea 16*8($nptr),$nptr
  2802. adc 8*0($tptr),%r8
  2803. adc 8*1($tptr),%r9
  2804. adc 8*2($tptr),%r10
  2805. adc 8*3($tptr),%r11
  2806. adc 8*4($tptr),%r12
  2807. adc 8*5($tptr),%r13
  2808. adc 8*6($tptr),%r14
  2809. adc 8*7($tptr),%r15
  2810. lea 8*8($tptr),$tptr
  2811. sbb %rax,%rax
  2812. sub \$8,%rcx # mov \$-8,%rcx
  2813. xor $carry,$carry # of=0, cf=0
  2814. mov %rax,16+8(%rsp)
  2815. jmp .Lsqrx8x_tail
  2816. .align 32
  2817. .Lsqrx8x_tail_done:
  2818. add 24+8(%rsp),%r8 # can this overflow?
  2819. adc \$0,%r9
  2820. adc \$0,%r10
  2821. adc \$0,%r11
  2822. adc \$0,%r12
  2823. adc \$0,%r13
  2824. adc \$0,%r14
  2825. adc \$0,%r15 # can't overflow, because we
  2826. # started with "overhung" part
  2827. # of multiplication
  2828. mov $carry,%rax # xor %rax,%rax
  2829. sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
  2830. .Lsqrx8x_no_tail: # %cf is 0 if jumped here
  2831. adc 8*0($tptr),%r8
  2832. movq %xmm3,%rcx
  2833. adc 8*1($tptr),%r9
  2834. mov 16*7($nptr),$carry
  2835. movq %xmm2,$nptr # restore $nptr
  2836. adc 8*2($tptr),%r10
  2837. adc 8*3($tptr),%r11
  2838. adc 8*4($tptr),%r12
  2839. adc 8*5($tptr),%r13
  2840. adc 8*6($tptr),%r14
  2841. adc 8*7($tptr),%r15
  2842. adc %rax,%rax # top-most carry
  2843. mov 32+8(%rsp),%rbx # n0
  2844. mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
  2845. mov %r8,8*0($tptr) # store top 512 bits
  2846. lea 8*8($tptr),%r8 # borrow %r8
  2847. mov %r9,8*1($tptr)
  2848. mov %r10,8*2($tptr)
  2849. mov %r11,8*3($tptr)
  2850. mov %r12,8*4($tptr)
  2851. mov %r13,8*5($tptr)
  2852. mov %r14,8*6($tptr)
  2853. mov %r15,8*7($tptr)
  2854. lea 8*8($tptr,%rcx),$tptr # start of current t[] window
  2855. cmp 8+8(%rsp),%r8 # end of t[]?
  2856. jb .Lsqrx8x_reduction_loop
  2857. ___
  2858. }
  2859. ##############################################################
  2860. # Post-condition, 4x unrolled
  2861. #
  2862. {
  2863. my ($rptr,$nptr)=("%rdx","%rbp");
  2864. my @ri=map("%r$_",(10..13));
  2865. my @ni=map("%r$_",(14..15));
  2866. $code.=<<___;
  2867. xor %ebx,%ebx
  2868. sub %r15,%rsi # compare top-most words
  2869. adc %rbx,%rbx
  2870. mov %rcx,%r10 # -$num
  2871. or %rbx,%rax
  2872. mov %rcx,%r9 # -$num
  2873. xor \$1,%rax
  2874. sar \$3+2,%rcx # cf=0
  2875. #lea 48+8(%rsp,%r9),$tptr
  2876. lea ($nptr,%rax,8),$nptr
  2877. movq %xmm1,$rptr # restore $rptr
  2878. movq %xmm1,$aptr # prepare for back-to-back call
  2879. jmp .Lsqrx4x_sub
  2880. .align 32
  2881. .Lsqrx4x_sub:
  2882. .byte 0x66
  2883. mov 8*0($tptr),%r12
  2884. mov 8*1($tptr),%r13
  2885. sbb 16*0($nptr),%r12
  2886. mov 8*2($tptr),%r14
  2887. sbb 16*1($nptr),%r13
  2888. mov 8*3($tptr),%r15
  2889. lea 8*4($tptr),$tptr
  2890. sbb 16*2($nptr),%r14
  2891. mov %r12,8*0($rptr)
  2892. sbb 16*3($nptr),%r15
  2893. lea 16*4($nptr),$nptr
  2894. mov %r13,8*1($rptr)
  2895. mov %r14,8*2($rptr)
  2896. mov %r15,8*3($rptr)
  2897. lea 8*4($rptr),$rptr
  2898. inc %rcx
  2899. jnz .Lsqrx4x_sub
  2900. ___
  2901. }
  2902. $code.=<<___;
  2903. neg %r9 # restore $num
  2904. ret
  2905. .size bn_sqrx8x_internal,.-bn_sqrx8x_internal
  2906. ___
  2907. }}}
  2908. {
  2909. my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
  2910. ("%rdi","%esi","%rdx","%ecx"); # Unix order
  2911. my $out=$inp;
  2912. my $STRIDE=2**5*8;
  2913. my $N=$STRIDE/4;
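#
# Hedged Perl models of the helpers generated below (illustrative only,
# never called; names are ad hoc). bn_get_bits5 extracts the 5-bit window
# of the exponent starting at a given bit offset, bn_scatter5 lays limb j
# of table entry idx out at tbl[32*j+idx], and bn_gather5 reads it back:
sub _get_bits5_model {
    my ($e,$bit)=@_;                            # $e: exponent, Math::BigInt
    return $e->copy()->brsft($bit)->band(Math::BigInt->new(31))->numify();
}
sub _gather5_model {
    my ($tbl,$num,$idx)=@_;                     # $tbl: flat array of qwords
    return [ map { $tbl->[32*$_+$idx] } (0..$num-1) ];
}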
  2914. $code.=<<___;
  2915. .globl bn_get_bits5
  2916. .type bn_get_bits5,\@abi-omnipotent
  2917. .align 16
  2918. bn_get_bits5:
  2919. lea 0($inp),%r10
  2920. lea 1($inp),%r11
  2921. mov $num,%ecx
  2922. shr \$4,$num
  2923. and \$15,%ecx
  2924. lea -8(%ecx),%eax
  2925. cmp \$11,%ecx
  2926. cmova %r11,%r10
  2927. cmova %eax,%ecx
  2928. movzw (%r10,$num,2),%eax
  2929. shrl %cl,%eax
  2930. and \$31,%eax
  2931. ret
  2932. .size bn_get_bits5,.-bn_get_bits5
  2933. .globl bn_scatter5
  2934. .type bn_scatter5,\@abi-omnipotent
  2935. .align 16
  2936. bn_scatter5:
  2937. cmp \$0, $num
  2938. jz .Lscatter_epilogue
  2939. lea ($tbl,$idx,8),$tbl
  2940. .Lscatter:
  2941. mov ($inp),%rax
  2942. lea 8($inp),$inp
  2943. mov %rax,($tbl)
  2944. lea 32*8($tbl),$tbl
  2945. sub \$1,$num
  2946. jnz .Lscatter
  2947. .Lscatter_epilogue:
  2948. ret
  2949. .size bn_scatter5,.-bn_scatter5
  2950. .globl bn_gather5
  2951. .type bn_gather5,\@abi-omnipotent
  2952. .align 16
  2953. bn_gather5:
  2954. ___
  2955. $code.=<<___ if ($win64);
  2956. .LSEH_begin_bn_gather5:
  2957. # I can't trust assembler to use specific encoding:-(
  2958. .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
  2959. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
.byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
  2961. ___
  2962. $code.=<<___;
  2963. mov $idx,%r11d
  2964. shr \$`log($N/8)/log(2)`,$idx
  2965. and \$`$N/8-1`,%r11
  2966. not $idx
  2967. lea .Lmagic_masks(%rip),%rax
  2968. and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
  2969. lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
  2970. movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
  2971. movq 8(%rax,$idx,8),%xmm5 # cache line contains element
  2972. movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument
  2973. movq 24(%rax,$idx,8),%xmm7
  2974. jmp .Lgather
  2975. .align 16
  2976. .Lgather:
  2977. movq `0*$STRIDE/4-128`($tbl),%xmm0
  2978. movq `1*$STRIDE/4-128`($tbl),%xmm1
  2979. pand %xmm4,%xmm0
  2980. movq `2*$STRIDE/4-128`($tbl),%xmm2
  2981. pand %xmm5,%xmm1
  2982. movq `3*$STRIDE/4-128`($tbl),%xmm3
  2983. pand %xmm6,%xmm2
  2984. por %xmm1,%xmm0
  2985. pand %xmm7,%xmm3
  2986. .byte 0x67,0x67
  2987. por %xmm2,%xmm0
  2988. lea $STRIDE($tbl),$tbl
  2989. por %xmm3,%xmm0
  2990. movq %xmm0,($out) # m0=bp[0]
  2991. lea 8($out),$out
  2992. sub \$1,$num
  2993. jnz .Lgather
  2994. ___
  2995. $code.=<<___ if ($win64);
  2996. movaps (%rsp),%xmm6
  2997. movaps 0x10(%rsp),%xmm7
  2998. lea 0x28(%rsp),%rsp
  2999. ___
  3000. $code.=<<___;
  3001. ret
  3002. .LSEH_end_bn_gather5:
  3003. .size bn_gather5,.-bn_gather5
  3004. ___
  3005. }
  3006. $code.=<<___;
  3007. .align 64
  3008. .Lmagic_masks:
  3009. .long 0,0, 0,0, 0,0, -1,-1
  3010. .long 0,0, 0,0, 0,0, 0,0
  3011. .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  3012. ___
  3013. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  3014. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  3015. if ($win64) {
  3016. $rec="%rcx";
  3017. $frame="%rdx";
  3018. $context="%r8";
  3019. $disp="%r9";
  3020. $code.=<<___;
  3021. .extern __imp_RtlVirtualUnwind
  3022. .type mul_handler,\@abi-omnipotent
  3023. .align 16
  3024. mul_handler:
  3025. push %rsi
  3026. push %rdi
  3027. push %rbx
  3028. push %rbp
  3029. push %r12
  3030. push %r13
  3031. push %r14
  3032. push %r15
  3033. pushfq
  3034. sub \$64,%rsp
  3035. mov 120($context),%rax # pull context->Rax
  3036. mov 248($context),%rbx # pull context->Rip
  3037. mov 8($disp),%rsi # disp->ImageBase
  3038. mov 56($disp),%r11 # disp->HandlerData
  3039. mov 0(%r11),%r10d # HandlerData[0]
  3040. lea (%rsi,%r10),%r10 # end of prologue label
  3041. cmp %r10,%rbx # context->Rip<end of prologue label
  3042. jb .Lcommon_seh_tail
  3043. mov 152($context),%rax # pull context->Rsp
  3044. mov 4(%r11),%r10d # HandlerData[1]
  3045. lea (%rsi,%r10),%r10 # epilogue label
  3046. cmp %r10,%rbx # context->Rip>=epilogue label
  3047. jae .Lcommon_seh_tail
  3048. lea .Lmul_epilogue(%rip),%r10
  3049. cmp %r10,%rbx
  3050. jb .Lbody_40
  3051. mov 192($context),%r10 # pull $num
  3052. mov 8(%rax,%r10,8),%rax # pull saved stack pointer
  3053. jmp .Lbody_proceed
  3054. .Lbody_40:
  3055. mov 40(%rax),%rax # pull saved stack pointer
  3056. .Lbody_proceed:
  3057. movaps -88(%rax),%xmm0
  3058. movaps -72(%rax),%xmm1
  3059. mov -8(%rax),%rbx
  3060. mov -16(%rax),%rbp
  3061. mov -24(%rax),%r12
  3062. mov -32(%rax),%r13
  3063. mov -40(%rax),%r14
  3064. mov -48(%rax),%r15
  3065. mov %rbx,144($context) # restore context->Rbx
  3066. mov %rbp,160($context) # restore context->Rbp
  3067. mov %r12,216($context) # restore context->R12
  3068. mov %r13,224($context) # restore context->R13
  3069. mov %r14,232($context) # restore context->R14
  3070. mov %r15,240($context) # restore context->R15
  3071. movups %xmm0,512($context) # restore context->Xmm6
  3072. movups %xmm1,528($context) # restore context->Xmm7
  3073. .Lcommon_seh_tail:
  3074. mov 8(%rax),%rdi
  3075. mov 16(%rax),%rsi
  3076. mov %rax,152($context) # restore context->Rsp
  3077. mov %rsi,168($context) # restore context->Rsi
  3078. mov %rdi,176($context) # restore context->Rdi
  3079. mov 40($disp),%rdi # disp->ContextRecord
  3080. mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)/8, count in quadwords
  3082. .long 0xa548f3fc # cld; rep movsq
  3083. mov $disp,%rsi
  3084. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  3085. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  3086. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  3087. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  3088. mov 40(%rsi),%r10 # disp->ContextRecord
  3089. lea 56(%rsi),%r11 # &disp->HandlerData
  3090. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  3091. mov %r10,32(%rsp) # arg5
  3092. mov %r11,40(%rsp) # arg6
  3093. mov %r12,48(%rsp) # arg7
  3094. mov %rcx,56(%rsp) # arg8, (NULL)
  3095. call *__imp_RtlVirtualUnwind(%rip)
  3096. mov \$1,%eax # ExceptionContinueSearch
  3097. add \$64,%rsp
  3098. popfq
  3099. pop %r15
  3100. pop %r14
  3101. pop %r13
  3102. pop %r12
  3103. pop %rbp
  3104. pop %rbx
  3105. pop %rdi
  3106. pop %rsi
  3107. ret
  3108. .size mul_handler,.-mul_handler
  3109. .section .pdata
  3110. .align 4
  3111. .rva .LSEH_begin_bn_mul_mont_gather5
  3112. .rva .LSEH_end_bn_mul_mont_gather5
  3113. .rva .LSEH_info_bn_mul_mont_gather5
  3114. .rva .LSEH_begin_bn_mul4x_mont_gather5
  3115. .rva .LSEH_end_bn_mul4x_mont_gather5
  3116. .rva .LSEH_info_bn_mul4x_mont_gather5
  3117. .rva .LSEH_begin_bn_power5
  3118. .rva .LSEH_end_bn_power5
  3119. .rva .LSEH_info_bn_power5
  3120. .rva .LSEH_begin_bn_from_mont8x
  3121. .rva .LSEH_end_bn_from_mont8x
  3122. .rva .LSEH_info_bn_from_mont8x
  3123. ___
  3124. $code.=<<___ if ($addx);
  3125. .rva .LSEH_begin_bn_mulx4x_mont_gather5
  3126. .rva .LSEH_end_bn_mulx4x_mont_gather5
  3127. .rva .LSEH_info_bn_mulx4x_mont_gather5
  3128. .rva .LSEH_begin_bn_powerx5
  3129. .rva .LSEH_end_bn_powerx5
  3130. .rva .LSEH_info_bn_powerx5
  3131. ___
  3132. $code.=<<___;
  3133. .rva .LSEH_begin_bn_gather5
  3134. .rva .LSEH_end_bn_gather5
  3135. .rva .LSEH_info_bn_gather5
  3136. .section .xdata
  3137. .align 8
  3138. .LSEH_info_bn_mul_mont_gather5:
  3139. .byte 9,0,0,0
  3140. .rva mul_handler
  3141. .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
  3142. .align 8
  3143. .LSEH_info_bn_mul4x_mont_gather5:
  3144. .byte 9,0,0,0
  3145. .rva mul_handler
  3146. .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
  3147. .align 8
  3148. .LSEH_info_bn_power5:
  3149. .byte 9,0,0,0
  3150. .rva mul_handler
  3151. .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
  3152. .align 8
  3153. .LSEH_info_bn_from_mont8x:
  3154. .byte 9,0,0,0
  3155. .rva mul_handler
  3156. .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
  3157. ___
  3158. $code.=<<___ if ($addx);
  3159. .align 8
  3160. .LSEH_info_bn_mulx4x_mont_gather5:
  3161. .byte 9,0,0,0
  3162. .rva mul_handler
  3163. .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
  3164. .align 8
  3165. .LSEH_info_bn_powerx5:
  3166. .byte 9,0,0,0
  3167. .rva mul_handler
  3168. .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
  3169. ___
  3170. $code.=<<___;
  3171. .align 8
  3172. .LSEH_info_bn_gather5:
  3173. .byte 0x01,0x0d,0x05,0x00
  3174. .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  3175. .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
  3176. .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
  3177. .align 8
  3178. ___
  3179. }
  3180. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  3181. print $code;
  3182. close STDOUT;