#! /usr/bin/env perl
# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#			IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4			4.46/+120%	-
# Core 2		2.41/+90%	-
# Westmere		1.88/+120%	-
# Sandy Bridge		1.39/+140%	1.10
# Haswell		1.14/+175%	1.11		0.65
# Skylake[-X]		1.13/+120%	0.96		0.51	[0.35]
# Silvermont		2.83/+95%	-
# Knights L		3.60/?		1.65		1.10	0.41(***)
# Goldmont		1.70/+180%	-
# VIA Nano		1.82/+150%	-
# Sledgehammer		1.38/+160%	-
# Bulldozer		2.30/+130%	0.97
# Ryzen			1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors; in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, by 50-30% (the gain shrinking on newer models),
#	while being slower on contemporary ones, for example almost 2x
#	slower on Atom; as the former are naturally disappearing, SSE2
#	was deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core;
#	the listed result is the best case;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
	$avx += 2 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}
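
# For reference: the sequence above computes one multiply-and-reduce step
# h = (h * r) mod 2^130-5, with h held in $h0-$h2 (two 64-bit limbs plus
# a few top bits) and r pre-clamped by poly1305_init. A minimal Perl model
# of the same step, assuming Math::BigInt (illustrative only, never called
# by this module):
sub _poly1305_iteration_model {
	use Math::BigInt;
	my ($h, $r) = @_;	# Math::BigInt values: h < 2^131, r clamped
	my $p = Math::BigInt->new(1)->blsft(130)->bsub(5);	# 2^130 - 5
	return $h->copy()->bmul($r)->bmod($p);	# h *= r (mod 2^130-5)
}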
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64
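#
# In terms of the byte offsets used below: h[0] at 0($ctx), h[1] at
# 8($ctx), h[2] at 16($ctx), r[0] at 24($ctx), r[1] at 32($ctx).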
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,\@function,3
.align	32
poly1305_init:
.cfi_startproc
	xor	%rax,%rax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key

	lea	poly1305_blocks(%rip),%r10
	lea	poly1305_emit(%rip),%r11
___
$code.=<<___ if ($avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___ if ($avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___ if ($avx>3 && !$win64);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___ if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___ if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	ret
.cfi_endproc
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,\@function,4
.align	32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
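	# (s1 = 5*r1/4 is exact because r1 has its low 2 bits clamped to
	# zero; it folds products of weight 2^128 back into range, as
	# 2^128 = 5/4 modulo 2^130-5)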
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbp
.cfi_restore	%rbp
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	ret
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,\@function,3
.align	32
poly1305_emit:
.cfi_startproc
.Lemit:
	mov	0($ctx),%r8		# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8			# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10		# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax		# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)		# write result
	mov	%rcx,8($mac)

	ret
.cfi_endproc
.size	poly1305_emit,.-poly1305_emit
___
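
# For reference: poly1305_emit computes tag = ((h mod 2^130-5) + nonce)
# mod 2^128 without branching; h+5 is selected iff the addition carried
# out of bit 129, i.e. h >= 2^130-5. A Perl model, assuming Math::BigInt
# (illustrative only, never called by this module):
sub _poly1305_emit_model {
	use Math::BigInt;
	my ($h, $nonce) = @_;	# Math::BigInt values
	my $p    = Math::BigInt->new(1)->blsft(130)->bsub(5);	# 2^130 - 5
	my $mask = Math::BigInt->new(1)->blsft(128)->bsub(1);	# 2^128 - 1
	return $h->copy()->bmod($p)->badd($nonce)->band($mask);
}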
if ($avx) {
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
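#
# For reference, the base-2^26 digit split stored in that table (a sketch
# assuming Math::BigInt; illustrative only, never called by this module):
sub _base2_26_digits_model {
	use Math::BigInt;
	my ($v) = @_;		# Math::BigInt value, up to 130 bits
	my $mask = Math::BigInt->new(0x3ffffff);	# 2^26 - 1
	return map { $v->copy()->brsft(26*$_)->band($mask)->numify() } (0..4);
}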
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

$code.=<<___;
.type	__poly1305_block,\@abi-omnipotent
.align	32
__poly1305_block:
.cfi_startproc
___
	&poly1305_iteration();
$code.=<<___;
	ret
.cfi_endproc
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,\@abi-omnipotent
.align	32
__poly1305_init_avx:
.cfi_startproc
	mov	$r0,$h0
	mov	$r1,$h1
	xor	$h2,$h2

	lea	48+64($ctx),$ctx	# size optimization

	mov	$r1,%rax
	call	__poly1305_block	# r^2

	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	\$0x3ffffff,%edx
	mov	$h0,$d1
	and	$h0#d,%eax
	mov	$r0,$d2
	and	$r0#d,%edx
	mov	%eax,`16*0+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*0+4-64`($ctx)
	shr	\$26,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*2+4-64`($ctx)
	shr	\$26,$d2

	mov	$h1,%rax
	mov	$r1,%rdx
	shl	\$12,%rax
	shl	\$12,%rdx
	or	$d1,%rax
	or	$d2,%rdx
	and	\$0x3ffffff,%eax
	and	\$0x3ffffff,%edx
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	$h1,$d1
	mov	%edx,`16*4+4-64`($ctx)
	mov	$r1,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	shr	\$14,$d2
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*6+4-64`($ctx)
	shr	\$26,$d2

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^3

	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+12-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+12-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+12-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+12-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+12-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+12-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+12-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^4

	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
	mov	$h0,$d1
	and	$h0#d,%eax
	shr	\$26,$d1
	mov	%eax,`16*0+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	and	$d1#d,%edx
	mov	%edx,`16*1+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*2+8-64`($ctx)

	mov	$h1,%rax
	shl	\$12,%rax
	or	$d1,%rax
	and	\$0x3ffffff,%eax
	mov	%eax,`16*3+8-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	$h1,$d1
	mov	%eax,`16*4+8-64`($ctx)

	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	and	$d1#d,%edx
	mov	%edx,`16*5+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	shr	\$26,$d1
	mov	%edx,`16*6+8-64`($ctx)

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+8-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+8-64`($ctx)

	lea	-48-64($ctx),$ctx	# size [de-]optimization
	ret
.cfi_endproc
.size	__poly1305_init_avx,.-__poly1305_init_avx

.type	poly1305_blocks_avx,\@function,4
.align	32
poly1305_blocks_avx:
.cfi_startproc
	mov	20($ctx),%r8d		# is_base2_26
	cmp	\$128,$len
	jae	.Lblocks_avx
	test	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx:
	and	\$-16,$len
	jz	.Lno_data_avx

	vzeroupper

	test	%r8d,%r8d
	jz	.Lbase2_64_avx

	test	\$31,$len
	jz	.Leven_avx

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_avx_body:

	mov	$len,%r15		# reassign $len

	mov	0($ctx),$d1		# load hash value
	mov	8($ctx),$d2
	mov	16($ctx),$h2#d

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	################################# base 2^26 -> base 2^64
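	# i.e. h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104,
	# reassembled below into two 64-bit limbs (h0,h1) plus top bits in h2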
	mov	$d1#d,$h0#d
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	mov	$d2#d,$h1#d
	and	\$`-1*(1<<31)`,$d2

	shr	\$6,$d1
	shl	\$52,$r1
	add	$d1,$h0
	shr	\$12,$h1
	shr	\$18,$d2
	add	$r1,$h0
	adc	$d2,$h1

	mov	$h2,$d1
	shl	\$40,$d1
	shr	\$24,$h2
	add	$d1,$h1
	adc	\$0,$h2			# can be partially reduced...

	mov	\$-4,$d2		# ... so reduce
	mov	$h2,$d1
	and	$h2,$d2
	shr	\$2,$d1
	and	\$3,$h2
	add	$d2,$d1			# =*5
	add	$d1,$h0
	adc	\$0,$h1
	adc	\$0,$h2

	mov	$s1,$r1
	mov	$s1,%rax
	shr	\$2,$s1
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2

	call	__poly1305_block

	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format

	################################# base 2^64 -> base 2^26
	mov	$h0,%rax
	mov	$h0,%rdx
	shr	\$52,$h0
	mov	$h1,$r0
	mov	$h1,$r1
	shr	\$26,%rdx
	and	\$0x3ffffff,%rax	# h[0]
	shl	\$12,$r0
	and	\$0x3ffffff,%rdx	# h[1]
	shr	\$14,$h1
	or	$r0,$h0
	shl	\$24,$h2
	and	\$0x3ffffff,$h0		# h[2]
	shr	\$40,$r1
	and	\$0x3ffffff,$h1		# h[3]
	or	$r1,$h2			# h[4]

	sub	\$16,%r15
	jz	.Lstore_base2_26_avx

	vmovd	%rax#d,$H0
	vmovd	%rdx#d,$H1
	vmovd	$h0#d,$H2
	vmovd	$h1#d,$H3
	vmovd	$h2#d,$H4
	jmp	.Lproceed_avx

.align	32
.Lstore_base2_64_avx:
	mov	$h0,0($ctx)
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
	jmp	.Ldone_avx

.align	16
.Lstore_base2_26_avx:
	mov	%rax#d,0($ctx)		# store hash value base 2^26
	mov	%rdx#d,4($ctx)
	mov	$h0#d,8($ctx)
	mov	$h1#d,12($ctx)
	mov	$h2#d,16($ctx)
.align	16
.Ldone_avx:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbp
.cfi_restore	%rbp
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	ret
.cfi_endproc

.align	32
.Lbase2_64_avx:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lbase2_64_avx_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2#d

	mov	$s1,$r1
	mov	$s1,%rax
	shr	\$2,$s1
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

	test	\$31,$len
	jz	.Linit_avx

	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
	sub	\$16,%r15

	call	__poly1305_block

.Linit_avx:
	################################# base 2^64 -> base 2^26
	mov	$h0,%rax
	mov	$h0,%rdx
	shr	\$52,$h0
	mov	$h1,$d1
	mov	$h1,$d2
	shr	\$26,%rdx
	and	\$0x3ffffff,%rax	# h[0]
	shl	\$12,$d1
	and	\$0x3ffffff,%rdx	# h[1]
	shr	\$14,$h1
	or	$d1,$h0
	shl	\$24,$h2
	and	\$0x3ffffff,$h0		# h[2]
	shr	\$40,$d2
	and	\$0x3ffffff,$h1		# h[3]
	or	$d2,$h2			# h[4]

	vmovd	%rax#d,$H0
	vmovd	%rdx#d,$H1
	vmovd	$h0#d,$H2
	vmovd	$h1#d,$H3
	vmovd	$h2#d,$H4
	movl	\$1,20($ctx)		# set is_base2_26

	call	__poly1305_init_avx

.Lproceed_avx:
	mov	%r15,$len

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbp
.cfi_restore	%rbp
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rax
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx_epilogue:
	jmp	.Ldo_avx
.cfi_endproc

.align	32
.Leven_avx:
.cfi_startproc
	vmovd	4*0($ctx),$H0		# load hash value
	vmovd	4*1($ctx),$H1
	vmovd	4*2($ctx),$H2
	vmovd	4*3($ctx),$H3
	vmovd	4*4($ctx),$H4

.Ldo_avx:
___
$code.=<<___ if (!$win64);
	lea	-0x58(%rsp),%r11
.cfi_def_cfa	%r11,0x60
	sub	\$0x178,%rsp
___
$code.=<<___ if ($win64);
	lea	-0xf8(%rsp),%r11
	sub	\$0x218,%rsp
	vmovdqa	%xmm6,0x50(%r11)
	vmovdqa	%xmm7,0x60(%r11)
	vmovdqa	%xmm8,0x70(%r11)
	vmovdqa	%xmm9,0x80(%r11)
	vmovdqa	%xmm10,0x90(%r11)
	vmovdqa	%xmm11,0xa0(%r11)
	vmovdqa	%xmm12,0xb0(%r11)
	vmovdqa	%xmm13,0xc0(%r11)
	vmovdqa	%xmm14,0xd0(%r11)
	vmovdqa	%xmm15,0xe0(%r11)
.Ldo_avx_body:
___
$code.=<<___;
	sub	\$64,$len
	lea	-32($inp),%rax
	cmovc	%rax,$inp

	vmovdqu	`16*3`($ctx),$D4	# preload r0^2
	lea	`16*3+64`($ctx),$ctx	# size optimization
	lea	.Lconst(%rip),%rcx

	################################################################
	# load input
	vmovdqu	16*2($inp),$T0
	vmovdqu	16*3($inp),$T1
	vmovdqa	64(%rcx),$MASK		# .Lmask26

	vpsrldq	\$6,$T0,$T2		# splat input
	vpsrldq	\$6,$T1,$T3
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpunpcklqdq	$T3,$T2,$T3	# 2:3

	vpsrlq	\$40,$T4,$T4		# 4
	vpsrlq	\$26,$T0,$T1
	vpand	$MASK,$T0,$T0		# 0
	vpsrlq	\$4,$T3,$T2
	vpand	$MASK,$T1,$T1		# 1
	vpsrlq	\$30,$T3,$T3
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always

	jbe	.Lskip_loop_avx

	# expand and copy pre-calculated table to stack
	vmovdqu	`16*1-64`($ctx),$D1
	vmovdqu	`16*2-64`($ctx),$D2
	vpshufd	\$0xEE,$D4,$D3		# 34xx -> 3434
	vpshufd	\$0x44,$D4,$D0		# xx12 -> 1212
	vmovdqa	$D3,-0x90(%r11)
	vmovdqa	$D0,0x00(%rsp)
	vpshufd	\$0xEE,$D1,$D4
	vmovdqu	`16*3-64`($ctx),$D0
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D4,-0x80(%r11)
	vmovdqa	$D1,0x10(%rsp)
	vpshufd	\$0xEE,$D2,$D3
	vmovdqu	`16*4-64`($ctx),$D1
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D3,-0x70(%r11)
	vmovdqa	$D2,0x20(%rsp)
	vpshufd	\$0xEE,$D0,$D4
	vmovdqu	`16*5-64`($ctx),$D2
	vpshufd	\$0x44,$D0,$D0
	vmovdqa	$D4,-0x60(%r11)
	vmovdqa	$D0,0x30(%rsp)
	vpshufd	\$0xEE,$D1,$D3
	vmovdqu	`16*6-64`($ctx),$D0
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D3,-0x50(%r11)
	vmovdqa	$D1,0x40(%rsp)
	vpshufd	\$0xEE,$D2,$D4
	vmovdqu	`16*7-64`($ctx),$D1
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D4,-0x40(%r11)
	vmovdqa	$D2,0x50(%rsp)
	vpshufd	\$0xEE,$D0,$D3
	vmovdqu	`16*8-64`($ctx),$D2
	vpshufd	\$0x44,$D0,$D0
	vmovdqa	$D3,-0x30(%r11)
	vmovdqa	$D0,0x60(%rsp)
	vpshufd	\$0xEE,$D1,$D4
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D4,-0x20(%r11)
	vmovdqa	$D1,0x70(%rsp)
	vpshufd	\$0xEE,$D2,$D3
	vmovdqa	0x00(%rsp),$D4		# preload r0^2
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D3,-0x10(%r11)
	vmovdqa	$D2,0x80(%rsp)

	jmp	.Loop_avx

.align	32
.Loop_avx:
	################################################################
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	#   \___________________/
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	#   \___________________/ \____________________/
	#
	# Note that we start with inp[2:3]*r^2. This is because it
	# doesn't depend on reduction in previous iteration.
	################################################################
	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	#
	# though note that $Tx and $Hx are "reversed" in this section,
	# and $D4 is preloaded with r0^2...
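	#
	# (the 5*r_i multiples appear because digit products of weight
	# 2^130 and above wrap around with factor 2^130 mod (2^130-5) = 5)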
	vpmuludq	$T0,$D4,$D0	# d0 = h0*r0
	vpmuludq	$T1,$D4,$D1	# d1 = h1*r0
	vmovdqa	$H2,0x20(%r11)		# offload hash
	vpmuludq	$T2,$D4,$D2	# d2 = h2*r0
	vmovdqa	0x10(%rsp),$H2		# r1^2
	vpmuludq	$T3,$D4,$D3	# d3 = h3*r0
	vpmuludq	$T4,$D4,$D4	# d4 = h4*r0

	vmovdqa	$H0,0x00(%r11)
	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
	vmovdqa	$H1,0x10(%r11)
	vpmuludq	$T3,$H2,$H1	# h3*r1
	vpaddq	$H0,$D0,$D0		# d0 += h4*s1
	vpaddq	$H1,$D4,$D4		# d4 += h3*r1
	vmovdqa	$H3,0x30(%r11)
	vpmuludq	$T2,$H2,$H0	# h2*r1
	vpmuludq	$T1,$H2,$H1	# h1*r1
	vpaddq	$H0,$D3,$D3		# d3 += h2*r1
	vmovdqa	0x30(%rsp),$H3		# r2^2
	vpaddq	$H1,$D2,$D2		# d2 += h1*r1
	vmovdqa	$H4,0x40(%r11)
	vpmuludq	$T0,$H2,$H2	# h0*r1
	vpmuludq	$T2,$H3,$H0	# h2*r2
	vpaddq	$H2,$D1,$D1		# d1 += h0*r1

	vmovdqa	0x40(%rsp),$H4		# s2^2
	vpaddq	$H0,$D4,$D4		# d4 += h2*r2
	vpmuludq	$T1,$H3,$H1	# h1*r2
	vpmuludq	$T0,$H3,$H3	# h0*r2
	vpaddq	$H1,$D3,$D3		# d3 += h1*r2
	vmovdqa	0x50(%rsp),$H2		# r3^2
	vpaddq	$H3,$D2,$D2		# d2 += h0*r2
	vpmuludq	$T4,$H4,$H0	# h4*s2
	vpmuludq	$T3,$H4,$H4	# h3*s2
	vpaddq	$H0,$D1,$D1		# d1 += h4*s2
	vmovdqa	0x60(%rsp),$H3		# s3^2
	vpaddq	$H4,$D0,$D0		# d0 += h3*s2

	vmovdqa	0x80(%rsp),$H4		# s4^2
	vpmuludq	$T1,$H2,$H1	# h1*r3
	vpmuludq	$T0,$H2,$H2	# h0*r3
	vpaddq	$H1,$D4,$D4		# d4 += h1*r3
	vpaddq	$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$T4,$H3,$H0	# h4*s3
	vpmuludq	$T3,$H3,$H1	# h3*s3
	vpaddq	$H0,$D2,$D2		# d2 += h4*s3
	vmovdqu	16*0($inp),$H0		# load input
	vpaddq	$H1,$D1,$D1		# d1 += h3*s3
	vpmuludq	$T2,$H3,$H3	# h2*s3
	vpmuludq	$T2,$H4,$T2	# h2*s4
	vpaddq	$H3,$D0,$D0		# d0 += h2*s3

	vmovdqu	16*1($inp),$H1
	vpaddq	$T2,$D1,$D1		# d1 += h2*s4
	vpmuludq	$T3,$H4,$T3	# h3*s4
	vpmuludq	$T4,$H4,$T4	# h4*s4
	vpsrldq	\$6,$H0,$H2		# splat input
	vpaddq	$T3,$D2,$D2		# d2 += h3*s4
	vpaddq	$T4,$D3,$D3		# d3 += h4*s4
	vpsrldq	\$6,$H1,$H3
	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
	vpmuludq	$T1,$H4,$T0	# h1*s4
	vpunpckhqdq	$H1,$H0,$H4	# 4
	vpaddq	$T4,$D4,$D4		# d4 += h0*r4
	vmovdqa	-0x90(%r11),$T4		# r0^4
	vpaddq	$T0,$D0,$D0		# d0 += h1*s4

	vpunpcklqdq	$H1,$H0,$H0	# 0:1
	vpunpcklqdq	$H3,$H2,$H3	# 2:3

	#vpsrlq	\$40,$H4,$H4		# 4
	vpsrldq	\$`40/8`,$H4,$H4	# 4
	vpsrlq	\$26,$H0,$H1
	vpand	$MASK,$H0,$H0		# 0
	vpsrlq	\$4,$H3,$H2
	vpand	$MASK,$H1,$H1		# 1
	vpand	0(%rcx),$H4,$H4		# .Lmask24
	vpsrlq	\$30,$H3,$H3
	vpand	$MASK,$H2,$H2		# 2
	vpand	$MASK,$H3,$H3		# 3
	vpor	32(%rcx),$H4,$H4	# padbit, yes, always

	vpaddq	0x00(%r11),$H0,$H0	# add hash value
	vpaddq	0x10(%r11),$H1,$H1
	vpaddq	0x20(%r11),$H2,$H2
	vpaddq	0x30(%r11),$H3,$H3
	vpaddq	0x40(%r11),$H4,$H4

	lea	16*2($inp),%rax
	lea	16*4($inp),$inp
	sub	\$64,$len
	cmovc	%rax,$inp

	################################################################
	# Now we accumulate (inp[0:1]+hash)*r^4
	################################################################
	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	vpmuludq	$H0,$T4,$T0	# h0*r0
	vpmuludq	$H1,$T4,$T1	# h1*r0
	vpaddq	$T0,$D0,$D0
	vpaddq	$T1,$D1,$D1
	vmovdqa	-0x80(%r11),$T2		# r1^4
	vpmuludq	$H2,$T4,$T0	# h2*r0
	vpmuludq	$H3,$T4,$T1	# h3*r0
	vpaddq	$T0,$D2,$D2
	vpaddq	$T1,$D3,$D3
	vpmuludq	$H4,$T4,$T4	# h4*r0
	vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
	vpaddq	$T4,$D4,$D4

	vpaddq	$T0,$D0,$D0		# d0 += h4*s1
	vpmuludq	$H2,$T2,$T1	# h2*r1
	vpmuludq	$H3,$T2,$T0	# h3*r1
	vpaddq	$T1,$D3,$D3		# d3 += h2*r1
	vmovdqa	-0x60(%r11),$T3		# r2^4
	vpaddq	$T0,$D4,$D4		# d4 += h3*r1
	vpmuludq	$H1,$T2,$T1	# h1*r1
	vpmuludq	$H0,$T2,$T2	# h0*r1
	vpaddq	$T1,$D2,$D2		# d2 += h1*r1
	vpaddq	$T2,$D1,$D1		# d1 += h0*r1

	vmovdqa	-0x50(%r11),$T4		# s2^4
	vpmuludq	$H2,$T3,$T0	# h2*r2
	vpmuludq	$H1,$T3,$T1	# h1*r2
	vpaddq	$T0,$D4,$D4		# d4 += h2*r2
	vpaddq	$T1,$D3,$D3		# d3 += h1*r2
	vmovdqa	-0x40(%r11),$T2		# r3^4
	vpmuludq	$H0,$T3,$T3	# h0*r2
	vpmuludq	$H4,$T4,$T0	# h4*s2
	vpaddq	$T3,$D2,$D2		# d2 += h0*r2
	vpaddq	$T0,$D1,$D1		# d1 += h4*s2
	vmovdqa	-0x30(%r11),$T3		# s3^4
	vpmuludq	$H3,$T4,$T4	# h3*s2
	vpmuludq	$H1,$T2,$T1	# h1*r3
	vpaddq	$T4,$D0,$D0		# d0 += h3*s2

	vmovdqa	-0x10(%r11),$T4		# s4^4
	vpaddq	$T1,$D4,$D4		# d4 += h1*r3
	vpmuludq	$H0,$T2,$T2	# h0*r3
	vpmuludq	$H4,$T3,$T0	# h4*s3
	vpaddq	$T2,$D3,$D3		# d3 += h0*r3
	vpaddq	$T0,$D2,$D2		# d2 += h4*s3
	vmovdqu	16*2($inp),$T0		# load input
	vpmuludq	$H3,$T3,$T2	# h3*s3
	vpmuludq	$H2,$T3,$T3	# h2*s3
	vpaddq	$T2,$D1,$D1		# d1 += h3*s3
	vmovdqu	16*3($inp),$T1
	vpaddq	$T3,$D0,$D0		# d0 += h2*s3

	vpmuludq	$H2,$T4,$H2	# h2*s4
	vpmuludq	$H3,$T4,$H3	# h3*s4
	vpsrldq	\$6,$T0,$T2		# splat input
	vpaddq	$H2,$D1,$D1		# d1 += h2*s4
	vpmuludq	$H4,$T4,$H4	# h4*s4
	vpsrldq	\$6,$T1,$T3
	vpaddq	$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq	$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
	vpmuludq	$H1,$T4,$H0
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpaddq	$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq	$H0,$D0,$H0		# h0 = d0 + h1*s4

	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpunpcklqdq	$T3,$T2,$T3	# 2:3

	#vpsrlq	\$40,$T4,$T4		# 4
	vpsrldq	\$`40/8`,$T4,$T4	# 4
	vpsrlq	\$26,$T0,$T1
	vmovdqa	0x00(%rsp),$D4		# preload r0^2
	vpand	$MASK,$T0,$T0		# 0
	vpsrlq	\$4,$T3,$T2
	vpand	$MASK,$T1,$T1		# 1
	vpand	0(%rcx),$T4,$T4		# .Lmask24
	vpsrlq	\$30,$T3,$T3
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe
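	#
	# (each carry c = h >> 26 is added into the next digit; the carry
	# out of h4 folds back into h0 multiplied by 5, implemented below
	# as h0 += c4 followed by h0 += c4<<2, since 2^130 = 5 mod 2^130-5)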
	vpsrlq	\$26,$H3,$D3
	vpand	$MASK,$H3,$H3
	vpaddq	$D3,$H4,$H4		# h3 -> h4

	vpsrlq	\$26,$H0,$D0
	vpand	$MASK,$H0,$H0
	vpaddq	$D0,$D1,$H1		# h0 -> h1

	vpsrlq	\$26,$H4,$D0
	vpand	$MASK,$H4,$H4

	vpsrlq	\$26,$H1,$D1
	vpand	$MASK,$H1,$H1
	vpaddq	$D1,$H2,$H2		# h1 -> h2

	vpaddq	$D0,$H0,$H0
	vpsllq	\$2,$D0,$D0
	vpaddq	$D0,$H0,$H0		# h4 -> h0

	vpsrlq	\$26,$H2,$D2
	vpand	$MASK,$H2,$H2
	vpaddq	$D2,$H3,$H3		# h2 -> h3

	vpsrlq	\$26,$H0,$D0
	vpand	$MASK,$H0,$H0
	vpaddq	$D0,$H1,$H1		# h0 -> h1

	vpsrlq	\$26,$H3,$D3
	vpand	$MASK,$H3,$H3
	vpaddq	$D3,$H4,$H4		# h3 -> h4

	ja	.Loop_avx

.Lskip_loop_avx:
	################################################################
	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	vpshufd	\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
	add	\$32,$len
	jnz	.Long_tail_avx

	vpaddq	$H2,$T2,$T2
	vpaddq	$H0,$T0,$T0
	vpaddq	$H1,$T1,$T1
	vpaddq	$H3,$T3,$T3
	vpaddq	$H4,$T4,$T4

.Long_tail_avx:
	vmovdqa	$H2,0x20(%r11)
	vmovdqa	$H0,0x00(%r11)
	vmovdqa	$H1,0x10(%r11)
	vmovdqa	$H3,0x30(%r11)
	vmovdqa	$H4,0x40(%r11)

	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	vpmuludq	$T2,$D4,$D2	# d2 = h2*r0
	vpmuludq	$T0,$D4,$D0	# d0 = h0*r0
	vpshufd	\$0x10,`16*1-64`($ctx),$H2	# r1^n
	vpmuludq	$T1,$D4,$D1	# d1 = h1*r0
	vpmuludq	$T3,$D4,$D3	# d3 = h3*r0
	vpmuludq	$T4,$D4,$D4	# d4 = h4*r0

	vpmuludq	$T3,$H2,$H0	# h3*r1
	vpaddq	$H0,$D4,$D4		# d4 += h3*r1
	vpshufd	\$0x10,`16*2-64`($ctx),$H3	# s1^n
	vpmuludq	$T2,$H2,$H1	# h2*r1
	vpaddq	$H1,$D3,$D3		# d3 += h2*r1
	vpshufd	\$0x10,`16*3-64`($ctx),$H4	# r2^n
	vpmuludq	$T1,$H2,$H0	# h1*r1
	vpaddq	$H0,$D2,$D2		# d2 += h1*r1
	vpmuludq	$T0,$H2,$H2	# h0*r1
	vpaddq	$H2,$D1,$D1		# d1 += h0*r1
	vpmuludq	$T4,$H3,$H3	# h4*s1
	vpaddq	$H3,$D0,$D0		# d0 += h4*s1

	vpshufd	\$0x10,`16*4-64`($ctx),$H2	# s2^n
	vpmuludq	$T2,$H4,$H1	# h2*r2
	vpaddq	$H1,$D4,$D4		# d4 += h2*r2
	vpmuludq	$T1,$H4,$H0	# h1*r2
	vpaddq	$H0,$D3,$D3		# d3 += h1*r2
	vpshufd	\$0x10,`16*5-64`($ctx),$H3	# r3^n
	vpmuludq	$T0,$H4,$H4	# h0*r2
	vpaddq	$H4,$D2,$D2		# d2 += h0*r2
	vpmuludq	$T4,$H2,$H1	# h4*s2
	vpaddq	$H1,$D1,$D1		# d1 += h4*s2
	vpshufd	\$0x10,`16*6-64`($ctx),$H4	# s3^n
	vpmuludq	$T3,$H2,$H2	# h3*s2
	vpaddq	$H2,$D0,$D0		# d0 += h3*s2

	vpmuludq	$T1,$H3,$H0	# h1*r3
	vpaddq	$H0,$D4,$D4		# d4 += h1*r3
	vpmuludq	$T0,$H3,$H3	# h0*r3
	vpaddq	$H3,$D3,$D3		# d3 += h0*r3
	vpshufd	\$0x10,`16*7-64`($ctx),$H2	# r4^n
	vpmuludq	$T4,$H4,$H1	# h4*s3
	vpaddq	$H1,$D2,$D2		# d2 += h4*s3
	vpshufd	\$0x10,`16*8-64`($ctx),$H3	# s4^n
	vpmuludq	$T3,$H4,$H0	# h3*s3
	vpaddq	$H0,$D1,$D1		# d1 += h3*s3
	vpmuludq	$T2,$H4,$H4	# h2*s3
	vpaddq	$H4,$D0,$D0		# d0 += h2*s3

	vpmuludq	$T0,$H2,$H2	# h0*r4
	vpaddq	$H2,$D4,$D4		# h4 = d4 + h0*r4
	vpmuludq	$T4,$H3,$H1	# h4*s4
	vpaddq	$H1,$D3,$D3		# h3 = d3 + h4*s4
	vpmuludq	$T3,$H3,$H0	# h3*s4
	vpaddq	$H0,$D2,$D2		# h2 = d2 + h3*s4
	vpmuludq	$T2,$H3,$H1	# h2*s4
	vpaddq	$H1,$D1,$D1		# h1 = d1 + h2*s4
	vpmuludq	$T1,$H3,$H3	# h1*s4
	vpaddq	$H3,$D0,$D0		# h0 = d0 + h1*s4

	jz	.Lshort_tail_avx

	vmovdqu	16*0($inp),$H0		# load input
	vmovdqu	16*1($inp),$H1

	vpsrldq	\$6,$H0,$H2		# splat input
	vpsrldq	\$6,$H1,$H3
	vpunpckhqdq	$H1,$H0,$H4	# 4
	vpunpcklqdq	$H1,$H0,$H0	# 0:1
	vpunpcklqdq	$H3,$H2,$H3	# 2:3

	vpsrlq	\$40,$H4,$H4		# 4
	vpsrlq	\$26,$H0,$H1
	vpand	$MASK,$H0,$H0		# 0
	vpsrlq	\$4,$H3,$H2
	vpand	$MASK,$H1,$H1		# 1
	vpsrlq	\$30,$H3,$H3
	vpand	$MASK,$H2,$H2		# 2
	vpand	$MASK,$H3,$H3		# 3
	vpor	32(%rcx),$H4,$H4	# padbit, yes, always

	vpshufd	\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
	vpaddq	0x00(%r11),$H0,$H0
	vpaddq	0x10(%r11),$H1,$H1
	vpaddq	0x20(%r11),$H2,$H2
	vpaddq	0x30(%r11),$H3,$H3
	vpaddq	0x40(%r11),$H4,$H4

	################################################################
	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate

	vpmuludq	$H0,$T4,$T0	# h0*r0
	vpaddq	$T0,$D0,$D0		# d0 += h0*r0
	vpmuludq	$H1,$T4,$T1	# h1*r0
	vpaddq	$T1,$D1,$D1		# d1 += h1*r0
	vpmuludq	$H2,$T4,$T0	# h2*r0
	vpaddq	$T0,$D2,$D2		# d2 += h2*r0
	vpshufd	\$0x32,`16*1-64`($ctx),$T2	# r1^n
	vpmuludq	$H3,$T4,$T1	# h3*r0
	vpaddq	$T1,$D3,$D3		# d3 += h3*r0
	vpmuludq	$H4,$T4,$T4	# h4*r0
	vpaddq	$T4,$D4,$D4		# d4 += h4*r0

	vpmuludq	$H3,$T2,$T0	# h3*r1
	vpaddq	$T0,$D4,$D4		# d4 += h3*r1
	vpshufd	\$0x32,`16*2-64`($ctx),$T3	# s1
	vpmuludq	$H2,$T2,$T1	# h2*r1
	vpaddq	$T1,$D3,$D3		# d3 += h2*r1
	vpshufd	\$0x32,`16*3-64`($ctx),$T4	# r2
	vpmuludq	$H1,$T2,$T0	# h1*r1
	vpaddq	$T0,$D2,$D2		# d2 += h1*r1
	vpmuludq	$H0,$T2,$T2	# h0*r1
	vpaddq	$T2,$D1,$D1		# d1 += h0*r1
	vpmuludq	$H4,$T3,$T3	# h4*s1
	vpaddq	$T3,$D0,$D0		# d0 += h4*s1

	vpshufd	\$0x32,`16*4-64`($ctx),$T2	# s2
	vpmuludq	$H2,$T4,$T1	# h2*r2
	vpaddq	$T1,$D4,$D4		# d4 += h2*r2
	vpmuludq	$H1,$T4,$T0	# h1*r2
	vpaddq	$T0,$D3,$D3		# d3 += h1*r2
	vpshufd	\$0x32,`16*5-64`($ctx),$T3	# r3
	vpmuludq	$H0,$T4,$T4	# h0*r2
	vpaddq	$T4,$D2,$D2		# d2 += h0*r2
	vpmuludq	$H4,$T2,$T1	# h4*s2
	vpaddq	$T1,$D1,$D1		# d1 += h4*s2
	vpshufd	\$0x32,`16*6-64`($ctx),$T4	# s3
	vpmuludq	$H3,$T2,$T2	# h3*s2
	vpaddq	$T2,$D0,$D0		# d0 += h3*s2

	vpmuludq	$H1,$T3,$T0	# h1*r3
	vpaddq	$T0,$D4,$D4		# d4 += h1*r3
	vpmuludq	$H0,$T3,$T3	# h0*r3
	vpaddq	$T3,$D3,$D3		# d3 += h0*r3
	vpshufd	\$0x32,`16*7-64`($ctx),$T2	# r4
	vpmuludq	$H4,$T4,$T1	# h4*s3
	vpaddq	$T1,$D2,$D2		# d2 += h4*s3
	vpshufd	\$0x32,`16*8-64`($ctx),$T3	# s4
	vpmuludq	$H3,$T4,$T0	# h3*s3
	vpaddq	$T0,$D1,$D1		# d1 += h3*s3
	vpmuludq	$H2,$T4,$T4	# h2*s3
	vpaddq	$T4,$D0,$D0		# d0 += h2*s3

	vpmuludq	$H0,$T2,$T2	# h0*r4
	vpaddq	$T2,$D4,$D4		# d4 += h0*r4
	vpmuludq	$H4,$T3,$T1	# h4*s4
	vpaddq	$T1,$D3,$D3		# d3 += h4*s4
	vpmuludq	$H3,$T3,$T0	# h3*s4
	vpaddq	$T0,$D2,$D2		# d2 += h3*s4
	vpmuludq	$H2,$T3,$T1	# h2*s4
	vpaddq	$T1,$D1,$D1		# d1 += h2*s4
	vpmuludq	$H1,$T3,$T3	# h1*s4
	vpaddq	$T3,$D0,$D0		# d0 += h1*s4

.Lshort_tail_avx:
	################################################################
	# horizontal addition
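	# (each d-register holds two 64-bit partial sums, one per 128-bit
	# lane; shifting the high half down and adding folds both lanes
	# into a single per-digit sum)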
	vpsrldq	\$8,$D4,$T4
	vpsrldq	\$8,$D3,$T3
	vpsrldq	\$8,$D1,$T1
	vpsrldq	\$8,$D0,$T0
	vpsrldq	\$8,$D2,$T2
	vpaddq	$T3,$D3,$D3
	vpaddq	$T4,$D4,$D4
	vpaddq	$T0,$D0,$D0
	vpaddq	$T1,$D1,$D1
	vpaddq	$T2,$D2,$D2

	################################################################
	# lazy reduction
	vpsrlq	\$26,$D3,$H3
	vpand	$MASK,$D3,$D3
	vpaddq	$H3,$D4,$D4		# h3 -> h4
	vpsrlq	\$26,$D0,$H0
	vpand	$MASK,$D0,$D0
	vpaddq	$H0,$D1,$D1		# h0 -> h1
	vpsrlq	\$26,$D4,$H4
	vpand	$MASK,$D4,$D4
	vpsrlq	\$26,$D1,$H1
	vpand	$MASK,$D1,$D1
	vpaddq	$H1,$D2,$D2		# h1 -> h2
	vpaddq	$H4,$D0,$D0
	vpsllq	\$2,$H4,$H4
	vpaddq	$H4,$D0,$D0		# h4 -> h0
	vpsrlq	\$26,$D2,$H2
	vpand	$MASK,$D2,$D2
	vpaddq	$H2,$D3,$D3		# h2 -> h3
	vpsrlq	\$26,$D0,$H0
	vpand	$MASK,$D0,$D0
	vpaddq	$H0,$D1,$D1		# h0 -> h1
	vpsrlq	\$26,$D3,$H3
	vpand	$MASK,$D3,$D3
	vpaddq	$H3,$D4,$D4		# h3 -> h4

	vmovd	$D0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd	$D1,`4*1-48-64`($ctx)
	vmovd	$D2,`4*2-48-64`($ctx)
	vmovd	$D3,`4*3-48-64`($ctx)
	vmovd	$D4,`4*4-48-64`($ctx)
___
$code.=<<___ if ($win64);
	vmovdqa	0x50(%r11),%xmm6
	vmovdqa	0x60(%r11),%xmm7
	vmovdqa	0x70(%r11),%xmm8
	vmovdqa	0x80(%r11),%xmm9
	vmovdqa	0x90(%r11),%xmm10
	vmovdqa	0xa0(%r11),%xmm11
	vmovdqa	0xb0(%r11),%xmm12
	vmovdqa	0xc0(%r11),%xmm13
	vmovdqa	0xd0(%r11),%xmm14
	vmovdqa	0xe0(%r11),%xmm15
	lea	0xf8(%r11),%rsp
.Ldo_avx_epilogue:
___
$code.=<<___ if (!$win64);
	lea	0x58(%r11),%rsp
.cfi_def_cfa	%rsp,8
___
$code.=<<___;
	vzeroupper
	ret
.cfi_endproc
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

.type	poly1305_emit_avx,\@function,3
.align	32
poly1305_emit_avx:
.cfi_startproc
	cmpl	\$0,20($ctx)		# is_base2_26?
	je	.Lemit

	mov	0($ctx),%eax		# load hash value base 2^26
	mov	4($ctx),%ecx
	mov	8($ctx),%r8d
	mov	12($ctx),%r11d
	mov	16($ctx),%r10d

	shl	\$26,%rcx		# base 2^26 -> base 2^64
	mov	%r8,%r9
	shl	\$52,%r8
	add	%rcx,%rax
	shr	\$12,%r9
	add	%rax,%r8		# h0
	adc	\$0,%r9

	shl	\$14,%r11
	mov	%r10,%rax
	shr	\$24,%r10
	add	%r11,%r9
	shl	\$40,%rax
	add	%rax,%r9		# h1
	adc	\$0,%r10		# h2

	mov	%r10,%rax		# could be partially reduced, so reduce
	mov	%r10,%rcx
	and	\$3,%r10
	shr	\$2,%rax
	and	\$-4,%rcx
	add	%rcx,%rax
	add	%rax,%r8
	adc	\$0,%r9
	adc	\$0,%r10

	mov	%r8,%rax
	add	\$5,%r8			# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10		# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax		# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)		# write result
	mov	%rcx,8($mac)

	ret
.cfi_endproc
.size	poly1305_emit_avx,.-poly1305_emit_avx
___

if ($avx>1) {
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
    map("%ymm$_",(0..15));
my $S4=$MASK;

$code.=<<___;
.type	poly1305_blocks_avx2,\@function,4
.align	32
poly1305_blocks_avx2:
.cfi_startproc
	mov	20($ctx),%r8d		# is_base2_26
	cmp	\$128,$len
	jae	.Lblocks_avx2
	test	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx2:
	and	\$-16,$len
	jz	.Lno_data_avx2

	vzeroupper

	test	%r8d,%r8d
	jz	.Lbase2_64_avx2

	test	\$63,$len
	jz	.Leven_avx2

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_avx2_body:

	mov	$len,%r15		# reassign $len

	mov	0($ctx),$d1		# load hash value
	mov	8($ctx),$d2
	mov	16($ctx),$h2#d

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	################################# base 2^26 -> base 2^64
	mov	$d1#d,$h0#d
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	mov	$d2#d,$h1#d
	and	\$`-1*(1<<31)`,$d2

	shr	\$6,$d1
	shl	\$52,$r1
	add	$d1,$h0
	shr	\$12,$h1
	shr	\$18,$d2
	add	$r1,$h0
	adc	$d2,$h1

	mov	$h2,$d1
	shl	\$40,$d1
	shr	\$24,$h2
	add	$d1,$h1
	adc	\$0,$h2			# can be partially reduced...

	mov	\$-4,$d2		# ... so reduce
	mov	$h2,$d1
	and	$h2,$d2
	shr	\$2,$d1
	and	\$3,$h2
	add	$d2,$d1			# =*5
	add	$d1,$h0
	adc	\$0,$h1
	adc	\$0,$h2

	mov	$s1,$r1
	mov	$s1,%rax
	shr	\$2,$s1
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

.Lbase2_26_pre_avx2:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
	sub	\$16,%r15

	call	__poly1305_block
	mov	$r1,%rax

	test	\$63,%r15
	jnz	.Lbase2_26_pre_avx2

	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format

	################################# base 2^64 -> base 2^26
	mov	$h0,%rax
	mov	$h0,%rdx
	shr	\$52,$h0
	mov	$h1,$r0
	mov	$h1,$r1
	shr	\$26,%rdx
	and	\$0x3ffffff,%rax	# h[0]
	shl	\$12,$r0
	and	\$0x3ffffff,%rdx	# h[1]
	shr	\$14,$h1
	or	$r0,$h0
	shl	\$24,$h2
	and	\$0x3ffffff,$h0		# h[2]
	shr	\$40,$r1
	and	\$0x3ffffff,$h1		# h[3]
	or	$r1,$h2			# h[4]

	test	%r15,%r15
	jz	.Lstore_base2_26_avx2

	vmovd	%rax#d,%x#$H0
	vmovd	%rdx#d,%x#$H1
	vmovd	$h0#d,%x#$H2
	vmovd	$h1#d,%x#$H3
	vmovd	$h2#d,%x#$H4
	jmp	.Lproceed_avx2

.align	32
.Lstore_base2_64_avx2:
	mov	$h0,0($ctx)
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
	jmp	.Ldone_avx2

.align	16
.Lstore_base2_26_avx2:
	mov	%rax#d,0($ctx)		# store hash value base 2^26
	mov	%rdx#d,4($ctx)
	mov	$h0#d,8($ctx)
	mov	$h1#d,12($ctx)
	mov	$h2#d,16($ctx)
.align	16
.Ldone_avx2:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbp
.cfi_restore	%rbp
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
  1402. lea 48(%rsp),%rsp
  1403. .cfi_adjust_cfa_offset -48
  1404. .Lno_data_avx2:
  1405. .Lblocks_avx2_epilogue:
  1406. ret
  1407. .cfi_endproc
  1408. .align 32
  1409. .Lbase2_64_avx2:
  1410. .cfi_startproc
  1411. push %rbx
  1412. .cfi_push %rbx
  1413. push %rbp
  1414. .cfi_push %rbp
  1415. push %r12
  1416. .cfi_push %r12
  1417. push %r13
  1418. .cfi_push %r13
  1419. push %r14
  1420. .cfi_push %r14
  1421. push %r15
  1422. .cfi_push %r15
  1423. .Lbase2_64_avx2_body:
  1424. mov $len,%r15 # reassign $len
  1425. mov 24($ctx),$r0 # load r
  1426. mov 32($ctx),$s1
  1427. mov 0($ctx),$h0 # load hash value
  1428. mov 8($ctx),$h1
  1429. mov 16($ctx),$h2#d
  1430. mov $s1,$r1
  1431. mov $s1,%rax
  1432. shr \$2,$s1
  1433. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  1434. test \$63,$len
  1435. jz .Linit_avx2
  1436. .Lbase2_64_pre_avx2:
  1437. add 0($inp),$h0 # accumulate input
  1438. adc 8($inp),$h1
  1439. lea 16($inp),$inp
  1440. adc $padbit,$h2
  1441. sub \$16,%r15
  1442. call __poly1305_block
  1443. mov $r1,%rax
  1444. test \$63,%r15
  1445. jnz .Lbase2_64_pre_avx2
  1446. .Linit_avx2:
  1447. ################################# base 2^64 -> base 2^26
  1448. mov $h0,%rax
  1449. mov $h0,%rdx
  1450. shr \$52,$h0
  1451. mov $h1,$d1
  1452. mov $h1,$d2
  1453. shr \$26,%rdx
  1454. and \$0x3ffffff,%rax # h[0]
  1455. shl \$12,$d1
  1456. and \$0x3ffffff,%rdx # h[1]
  1457. shr \$14,$h1
  1458. or $d1,$h0
  1459. shl \$24,$h2
  1460. and \$0x3ffffff,$h0 # h[2]
  1461. shr \$40,$d2
  1462. and \$0x3ffffff,$h1 # h[3]
  1463. or $d2,$h2 # h[4]
  1464. vmovd %rax#d,%x#$H0
  1465. vmovd %rdx#d,%x#$H1
  1466. vmovd $h0#d,%x#$H2
  1467. vmovd $h1#d,%x#$H3
  1468. vmovd $h2#d,%x#$H4
  1469. movl \$1,20($ctx) # set is_base2_26
  1470. call __poly1305_init_avx
  1471. .Lproceed_avx2:
  1472. mov %r15,$len # restore $len
  1473. mov OPENSSL_ia32cap_P+8(%rip),%r10d
  1474. mov \$`(1<<31|1<<30|1<<16)`,%r11d
  1475. mov 0(%rsp),%r15
  1476. .cfi_restore %r15
  1477. mov 8(%rsp),%r14
  1478. .cfi_restore %r14
  1479. mov 16(%rsp),%r13
  1480. .cfi_restore %r13
  1481. mov 24(%rsp),%r12
  1482. .cfi_restore %r12
  1483. mov 32(%rsp),%rbp
  1484. .cfi_restore %rbp
  1485. mov 40(%rsp),%rbx
  1486. .cfi_restore %rbx
  1487. lea 48(%rsp),%rax
  1488. lea 48(%rsp),%rsp
  1489. .cfi_adjust_cfa_offset -48
  1490. .Lbase2_64_avx2_epilogue:
  1491. jmp .Ldo_avx2
  1492. .cfi_endproc
  1493. .align 32
  1494. .Leven_avx2:
  1495. .cfi_startproc
  1496. mov OPENSSL_ia32cap_P+8(%rip),%r10d
  1497. vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
  1498. vmovd 4*1($ctx),%x#$H1
  1499. vmovd 4*2($ctx),%x#$H2
  1500. vmovd 4*3($ctx),%x#$H3
  1501. vmovd 4*4($ctx),%x#$H4
  1502. .Ldo_avx2:
  1503. ___
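# The bridges above convert the hash between radix 2^64 (h0, h1 plus a few
# top bits in h2) and radix 2^26 (five limbs). Scalar sketch of the
# 2^64 -> 2^26 direction, matching the shift/mask sequences above;
# illustration only, not used by the code generator:
sub __sketch_base2_64_to_2_26 {
    my ($h0, $h1, $h2) = @_;            # h = h0 + h1*2^64 + h2*2^128
    my $m = 0x3ffffff;
    return (  $h0        & $m,                           # bits   0..25
             ($h0 >> 26) & $m,                           # bits  26..51
            ((($h1 & 0x3fff) << 12) | ($h0 >> 52)) & $m, # bits  52..77
             ($h1 >> 14) & $m,                           # bits  78..103
             (($h2 << 24) | ($h1 >> 40)));               # bits 104..129
}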
  1504. $code.=<<___ if ($avx>2);
  1505. cmp \$512,$len
  1506. jb .Lskip_avx512
  1507. and %r11d,%r10d
  1508. test \$`1<<16`,%r10d # check for AVX512F
  1509. jnz .Lblocks_avx512
  1510. .Lskip_avx512:
  1511. ___
  1512. $code.=<<___ if (!$win64);
  1513. lea -8(%rsp),%r11
  1514. .cfi_def_cfa %r11,16
  1515. sub \$0x128,%rsp
  1516. ___
  1517. $code.=<<___ if ($win64);
  1518. lea -0xf8(%rsp),%r11
  1519. sub \$0x1c8,%rsp
  1520. vmovdqa %xmm6,0x50(%r11)
  1521. vmovdqa %xmm7,0x60(%r11)
  1522. vmovdqa %xmm8,0x70(%r11)
  1523. vmovdqa %xmm9,0x80(%r11)
  1524. vmovdqa %xmm10,0x90(%r11)
  1525. vmovdqa %xmm11,0xa0(%r11)
  1526. vmovdqa %xmm12,0xb0(%r11)
  1527. vmovdqa %xmm13,0xc0(%r11)
  1528. vmovdqa %xmm14,0xd0(%r11)
  1529. vmovdqa %xmm15,0xe0(%r11)
  1530. .Ldo_avx2_body:
  1531. ___
  1532. $code.=<<___;
  1533. lea .Lconst(%rip),%rcx
  1534. lea 48+64($ctx),$ctx # size optimization
  1535. vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
  1536. # expand and copy pre-calculated table to stack
  1537. vmovdqu `16*0-64`($ctx),%x#$T2
  1538. and \$-512,%rsp
  1539. vmovdqu `16*1-64`($ctx),%x#$T3
  1540. vmovdqu `16*2-64`($ctx),%x#$T4
  1541. vmovdqu `16*3-64`($ctx),%x#$D0
  1542. vmovdqu `16*4-64`($ctx),%x#$D1
  1543. vmovdqu `16*5-64`($ctx),%x#$D2
  1544. lea 0x90(%rsp),%rax # size optimization
  1545. vmovdqu `16*6-64`($ctx),%x#$D3
  1546. vpermd $T2,$T0,$T2 # 00003412 -> 14243444
  1547. vmovdqu `16*7-64`($ctx),%x#$D4
  1548. vpermd $T3,$T0,$T3
  1549. vmovdqu `16*8-64`($ctx),%x#$MASK
  1550. vpermd $T4,$T0,$T4
  1551. vmovdqa $T2,0x00(%rsp)
  1552. vpermd $D0,$T0,$D0
  1553. vmovdqa $T3,0x20-0x90(%rax)
  1554. vpermd $D1,$T0,$D1
  1555. vmovdqa $T4,0x40-0x90(%rax)
  1556. vpermd $D2,$T0,$D2
  1557. vmovdqa $D0,0x60-0x90(%rax)
  1558. vpermd $D3,$T0,$D3
  1559. vmovdqa $D1,0x80-0x90(%rax)
  1560. vpermd $D4,$T0,$D4
  1561. vmovdqa $D2,0xa0-0x90(%rax)
  1562. vpermd $MASK,$T0,$MASK
  1563. vmovdqa $D3,0xc0-0x90(%rax)
  1564. vmovdqa $D4,0xe0-0x90(%rax)
  1565. vmovdqa $MASK,0x100-0x90(%rax)
  1566. vmovdqa 64(%rcx),$MASK # .Lmask26
  1567. ################################################################
  1568. # load input
  1569. vmovdqu 16*0($inp),%x#$T0
  1570. vmovdqu 16*1($inp),%x#$T1
  1571. vinserti128 \$1,16*2($inp),$T0,$T0
  1572. vinserti128 \$1,16*3($inp),$T1,$T1
  1573. lea 16*4($inp),$inp
  1574. vpsrldq \$6,$T0,$T2 # splat input
  1575. vpsrldq \$6,$T1,$T3
  1576. vpunpckhqdq $T1,$T0,$T4 # 4
  1577. vpunpcklqdq $T3,$T2,$T2 # 2:3
  1578. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1579. vpsrlq \$30,$T2,$T3
  1580. vpsrlq \$4,$T2,$T2
  1581. vpsrlq \$26,$T0,$T1
  1582. vpsrlq \$40,$T4,$T4 # 4
  1583. vpand $MASK,$T2,$T2 # 2
  1584. vpand $MASK,$T0,$T0 # 0
  1585. vpand $MASK,$T1,$T1 # 1
  1586. vpand $MASK,$T3,$T3 # 3
  1587. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1588. vpaddq $H2,$T2,$H2 # accumulate input
  1589. sub \$64,$len
  1590. jz .Ltail_avx2
  1591. jmp .Loop_avx2
  1592. .align 32
  1593. .Loop_avx2:
  1594. ################################################################
  1595. # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
  1596. # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
  1597. # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
  1598. # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
  1599. # \________/\__________/
  1600. ################################################################
  1601. #vpaddq $H2,$T2,$H2 # accumulate input
  1602. vpaddq $H0,$T0,$H0
  1603. vmovdqa `32*0`(%rsp),$T0 # r0^4
  1604. vpaddq $H1,$T1,$H1
  1605. vmovdqa `32*1`(%rsp),$T1 # r1^4
  1606. vpaddq $H3,$T3,$H3
  1607. vmovdqa `32*3`(%rsp),$T2 # r2^4
  1608. vpaddq $H4,$T4,$H4
  1609. vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
  1610. vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
  1611. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1612. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1613. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1614. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1615. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1616. #
1617. # however, as h2 is "chronologically" the first one available, the
1618. # corresponding operations are pulled up, so the order becomes:
  1619. #
  1620. # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
  1621. # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
  1622. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1623. # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
  1624. # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
  1625. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1626. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1627. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1628. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1629. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1630. vpmuludq $H0,$T1,$T4 # h0*r1
  1631. vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
  1632. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1633. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1634. vpmuludq $H3,$T1,$T4 # h3*r1
  1635. vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
  1636. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1637. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1638. vmovdqa `32*4-0x90`(%rax),$T1 # s2
  1639. vpmuludq $H0,$T0,$T4 # h0*r0
  1640. vpmuludq $H1,$T0,$H2 # h1*r0
  1641. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1642. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1643. vpmuludq $H3,$T0,$T4 # h3*r0
  1644. vpmuludq $H4,$T0,$H2 # h4*r0
  1645. vmovdqu 16*0($inp),%x#$T0 # load input
  1646. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1647. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1648. vinserti128 \$1,16*2($inp),$T0,$T0
  1649. vpmuludq $H3,$T1,$T4 # h3*s2
  1650. vpmuludq $H4,$T1,$H2 # h4*s2
  1651. vmovdqu 16*1($inp),%x#$T1
  1652. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1653. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1654. vmovdqa `32*5-0x90`(%rax),$H2 # r3
  1655. vpmuludq $H1,$T2,$T4 # h1*r2
  1656. vpmuludq $H0,$T2,$T2 # h0*r2
  1657. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1658. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1659. vinserti128 \$1,16*3($inp),$T1,$T1
  1660. lea 16*4($inp),$inp
  1661. vpmuludq $H1,$H2,$T4 # h1*r3
  1662. vpmuludq $H0,$H2,$H2 # h0*r3
  1663. vpsrldq \$6,$T0,$T2 # splat input
  1664. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1665. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1666. vpmuludq $H3,$T3,$T4 # h3*s3
  1667. vpmuludq $H4,$T3,$H2 # h4*s3
  1668. vpsrldq \$6,$T1,$T3
  1669. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1670. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1671. vpunpckhqdq $T1,$T0,$T4 # 4
  1672. vpmuludq $H3,$S4,$H3 # h3*s4
  1673. vpmuludq $H4,$S4,$H4 # h4*s4
  1674. vpunpcklqdq $T1,$T0,$T0 # 0:1
1675. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1676. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
  1677. vpunpcklqdq $T3,$T2,$T3 # 2:3
  1678. vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
  1679. vpmuludq $H1,$S4,$H0 # h1*s4
  1680. vmovdqa 64(%rcx),$MASK # .Lmask26
  1681. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1682. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1683. ################################################################
  1684. # lazy reduction (interleaved with tail of input splat)
  1685. vpsrlq \$26,$H3,$D3
  1686. vpand $MASK,$H3,$H3
  1687. vpaddq $D3,$H4,$H4 # h3 -> h4
  1688. vpsrlq \$26,$H0,$D0
  1689. vpand $MASK,$H0,$H0
  1690. vpaddq $D0,$D1,$H1 # h0 -> h1
  1691. vpsrlq \$26,$H4,$D4
  1692. vpand $MASK,$H4,$H4
  1693. vpsrlq \$4,$T3,$T2
  1694. vpsrlq \$26,$H1,$D1
  1695. vpand $MASK,$H1,$H1
  1696. vpaddq $D1,$H2,$H2 # h1 -> h2
  1697. vpaddq $D4,$H0,$H0
  1698. vpsllq \$2,$D4,$D4
  1699. vpaddq $D4,$H0,$H0 # h4 -> h0
  1700. vpand $MASK,$T2,$T2 # 2
  1701. vpsrlq \$26,$T0,$T1
  1702. vpsrlq \$26,$H2,$D2
  1703. vpand $MASK,$H2,$H2
  1704. vpaddq $D2,$H3,$H3 # h2 -> h3
  1705. vpaddq $T2,$H2,$H2 # modulo-scheduled
  1706. vpsrlq \$30,$T3,$T3
  1707. vpsrlq \$26,$H0,$D0
  1708. vpand $MASK,$H0,$H0
  1709. vpaddq $D0,$H1,$H1 # h0 -> h1
  1710. vpsrlq \$40,$T4,$T4 # 4
  1711. vpsrlq \$26,$H3,$D3
  1712. vpand $MASK,$H3,$H3
  1713. vpaddq $D3,$H4,$H4 # h3 -> h4
  1714. vpand $MASK,$T0,$T0 # 0
  1715. vpand $MASK,$T1,$T1 # 1
  1716. vpand $MASK,$T3,$T3 # 3
  1717. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1718. sub \$64,$len
  1719. jnz .Loop_avx2
1720. .byte 0x66,0x90 # 2-byte nop
  1721. .Ltail_avx2:
  1722. ################################################################
1723. # while the above multiplications were by r^4 in all lanes, in the last
1724. # iteration we multiply the least significant lane by r^4 and the most
1725. # significant one by r, so this is a copy of the above except that
1726. # references to the precomputed table are displaced by 4...
  1727. #vpaddq $H2,$T2,$H2 # accumulate input
  1728. vpaddq $H0,$T0,$H0
  1729. vmovdqu `32*0+4`(%rsp),$T0 # r0^4
  1730. vpaddq $H1,$T1,$H1
  1731. vmovdqu `32*1+4`(%rsp),$T1 # r1^4
  1732. vpaddq $H3,$T3,$H3
  1733. vmovdqu `32*3+4`(%rsp),$T2 # r2^4
  1734. vpaddq $H4,$T4,$H4
  1735. vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
  1736. vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
  1737. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1738. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1739. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1740. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1741. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1742. vpmuludq $H0,$T1,$T4 # h0*r1
  1743. vpmuludq $H1,$T1,$H2 # h1*r1
  1744. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1745. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1746. vpmuludq $H3,$T1,$T4 # h3*r1
  1747. vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
  1748. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1749. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1750. vpmuludq $H0,$T0,$T4 # h0*r0
  1751. vpmuludq $H1,$T0,$H2 # h1*r0
  1752. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1753. vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
  1754. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1755. vpmuludq $H3,$T0,$T4 # h3*r0
  1756. vpmuludq $H4,$T0,$H2 # h4*r0
  1757. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1758. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1759. vpmuludq $H3,$T1,$T4 # h3*s2
  1760. vpmuludq $H4,$T1,$H2 # h4*s2
  1761. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1762. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1763. vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
  1764. vpmuludq $H1,$T2,$T4 # h1*r2
  1765. vpmuludq $H0,$T2,$T2 # h0*r2
  1766. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1767. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1768. vpmuludq $H1,$H2,$T4 # h1*r3
  1769. vpmuludq $H0,$H2,$H2 # h0*r3
  1770. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1771. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1772. vpmuludq $H3,$T3,$T4 # h3*s3
  1773. vpmuludq $H4,$T3,$H2 # h4*s3
  1774. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1775. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1776. vpmuludq $H3,$S4,$H3 # h3*s4
  1777. vpmuludq $H4,$S4,$H4 # h4*s4
1778. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1779. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
  1780. vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
  1781. vpmuludq $H1,$S4,$H0 # h1*s4
  1782. vmovdqa 64(%rcx),$MASK # .Lmask26
  1783. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1784. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1785. ################################################################
  1786. # horizontal addition
  1787. vpsrldq \$8,$D1,$T1
  1788. vpsrldq \$8,$H2,$T2
  1789. vpsrldq \$8,$H3,$T3
  1790. vpsrldq \$8,$H4,$T4
  1791. vpsrldq \$8,$H0,$T0
  1792. vpaddq $T1,$D1,$D1
  1793. vpaddq $T2,$H2,$H2
  1794. vpaddq $T3,$H3,$H3
  1795. vpaddq $T4,$H4,$H4
  1796. vpaddq $T0,$H0,$H0
  1797. vpermq \$0x2,$H3,$T3
  1798. vpermq \$0x2,$H4,$T4
  1799. vpermq \$0x2,$H0,$T0
  1800. vpermq \$0x2,$D1,$T1
  1801. vpermq \$0x2,$H2,$T2
  1802. vpaddq $T3,$H3,$H3
  1803. vpaddq $T4,$H4,$H4
  1804. vpaddq $T0,$H0,$H0
  1805. vpaddq $T1,$D1,$D1
  1806. vpaddq $T2,$H2,$H2
  1807. ################################################################
  1808. # lazy reduction
  1809. vpsrlq \$26,$H3,$D3
  1810. vpand $MASK,$H3,$H3
  1811. vpaddq $D3,$H4,$H4 # h3 -> h4
  1812. vpsrlq \$26,$H0,$D0
  1813. vpand $MASK,$H0,$H0
  1814. vpaddq $D0,$D1,$H1 # h0 -> h1
  1815. vpsrlq \$26,$H4,$D4
  1816. vpand $MASK,$H4,$H4
  1817. vpsrlq \$26,$H1,$D1
  1818. vpand $MASK,$H1,$H1
  1819. vpaddq $D1,$H2,$H2 # h1 -> h2
  1820. vpaddq $D4,$H0,$H0
  1821. vpsllq \$2,$D4,$D4
  1822. vpaddq $D4,$H0,$H0 # h4 -> h0
  1823. vpsrlq \$26,$H2,$D2
  1824. vpand $MASK,$H2,$H2
  1825. vpaddq $D2,$H3,$H3 # h2 -> h3
  1826. vpsrlq \$26,$H0,$D0
  1827. vpand $MASK,$H0,$H0
  1828. vpaddq $D0,$H1,$H1 # h0 -> h1
  1829. vpsrlq \$26,$H3,$D3
  1830. vpand $MASK,$H3,$H3
  1831. vpaddq $D3,$H4,$H4 # h3 -> h4
  1832. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  1833. vmovd %x#$H1,`4*1-48-64`($ctx)
  1834. vmovd %x#$H2,`4*2-48-64`($ctx)
  1835. vmovd %x#$H3,`4*3-48-64`($ctx)
  1836. vmovd %x#$H4,`4*4-48-64`($ctx)
  1837. ___
  1838. $code.=<<___ if ($win64);
  1839. vmovdqa 0x50(%r11),%xmm6
  1840. vmovdqa 0x60(%r11),%xmm7
  1841. vmovdqa 0x70(%r11),%xmm8
  1842. vmovdqa 0x80(%r11),%xmm9
  1843. vmovdqa 0x90(%r11),%xmm10
  1844. vmovdqa 0xa0(%r11),%xmm11
  1845. vmovdqa 0xb0(%r11),%xmm12
  1846. vmovdqa 0xc0(%r11),%xmm13
  1847. vmovdqa 0xd0(%r11),%xmm14
  1848. vmovdqa 0xe0(%r11),%xmm15
  1849. lea 0xf8(%r11),%rsp
  1850. .Ldo_avx2_epilogue:
  1851. ___
  1852. $code.=<<___ if (!$win64);
  1853. lea 8(%r11),%rsp
  1854. .cfi_def_cfa %rsp,8
  1855. ___
  1856. $code.=<<___;
  1857. vzeroupper
  1858. ret
  1859. .cfi_endproc
  1860. .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
  1861. ___
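# The d0..d4 formulas in the comments above are plain schoolbook
# multiplication in radix 2^26: any partial product whose weight reaches
# 2^130 is folded down through s[i] = 5*r[i]. Scalar model of one h = h*r
# step; with 26-bit limbs every product fits in 52 bits and the five-term
# sums stay below 2^63, so native Perl integers are exact. Illustration
# only -- not used by the code generator:
sub __sketch_mul_mod {
    my ($h, $r) = @_;                   # array refs, five 26-bit limbs each
    my @s = map { 5 * $r->[$_] } 1 .. 4;   # s[i-1] = 5*r[i]
    my @d;
    for my $i (0 .. 4) {
        $d[$i] = 0;
        for my $j (0 .. 4) {
            $d[$i] += $h->[$j] * ($j <= $i ? $r->[$i-$j]    # in range
                                           : $s[4+$i-$j]);  # wrapped, *5
        }
    }
    return @d;                          # then lazy-reduce, as above
}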
  1862. #######################################################################
  1863. if ($avx>2) {
1864. # On entry the input length is divisible by 64. But since the inner loop
1865. # processes 128 bytes per iteration, cases when the length is not divisible
1866. # by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
1867. # reason the stack layout is kept identical to poly1305_blocks_avx2. If not
1868. # for this tail, we wouldn't even have to allocate a stack frame...
  1869. my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
  1870. my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
  1871. my $PADBIT="%zmm30";
  1872. map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
  1873. map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
  1874. map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
  1875. map(s/%y/%z/,($MASK));
  1876. $code.=<<___;
  1877. .type poly1305_blocks_avx512,\@function,4
  1878. .align 32
  1879. poly1305_blocks_avx512:
  1880. .cfi_startproc
  1881. .Lblocks_avx512:
  1882. mov \$15,%eax
  1883. kmovw %eax,%k2
  1884. ___
  1885. $code.=<<___ if (!$win64);
  1886. lea -8(%rsp),%r11
  1887. .cfi_def_cfa %r11,16
  1888. sub \$0x128,%rsp
  1889. ___
  1890. $code.=<<___ if ($win64);
  1891. lea -0xf8(%rsp),%r11
  1892. sub \$0x1c8,%rsp
  1893. vmovdqa %xmm6,0x50(%r11)
  1894. vmovdqa %xmm7,0x60(%r11)
  1895. vmovdqa %xmm8,0x70(%r11)
  1896. vmovdqa %xmm9,0x80(%r11)
  1897. vmovdqa %xmm10,0x90(%r11)
  1898. vmovdqa %xmm11,0xa0(%r11)
  1899. vmovdqa %xmm12,0xb0(%r11)
  1900. vmovdqa %xmm13,0xc0(%r11)
  1901. vmovdqa %xmm14,0xd0(%r11)
  1902. vmovdqa %xmm15,0xe0(%r11)
  1903. .Ldo_avx512_body:
  1904. ___
  1905. $code.=<<___;
  1906. lea .Lconst(%rip),%rcx
  1907. lea 48+64($ctx),$ctx # size optimization
  1908. vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
  1909. # expand pre-calculated table
  1910. vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
  1911. and \$-512,%rsp
  1912. vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
  1913. mov \$0x20,%rax
  1914. vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
  1915. vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
  1916. vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
  1917. vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
  1918. vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
  1919. vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
  1920. vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
  1921. vpermd $D0,$T2,$R0 # 00003412 -> 14243444
  1922. vpbroadcastq 64(%rcx),$MASK # .Lmask26
  1923. vpermd $D1,$T2,$R1
  1924. vpermd $T0,$T2,$S1
  1925. vpermd $D2,$T2,$R2
  1926. vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
  1927. vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
  1928. vpermd $T1,$T2,$S2
  1929. vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
  1930. vpsrlq \$32,$R1,$T1
  1931. vpermd $D3,$T2,$R3
  1932. vmovdqa64 $S1,0x40(%rsp){%k2}
  1933. vpermd $T3,$T2,$S3
  1934. vpermd $D4,$T2,$R4
  1935. vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
  1936. vpermd $T4,$T2,$S4
  1937. vmovdqa64 $S2,0x80(%rsp){%k2}
  1938. vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
  1939. vmovdqa64 $S3,0xc0(%rsp){%k2}
  1940. vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
  1941. vmovdqa64 $S4,0x100(%rsp){%k2}
  1942. ################################################################
  1943. # calculate 5th through 8th powers of the key
  1944. #
  1945. # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
  1946. # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
  1947. # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
  1948. # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
  1949. # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
  1950. vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
  1951. vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
  1952. vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
  1953. vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
  1954. vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
  1955. vpsrlq \$32,$R2,$T2
  1956. vpmuludq $T1,$S4,$M0
  1957. vpmuludq $T1,$R0,$M1
  1958. vpmuludq $T1,$R1,$M2
  1959. vpmuludq $T1,$R2,$M3
  1960. vpmuludq $T1,$R3,$M4
  1961. vpsrlq \$32,$R3,$T3
  1962. vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
  1963. vpaddq $M1,$D1,$D1 # d1 += r1'*r0
  1964. vpaddq $M2,$D2,$D2 # d2 += r1'*r1
  1965. vpaddq $M3,$D3,$D3 # d3 += r1'*r2
  1966. vpaddq $M4,$D4,$D4 # d4 += r1'*r3
  1967. vpmuludq $T2,$S3,$M0
  1968. vpmuludq $T2,$S4,$M1
  1969. vpmuludq $T2,$R1,$M3
  1970. vpmuludq $T2,$R2,$M4
  1971. vpmuludq $T2,$R0,$M2
  1972. vpsrlq \$32,$R4,$T4
  1973. vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
  1974. vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
  1975. vpaddq $M3,$D3,$D3 # d3 += r2'*r1
  1976. vpaddq $M4,$D4,$D4 # d4 += r2'*r2
  1977. vpaddq $M2,$D2,$D2 # d2 += r2'*r0
  1978. vpmuludq $T3,$S2,$M0
  1979. vpmuludq $T3,$R0,$M3
  1980. vpmuludq $T3,$R1,$M4
  1981. vpmuludq $T3,$S3,$M1
  1982. vpmuludq $T3,$S4,$M2
  1983. vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
  1984. vpaddq $M3,$D3,$D3 # d3 += r3'*r0
  1985. vpaddq $M4,$D4,$D4 # d4 += r3'*r1
  1986. vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
  1987. vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
  1988. vpmuludq $T4,$S4,$M3
  1989. vpmuludq $T4,$R0,$M4
  1990. vpmuludq $T4,$S1,$M0
  1991. vpmuludq $T4,$S2,$M1
  1992. vpmuludq $T4,$S3,$M2
1993. vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
1994. vpaddq $M4,$D4,$D4 # d4 += r4'*r0
1995. vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
1996. vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
1997. vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
  1998. ################################################################
  1999. # load input
  2000. vmovdqu64 16*0($inp),%z#$T3
  2001. vmovdqu64 16*4($inp),%z#$T4
  2002. lea 16*8($inp),$inp
  2003. ################################################################
  2004. # lazy reduction
  2005. vpsrlq \$26,$D3,$M3
  2006. vpandq $MASK,$D3,$D3
  2007. vpaddq $M3,$D4,$D4 # d3 -> d4
  2008. vpsrlq \$26,$D0,$M0
  2009. vpandq $MASK,$D0,$D0
  2010. vpaddq $M0,$D1,$D1 # d0 -> d1
  2011. vpsrlq \$26,$D4,$M4
  2012. vpandq $MASK,$D4,$D4
  2013. vpsrlq \$26,$D1,$M1
  2014. vpandq $MASK,$D1,$D1
  2015. vpaddq $M1,$D2,$D2 # d1 -> d2
  2016. vpaddq $M4,$D0,$D0
  2017. vpsllq \$2,$M4,$M4
  2018. vpaddq $M4,$D0,$D0 # d4 -> d0
  2019. vpsrlq \$26,$D2,$M2
  2020. vpandq $MASK,$D2,$D2
  2021. vpaddq $M2,$D3,$D3 # d2 -> d3
  2022. vpsrlq \$26,$D0,$M0
  2023. vpandq $MASK,$D0,$D0
  2024. vpaddq $M0,$D1,$D1 # d0 -> d1
  2025. vpsrlq \$26,$D3,$M3
  2026. vpandq $MASK,$D3,$D3
  2027. vpaddq $M3,$D4,$D4 # d3 -> d4
  2028. ################################################################
  2029. # at this point we have 14243444 in $R0-$S4 and 05060708 in
  2030. # $D0-$D4, ...
  2031. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2032. vpunpckhqdq $T4,$T3,$T4
  2033. # ... since input 64-bit lanes are ordered as 73625140, we could
  2034. # "vperm" it to 76543210 (here and in each loop iteration), *or*
  2035. # we could just flow along, hence the goal for $R0-$S4 is
  2036. # 1858286838784888 ...
  2037. vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
  2038. mov \$0x7777,%eax
  2039. kmovw %eax,%k1
  2040. vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
  2041. vpermd $R1,$M0,$R1
  2042. vpermd $R2,$M0,$R2
  2043. vpermd $R3,$M0,$R3
  2044. vpermd $R4,$M0,$R4
  2045. vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
  2046. vpermd $D1,$M0,${R1}{%k1}
  2047. vpermd $D2,$M0,${R2}{%k1}
  2048. vpermd $D3,$M0,${R3}{%k1}
  2049. vpermd $D4,$M0,${R4}{%k1}
  2050. vpslld \$2,$R1,$S1 # *5
  2051. vpslld \$2,$R2,$S2
  2052. vpslld \$2,$R3,$S3
  2053. vpslld \$2,$R4,$S4
  2054. vpaddd $R1,$S1,$S1
  2055. vpaddd $R2,$S2,$S2
  2056. vpaddd $R3,$S3,$S3
  2057. vpaddd $R4,$S4,$S4
  2058. vpbroadcastq 32(%rcx),$PADBIT # .L129
  2059. vpsrlq \$52,$T0,$T2 # splat input
  2060. vpsllq \$12,$T4,$T3
  2061. vporq $T3,$T2,$T2
  2062. vpsrlq \$26,$T0,$T1
  2063. vpsrlq \$14,$T4,$T3
  2064. vpsrlq \$40,$T4,$T4 # 4
  2065. vpandq $MASK,$T2,$T2 # 2
  2066. vpandq $MASK,$T0,$T0 # 0
  2067. #vpandq $MASK,$T1,$T1 # 1
  2068. #vpandq $MASK,$T3,$T3 # 3
  2069. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2070. vpaddq $H2,$T2,$H2 # accumulate input
  2071. sub \$192,$len
  2072. jbe .Ltail_avx512
  2073. jmp .Loop_avx512
  2074. .align 32
  2075. .Loop_avx512:
  2076. ################################################################
  2077. # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
  2078. # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
  2079. # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
  2080. # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
  2081. # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
  2082. # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
  2083. # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
  2084. # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
  2085. # \________/\___________/
  2086. ################################################################
  2087. #vpaddq $H2,$T2,$H2 # accumulate input
  2088. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  2089. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  2090. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  2091. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  2092. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  2093. #
2094. # however, as h2 is "chronologically" the first one available, the
2095. # corresponding operations are pulled up, so the order becomes:
  2096. #
  2097. # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
  2098. # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
  2099. # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
  2100. # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
  2101. # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
  2102. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2103. vpaddq $H0,$T0,$H0
  2104. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2105. vpandq $MASK,$T1,$T1 # 1
  2106. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2107. vpandq $MASK,$T3,$T3 # 3
  2108. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2109. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2110. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2111. vpaddq $H1,$T1,$H1 # accumulate input
  2112. vpaddq $H3,$T3,$H3
  2113. vpaddq $H4,$T4,$H4
  2114. vmovdqu64 16*0($inp),$T3 # load input
  2115. vmovdqu64 16*4($inp),$T4
  2116. lea 16*8($inp),$inp
  2117. vpmuludq $H0,$R3,$M3
  2118. vpmuludq $H0,$R4,$M4
  2119. vpmuludq $H0,$R0,$M0
  2120. vpmuludq $H0,$R1,$M1
  2121. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2122. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2123. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2124. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2125. vpmuludq $H1,$R2,$M3
  2126. vpmuludq $H1,$R3,$M4
  2127. vpmuludq $H1,$S4,$M0
  2128. vpmuludq $H0,$R2,$M2
  2129. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2130. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2131. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2132. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2133. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2134. vpunpckhqdq $T4,$T3,$T4
  2135. vpmuludq $H3,$R0,$M3
  2136. vpmuludq $H3,$R1,$M4
  2137. vpmuludq $H1,$R0,$M1
  2138. vpmuludq $H1,$R1,$M2
  2139. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2140. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2141. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2142. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2143. vpmuludq $H4,$S4,$M3
  2144. vpmuludq $H4,$R0,$M4
  2145. vpmuludq $H3,$S2,$M0
  2146. vpmuludq $H3,$S3,$M1
  2147. vpaddq $M3,$D3,$D3 # d3 += h4*s4
  2148. vpmuludq $H3,$S4,$M2
  2149. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2150. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2151. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2152. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2153. vpmuludq $H4,$S1,$M0
  2154. vpmuludq $H4,$S2,$M1
  2155. vpmuludq $H4,$S3,$M2
  2156. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2157. vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2158. vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2159. ################################################################
  2160. # lazy reduction (interleaved with input splat)
  2161. vpsrlq \$52,$T0,$T2 # splat input
  2162. vpsllq \$12,$T4,$T3
  2163. vpsrlq \$26,$D3,$H3
  2164. vpandq $MASK,$D3,$D3
  2165. vpaddq $H3,$D4,$H4 # h3 -> h4
  2166. vporq $T3,$T2,$T2
  2167. vpsrlq \$26,$H0,$D0
  2168. vpandq $MASK,$H0,$H0
  2169. vpaddq $D0,$H1,$H1 # h0 -> h1
  2170. vpandq $MASK,$T2,$T2 # 2
  2171. vpsrlq \$26,$H4,$D4
  2172. vpandq $MASK,$H4,$H4
  2173. vpsrlq \$26,$H1,$D1
  2174. vpandq $MASK,$H1,$H1
  2175. vpaddq $D1,$H2,$H2 # h1 -> h2
  2176. vpaddq $D4,$H0,$H0
  2177. vpsllq \$2,$D4,$D4
  2178. vpaddq $D4,$H0,$H0 # h4 -> h0
  2179. vpaddq $T2,$H2,$H2 # modulo-scheduled
  2180. vpsrlq \$26,$T0,$T1
  2181. vpsrlq \$26,$H2,$D2
  2182. vpandq $MASK,$H2,$H2
  2183. vpaddq $D2,$D3,$H3 # h2 -> h3
  2184. vpsrlq \$14,$T4,$T3
  2185. vpsrlq \$26,$H0,$D0
  2186. vpandq $MASK,$H0,$H0
  2187. vpaddq $D0,$H1,$H1 # h0 -> h1
  2188. vpsrlq \$40,$T4,$T4 # 4
  2189. vpsrlq \$26,$H3,$D3
  2190. vpandq $MASK,$H3,$H3
  2191. vpaddq $D3,$H4,$H4 # h3 -> h4
  2192. vpandq $MASK,$T0,$T0 # 0
  2193. #vpandq $MASK,$T1,$T1 # 1
  2194. #vpandq $MASK,$T3,$T3 # 3
  2195. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2196. sub \$128,$len
  2197. ja .Loop_avx512
  2198. .Ltail_avx512:
  2199. ################################################################
2200. # while the above multiplications were by r^8 in all lanes, in the last
2201. # iteration we multiply the least significant lane by r^8 and the most
2202. # significant one by r, which is why the table gets shifted...
  2203. vpsrlq \$32,$R0,$R0 # 0105020603070408
  2204. vpsrlq \$32,$R1,$R1
  2205. vpsrlq \$32,$R2,$R2
  2206. vpsrlq \$32,$S3,$S3
  2207. vpsrlq \$32,$S4,$S4
  2208. vpsrlq \$32,$R3,$R3
  2209. vpsrlq \$32,$R4,$R4
  2210. vpsrlq \$32,$S1,$S1
  2211. vpsrlq \$32,$S2,$S2
  2212. ################################################################
2213. # load either the next or the last 64 bytes of input
  2214. lea ($inp,$len),$inp
  2215. #vpaddq $H2,$T2,$H2 # accumulate input
  2216. vpaddq $H0,$T0,$H0
  2217. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2218. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2219. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2220. vpandq $MASK,$T1,$T1 # 1
  2221. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2222. vpandq $MASK,$T3,$T3 # 3
  2223. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2224. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2225. vpaddq $H1,$T1,$H1 # accumulate input
  2226. vpaddq $H3,$T3,$H3
  2227. vpaddq $H4,$T4,$H4
  2228. vmovdqu 16*0($inp),%x#$T0
  2229. vpmuludq $H0,$R3,$M3
  2230. vpmuludq $H0,$R4,$M4
  2231. vpmuludq $H0,$R0,$M0
  2232. vpmuludq $H0,$R1,$M1
  2233. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2234. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2235. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2236. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2237. vmovdqu 16*1($inp),%x#$T1
  2238. vpmuludq $H1,$R2,$M3
  2239. vpmuludq $H1,$R3,$M4
  2240. vpmuludq $H1,$S4,$M0
  2241. vpmuludq $H0,$R2,$M2
  2242. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2243. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2244. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2245. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2246. vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
  2247. vpmuludq $H3,$R0,$M3
  2248. vpmuludq $H3,$R1,$M4
  2249. vpmuludq $H1,$R0,$M1
  2250. vpmuludq $H1,$R1,$M2
  2251. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2252. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2253. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2254. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2255. vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
  2256. vpmuludq $H4,$S4,$M3
  2257. vpmuludq $H4,$R0,$M4
  2258. vpmuludq $H3,$S2,$M0
  2259. vpmuludq $H3,$S3,$M1
  2260. vpmuludq $H3,$S4,$M2
  2261. vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
  2262. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2263. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2264. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2265. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2266. vpmuludq $H4,$S1,$M0
  2267. vpmuludq $H4,$S2,$M1
  2268. vpmuludq $H4,$S3,$M2
  2269. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2270. vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2271. vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2272. ################################################################
  2273. # horizontal addition
  2274. mov \$1,%eax
  2275. vpermq \$0xb1,$H3,$D3
  2276. vpermq \$0xb1,$D4,$H4
  2277. vpermq \$0xb1,$H0,$D0
  2278. vpermq \$0xb1,$H1,$D1
  2279. vpermq \$0xb1,$H2,$D2
  2280. vpaddq $D3,$H3,$H3
  2281. vpaddq $D4,$H4,$H4
  2282. vpaddq $D0,$H0,$H0
  2283. vpaddq $D1,$H1,$H1
  2284. vpaddq $D2,$H2,$H2
  2285. kmovw %eax,%k3
  2286. vpermq \$0x2,$H3,$D3
  2287. vpermq \$0x2,$H4,$D4
  2288. vpermq \$0x2,$H0,$D0
  2289. vpermq \$0x2,$H1,$D1
  2290. vpermq \$0x2,$H2,$D2
  2291. vpaddq $D3,$H3,$H3
  2292. vpaddq $D4,$H4,$H4
  2293. vpaddq $D0,$H0,$H0
  2294. vpaddq $D1,$H1,$H1
  2295. vpaddq $D2,$H2,$H2
  2296. vextracti64x4 \$0x1,$H3,%y#$D3
  2297. vextracti64x4 \$0x1,$H4,%y#$D4
  2298. vextracti64x4 \$0x1,$H0,%y#$D0
  2299. vextracti64x4 \$0x1,$H1,%y#$D1
  2300. vextracti64x4 \$0x1,$H2,%y#$D2
  2301. vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
  2302. vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
  2303. vpaddq $D0,$H0,${H0}{%k3}{z}
  2304. vpaddq $D1,$H1,${H1}{%k3}{z}
  2305. vpaddq $D2,$H2,${H2}{%k3}{z}
  2306. ___
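# The vpermq/vextracti64x4 sequence above is a log2 fold: lanes are summed
# pairwise within 128-bit halves, then across them, then across the two
# 256-bit halves, leaving the total in lane 0. Scalar equivalent of the
# fold, illustration only -- not used by the generator:
sub __sketch_hadd {
    my @v = @_;                         # eight per-lane accumulators
    for (my $n = 8; $n > 1; $n >>= 1) { # three swap-and-add steps
        $v[$_] += $v[$_ + $n/2] for 0 .. $n/2 - 1;
    }
    return $v[0];
}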
  2307. map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
  2308. map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
  2309. $code.=<<___;
  2310. ################################################################
  2311. # lazy reduction (interleaved with input splat)
  2312. vpsrlq \$26,$H3,$D3
  2313. vpand $MASK,$H3,$H3
  2314. vpsrldq \$6,$T0,$T2 # splat input
  2315. vpsrldq \$6,$T1,$T3
  2316. vpunpckhqdq $T1,$T0,$T4 # 4
  2317. vpaddq $D3,$H4,$H4 # h3 -> h4
  2318. vpsrlq \$26,$H0,$D0
  2319. vpand $MASK,$H0,$H0
  2320. vpunpcklqdq $T3,$T2,$T2 # 2:3
  2321. vpunpcklqdq $T1,$T0,$T0 # 0:1
  2322. vpaddq $D0,$H1,$H1 # h0 -> h1
  2323. vpsrlq \$26,$H4,$D4
  2324. vpand $MASK,$H4,$H4
  2325. vpsrlq \$26,$H1,$D1
  2326. vpand $MASK,$H1,$H1
  2327. vpsrlq \$30,$T2,$T3
  2328. vpsrlq \$4,$T2,$T2
  2329. vpaddq $D1,$H2,$H2 # h1 -> h2
  2330. vpaddq $D4,$H0,$H0
  2331. vpsllq \$2,$D4,$D4
  2332. vpsrlq \$26,$T0,$T1
  2333. vpsrlq \$40,$T4,$T4 # 4
  2334. vpaddq $D4,$H0,$H0 # h4 -> h0
  2335. vpsrlq \$26,$H2,$D2
  2336. vpand $MASK,$H2,$H2
  2337. vpand $MASK,$T2,$T2 # 2
  2338. vpand $MASK,$T0,$T0 # 0
  2339. vpaddq $D2,$H3,$H3 # h2 -> h3
  2340. vpsrlq \$26,$H0,$D0
  2341. vpand $MASK,$H0,$H0
  2342. vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
  2343. vpand $MASK,$T1,$T1 # 1
  2344. vpaddq $D0,$H1,$H1 # h0 -> h1
  2345. vpsrlq \$26,$H3,$D3
  2346. vpand $MASK,$H3,$H3
  2347. vpand $MASK,$T3,$T3 # 3
  2348. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  2349. vpaddq $D3,$H4,$H4 # h3 -> h4
  2350. lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
  2351. add \$64,$len
  2352. jnz .Ltail_avx2
  2353. vpsubq $T2,$H2,$H2 # undo input accumulation
  2354. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  2355. vmovd %x#$H1,`4*1-48-64`($ctx)
  2356. vmovd %x#$H2,`4*2-48-64`($ctx)
  2357. vmovd %x#$H3,`4*3-48-64`($ctx)
  2358. vmovd %x#$H4,`4*4-48-64`($ctx)
  2359. vzeroall
  2360. ___
  2361. $code.=<<___ if ($win64);
  2362. movdqa 0x50(%r11),%xmm6
  2363. movdqa 0x60(%r11),%xmm7
  2364. movdqa 0x70(%r11),%xmm8
  2365. movdqa 0x80(%r11),%xmm9
  2366. movdqa 0x90(%r11),%xmm10
  2367. movdqa 0xa0(%r11),%xmm11
  2368. movdqa 0xb0(%r11),%xmm12
  2369. movdqa 0xc0(%r11),%xmm13
  2370. movdqa 0xd0(%r11),%xmm14
  2371. movdqa 0xe0(%r11),%xmm15
  2372. lea 0xf8(%r11),%rsp
  2373. .Ldo_avx512_epilogue:
  2374. ___
  2375. $code.=<<___ if (!$win64);
  2376. lea 8(%r11),%rsp
  2377. .cfi_def_cfa %rsp,8
  2378. ___
  2379. $code.=<<___;
  2380. ret
  2381. .cfi_endproc
  2382. .size poly1305_blocks_avx512,.-poly1305_blocks_avx512
  2383. ___
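# The power schedule used above (r^1..r^4 for AVX2, extended to r^5..r^8
# by one extra vector multiply with r^4) follows from r^(i+4) = r^i * r^4.
# Big-integer model of the schedule, illustration only -- not used by the
# generator:
use Math::BigInt;

sub __sketch_key_powers {
    my ($r, $n) = @_;                   # key r as BigInt, power count
    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
    my @pow = (Math::BigInt->new(1));
    push @pow, $pow[-1]->copy->bmul($r)->bmod($p) for 1 .. $n;
    return @pow[1 .. $n];               # r^1 .. r^n
}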
  2384. if ($avx>3 && !$win64) {
  2385. ########################################################################
  2386. # VPMADD52 version using 2^44 radix.
  2387. #
2388. # One can argue that base 2^52 would be more natural. Well, even though
2389. # some operations would be more natural, one has to recognize a couple
2390. # of things. First, base 2^52 provides no advantage over base 2^44 in
2391. # the amount of multiply-and-accumulate operations. Secondly, it makes
2392. # it impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2393. # reference implementations], so more such operations would have to be
2394. # performed in the inner loop, which in turn makes the critical path
2395. # longer. In other words, even though base 2^44 reduction might look
2396. # less elegant, the overall critical path is actually shorter...
  2397. ########################################################################
2398. # The layout of the opaque area is as follows.
  2399. #
  2400. # unsigned __int64 h[3]; # current hash value base 2^44
  2401. # unsigned __int64 s[2]; # key value*20 base 2^44
  2402. # unsigned __int64 r[3]; # key value base 2^44
  2403. # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
  2404. # # r^n positions reflect
  2405. # # placement in register, not
  2406. # # memory, R[3] is R[1]*20
  2407. $code.=<<___;
  2408. .type poly1305_init_base2_44,\@function,3
  2409. .align 32
  2410. poly1305_init_base2_44:
  2411. .cfi_startproc
  2412. xor %rax,%rax
  2413. mov %rax,0($ctx) # initialize hash value
  2414. mov %rax,8($ctx)
  2415. mov %rax,16($ctx)
  2416. .Linit_base2_44:
  2417. lea poly1305_blocks_vpmadd52(%rip),%r10
  2418. lea poly1305_emit_base2_44(%rip),%r11
  2419. mov \$0x0ffffffc0fffffff,%rax
  2420. mov \$0x0ffffffc0ffffffc,%rcx
  2421. and 0($inp),%rax
  2422. mov \$0x00000fffffffffff,%r8
  2423. and 8($inp),%rcx
  2424. mov \$0x00000fffffffffff,%r9
  2425. and %rax,%r8
  2426. shrd \$44,%rcx,%rax
  2427. mov %r8,40($ctx) # r0
  2428. and %r9,%rax
  2429. shr \$24,%rcx
  2430. mov %rax,48($ctx) # r1
  2431. lea (%rax,%rax,4),%rax # *5
  2432. mov %rcx,56($ctx) # r2
  2433. shl \$2,%rax # magic <<2
  2434. lea (%rcx,%rcx,4),%rcx # *5
  2435. shl \$2,%rcx # magic <<2
  2436. mov %rax,24($ctx) # s1
  2437. mov %rcx,32($ctx) # s2
  2438. movq \$-1,64($ctx) # write impossible value
  2439. ___
  2440. $code.=<<___ if ($flavour !~ /elf32/);
  2441. mov %r10,0(%rdx)
  2442. mov %r11,8(%rdx)
  2443. ___
  2444. $code.=<<___ if ($flavour =~ /elf32/);
  2445. mov %r10d,0(%rdx)
  2446. mov %r11d,4(%rdx)
  2447. ___
  2448. $code.=<<___;
  2449. mov \$1,%eax
  2450. ret
  2451. .cfi_endproc
  2452. .size poly1305_init_base2_44,.-poly1305_init_base2_44
  2453. ___
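# In radix 2^44 the wrap factor is 20 rather than 5: a partial product at
# weight 2^132 reduces as 2^132 = 4*2^130 = 4*5 = 20 mod 2^130-5, which is
# why s1/s2 above store r1*20 and r2*20. Scalar model of the clamp and
# split performed by poly1305_init_base2_44 above; illustration only, not
# used by the generator (native 64-bit Perl integers suffice here):
sub __sketch_init_base2_44 {
    my ($lo, $hi) = @_;                 # two 64-bit words of the key
    $lo &= 0x0ffffffc0fffffff;          # clamp, as in the asm above
    $hi &= 0x0ffffffc0ffffffc;
    my $m44 = (1 << 44) - 1;
    my $r0 =   $lo & $m44;                                    # bits  0..43
    my $r1 = (($lo >> 44) | (($hi & 0xffffff) << 20)) & $m44; # bits 44..87
    my $r2 =   $hi >> 24;                                     # bits 88..123
    return ($r0, $r1, $r2, 20*$r1, 20*$r2);                   # r[], s1, s2
}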
  2454. {
  2455. my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
  2456. my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
  2457. my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
  2458. $code.=<<___;
  2459. .type poly1305_blocks_vpmadd52,\@function,4
  2460. .align 32
  2461. poly1305_blocks_vpmadd52:
  2462. .cfi_startproc
  2463. endbranch
  2464. shr \$4,$len
  2465. jz .Lno_data_vpmadd52 # too short
  2466. shl \$40,$padbit
  2467. mov 64($ctx),%r8 # peek on power of the key
2468. # if powers of the key have not been calculated yet, process up to 3
2469. # blocks with this single-block subroutine, otherwise ensure that
2470. # length is divisible by 2 blocks and pass the rest down to the next
2471. # subroutine...
  2472. mov \$3,%rax
  2473. mov \$1,%r10
2474. cmp \$4,$len # is input long enough?
  2475. cmovae %r10,%rax
  2476. test %r8,%r8 # is power value impossible?
  2477. cmovns %r10,%rax
  2478. and $len,%rax # is input of favourable length?
  2479. jz .Lblocks_vpmadd52_4x
  2480. sub %rax,$len
  2481. mov \$7,%r10d
  2482. mov \$1,%r11d
  2483. kmovw %r10d,%k7
  2484. lea .L2_44_inp_permd(%rip),%r10
  2485. kmovw %r11d,%k1
  2486. vmovq $padbit,%x#$PAD
  2487. vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
  2488. vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
  2489. vpermq \$0xcf,$PAD,$PAD
  2490. vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
  2491. vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
  2492. vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
  2493. vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
  2494. vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
  2495. vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
  2496. vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
  2497. jmp .Loop_vpmadd52
  2498. .align 32
  2499. .Loop_vpmadd52:
  2500. vmovdqu32 0($inp),%x#$T0 # load input as ----3210
  2501. lea 16($inp),$inp
  2502. vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
  2503. vpsrlvq $inp_shift,$T0,$T0
  2504. vpandq $reduc_mask,$T0,$T0
  2505. vporq $PAD,$T0,$T0
  2506. vpaddq $T0,$Dlo,$Dlo # accumulate input
  2507. vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
  2508. vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
  2509. vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
  2510. vpxord $Dlo,$Dlo,$Dlo
  2511. vpxord $Dhi,$Dhi,$Dhi
  2512. vpmadd52luq $r2r1r0,$H0,$Dlo
  2513. vpmadd52huq $r2r1r0,$H0,$Dhi
  2514. vpmadd52luq $r1r0s2,$H1,$Dlo
  2515. vpmadd52huq $r1r0s2,$H1,$Dhi
  2516. vpmadd52luq $r0s2s1,$H2,$Dlo
  2517. vpmadd52huq $r0s2s1,$H2,$Dhi
  2518. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
  2519. vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
  2520. vpandq $reduc_mask,$Dlo,$Dlo
  2521. vpaddq $T0,$Dhi,$Dhi
  2522. vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
  2523. vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
  2524. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
  2525. vpandq $reduc_mask,$Dlo,$Dlo
  2526. vpermq \$0b10010011,$T0,$T0
  2527. vpaddq $T0,$Dlo,$Dlo
  2528. vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
  2529. vpaddq $T0,$Dlo,$Dlo
  2530. vpsllq \$2,$T0,$T0
  2531. vpaddq $T0,$Dlo,$Dlo
  2532. dec %rax # len-=16
  2533. jnz .Loop_vpmadd52
  2534. vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
  2535. test $len,$len
  2536. jnz .Lblocks_vpmadd52_4x
  2537. .Lno_data_vpmadd52:
  2538. ret
  2539. .cfi_endproc
  2540. .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
  2541. ___
  2542. }
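# Partial reduction in the 2^44 radix: the high halves from VPMADD52HUQ
# carry weight 2^52, i.e. 8 bits above the 44-bit limb boundary (10 bits
# above the 42-bit top limb), hence the <<8 / <<10 shifts, and the carry
# out of the top limb wraps around as c + (c<<2) = c*5. Scalar sketch,
# illustration only -- not used by the generator; it assumes the operand
# bounds the real code maintains, so native 64-bit integers are exact:
sub __sketch_reduce_44 {
    my ($d0lo,$d0hi,$d1lo,$d1hi,$d2lo,$d2hi) = @_;
    my ($m44, $m42) = ((1 << 44) - 1, (1 << 42) - 1);
    $d1lo += ($d0hi << 8)  + ($d0lo >> 44);
    my $h0  =  $d0lo & $m44;
    $d2lo += ($d1hi << 8)  + ($d1lo >> 44);
    my $h1  =  $d1lo & $m44;
    my $c   = ($d2hi << 10) + ($d2lo >> 42);
    my $h2  =  $d2lo & $m42;
    $h0    += $c + ($c << 2);           # carry*5 wraps around
    $h1    += $h0 >> 44;                # the "additional step" above
    $h0    &= $m44;
    return ($h0, $h1, $h2);
}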
  2543. {
  2544. ########################################################################
2545. # As implied by its name, the 4x subroutine processes 4 blocks in parallel
2546. # (though it also handles lengths of 4*n+2 blocks). It uses key powers up
2547. # to the 4th, held in 256-bit %ymm registers.
  2548. my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
  2549. my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
  2550. my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
  2551. $code.=<<___;
  2552. .type poly1305_blocks_vpmadd52_4x,\@function,4
  2553. .align 32
  2554. poly1305_blocks_vpmadd52_4x:
  2555. .cfi_startproc
  2556. shr \$4,$len
  2557. jz .Lno_data_vpmadd52_4x # too short
  2558. shl \$40,$padbit
  2559. mov 64($ctx),%r8 # peek on power of the key
  2560. .Lblocks_vpmadd52_4x:
  2561. vpbroadcastq $padbit,$PAD
  2562. vmovdqa64 .Lx_mask44(%rip),$mask44
  2563. mov \$5,%eax
  2564. vmovdqa64 .Lx_mask42(%rip),$mask42
  2565. kmovw %eax,%k1 # used in 2x path
  2566. test %r8,%r8 # is power value impossible?
  2567. js .Linit_vpmadd52 # if it is, then init R[4]
  2568. vmovq 0($ctx),%x#$H0 # load current hash value
  2569. vmovq 8($ctx),%x#$H1
  2570. vmovq 16($ctx),%x#$H2
  2571. test \$3,$len # is length 4*n+2?
  2572. jnz .Lblocks_vpmadd52_2x_do
  2573. .Lblocks_vpmadd52_4x_do:
  2574. vpbroadcastq 64($ctx),$R0 # load 4th power of the key
  2575. vpbroadcastq 96($ctx),$R1
  2576. vpbroadcastq 128($ctx),$R2
  2577. vpbroadcastq 160($ctx),$S1
  2578. .Lblocks_vpmadd52_4x_key_loaded:
  2579. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2580. vpaddq $R2,$S2,$S2
  2581. vpsllq \$2,$S2,$S2
  2582. test \$7,$len # is len 8*n?
  2583. jz .Lblocks_vpmadd52_8x
  2584. vmovdqu64 16*0($inp),$T2 # load data
  2585. vmovdqu64 16*2($inp),$T3
  2586. lea 16*4($inp),$inp
  2587. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2588. vpunpckhqdq $T3,$T2,$T3
  2589. # at this point 64-bit lanes are ordered as 3-1-2-0
  2590. vpsrlq \$24,$T3,$T2 # splat the data
  2591. vporq $PAD,$T2,$T2
  2592. vpaddq $T2,$H2,$H2 # accumulate input
  2593. vpandq $mask44,$T1,$T0
  2594. vpsrlq \$44,$T1,$T1
  2595. vpsllq \$20,$T3,$T3
  2596. vporq $T3,$T1,$T1
  2597. vpandq $mask44,$T1,$T1
  2598. sub \$4,$len
  2599. jz .Ltail_vpmadd52_4x
  2600. jmp .Loop_vpmadd52_4x
  2601. ud2
  2602. .align 32
  2603. .Linit_vpmadd52:
  2604. vmovq 24($ctx),%x#$S1 # load key
  2605. vmovq 56($ctx),%x#$H2
  2606. vmovq 32($ctx),%x#$S2
  2607. vmovq 40($ctx),%x#$R0
  2608. vmovq 48($ctx),%x#$R1
  2609. vmovdqa $R0,$H0
  2610. vmovdqa $R1,$H1
  2611. vmovdqa $H2,$R2
  2612. mov \$2,%eax
  2613. .Lmul_init_vpmadd52:
  2614. vpxorq $D0lo,$D0lo,$D0lo
  2615. vpmadd52luq $H2,$S1,$D0lo
  2616. vpxorq $D0hi,$D0hi,$D0hi
  2617. vpmadd52huq $H2,$S1,$D0hi
  2618. vpxorq $D1lo,$D1lo,$D1lo
  2619. vpmadd52luq $H2,$S2,$D1lo
  2620. vpxorq $D1hi,$D1hi,$D1hi
  2621. vpmadd52huq $H2,$S2,$D1hi
  2622. vpxorq $D2lo,$D2lo,$D2lo
  2623. vpmadd52luq $H2,$R0,$D2lo
  2624. vpxorq $D2hi,$D2hi,$D2hi
  2625. vpmadd52huq $H2,$R0,$D2hi
  2626. vpmadd52luq $H0,$R0,$D0lo
  2627. vpmadd52huq $H0,$R0,$D0hi
  2628. vpmadd52luq $H0,$R1,$D1lo
  2629. vpmadd52huq $H0,$R1,$D1hi
  2630. vpmadd52luq $H0,$R2,$D2lo
  2631. vpmadd52huq $H0,$R2,$D2hi
  2632. vpmadd52luq $H1,$S2,$D0lo
  2633. vpmadd52huq $H1,$S2,$D0hi
  2634. vpmadd52luq $H1,$R0,$D1lo
  2635. vpmadd52huq $H1,$R0,$D1hi
  2636. vpmadd52luq $H1,$R1,$D2lo
  2637. vpmadd52huq $H1,$R1,$D2hi
  2638. ################################################################
  2639. # partial reduction
  2640. vpsrlq \$44,$D0lo,$tmp
  2641. vpsllq \$8,$D0hi,$D0hi
  2642. vpandq $mask44,$D0lo,$H0
  2643. vpaddq $tmp,$D0hi,$D0hi
  2644. vpaddq $D0hi,$D1lo,$D1lo
  2645. vpsrlq \$44,$D1lo,$tmp
  2646. vpsllq \$8,$D1hi,$D1hi
  2647. vpandq $mask44,$D1lo,$H1
  2648. vpaddq $tmp,$D1hi,$D1hi
  2649. vpaddq $D1hi,$D2lo,$D2lo
  2650. vpsrlq \$42,$D2lo,$tmp
  2651. vpsllq \$10,$D2hi,$D2hi
  2652. vpandq $mask42,$D2lo,$H2
  2653. vpaddq $tmp,$D2hi,$D2hi
  2654. vpaddq $D2hi,$H0,$H0
  2655. vpsllq \$2,$D2hi,$D2hi
  2656. vpaddq $D2hi,$H0,$H0
  2657. vpsrlq \$44,$H0,$tmp # additional step
  2658. vpandq $mask44,$H0,$H0
  2659. vpaddq $tmp,$H1,$H1
  2660. dec %eax
  2661. jz .Ldone_init_vpmadd52
  2662. vpunpcklqdq $R1,$H1,$R1 # 1,2
  2663. vpbroadcastq %x#$H1,%x#$H1 # 2,2
  2664. vpunpcklqdq $R2,$H2,$R2
  2665. vpbroadcastq %x#$H2,%x#$H2
  2666. vpunpcklqdq $R0,$H0,$R0
  2667. vpbroadcastq %x#$H0,%x#$H0
  2668. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2669. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2670. vpaddq $R1,$S1,$S1
  2671. vpaddq $R2,$S2,$S2
  2672. vpsllq \$2,$S1,$S1
  2673. vpsllq \$2,$S2,$S2
  2674. jmp .Lmul_init_vpmadd52
  2675. ud2
  2676. .align 32
  2677. .Ldone_init_vpmadd52:
  2678. vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
  2679. vinserti128 \$1,%x#$R2,$H2,$R2
  2680. vinserti128 \$1,%x#$R0,$H0,$R0
  2681. vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
  2682. vpermq \$0b11011000,$R2,$R2
  2683. vpermq \$0b11011000,$R0,$R0
  2684. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2685. vpaddq $R1,$S1,$S1
  2686. vpsllq \$2,$S1,$S1
  2687. vmovq 0($ctx),%x#$H0 # load current hash value
  2688. vmovq 8($ctx),%x#$H1
  2689. vmovq 16($ctx),%x#$H2
  2690. test \$3,$len # is length 4*n+2?
  2691. jnz .Ldone_init_vpmadd52_2x
  2692. vmovdqu64 $R0,64($ctx) # save key powers
  2693. vpbroadcastq %x#$R0,$R0 # broadcast 4th power
  2694. vmovdqu64 $R1,96($ctx)
  2695. vpbroadcastq %x#$R1,$R1
  2696. vmovdqu64 $R2,128($ctx)
  2697. vpbroadcastq %x#$R2,$R2
  2698. vmovdqu64 $S1,160($ctx)
  2699. vpbroadcastq %x#$S1,$S1
  2700. jmp .Lblocks_vpmadd52_4x_key_loaded
  2701. ud2
  2702. .align 32
  2703. .Ldone_init_vpmadd52_2x:
  2704. vmovdqu64 $R0,64($ctx) # save key powers
  2705. vpsrldq \$8,$R0,$R0 # 0-1-0-2
  2706. vmovdqu64 $R1,96($ctx)
  2707. vpsrldq \$8,$R1,$R1
  2708. vmovdqu64 $R2,128($ctx)
  2709. vpsrldq \$8,$R2,$R2
  2710. vmovdqu64 $S1,160($ctx)
  2711. vpsrldq \$8,$S1,$S1
  2712. jmp .Lblocks_vpmadd52_2x_key_loaded
  2713. ud2
  2714. .align 32
  2715. .Lblocks_vpmadd52_2x_do:
  2716. vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
  2717. vmovdqu64 160+8($ctx),${S1}{%k1}{z}
  2718. vmovdqu64 64+8($ctx),${R0}{%k1}{z}
  2719. vmovdqu64 96+8($ctx),${R1}{%k1}{z}
  2720. .Lblocks_vpmadd52_2x_key_loaded:
  2721. vmovdqu64 16*0($inp),$T2 # load data
  2722. vpxorq $T3,$T3,$T3
  2723. lea 16*2($inp),$inp
  2724. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2725. vpunpckhqdq $T3,$T2,$T3
  2726. # at this point 64-bit lanes are ordered as x-1-x-0
  2727. vpsrlq \$24,$T3,$T2 # splat the data
  2728. vporq $PAD,$T2,$T2
  2729. vpaddq $T2,$H2,$H2 # accumulate input
  2730. vpandq $mask44,$T1,$T0
  2731. vpsrlq \$44,$T1,$T1
  2732. vpsllq \$20,$T3,$T3
  2733. vporq $T3,$T1,$T1
  2734. vpandq $mask44,$T1,$T1
  2735. jmp .Ltail_vpmadd52_2x
  2736. ud2
  2737. .align 32
  2738. .Loop_vpmadd52_4x:
  2739. #vpaddq $T2,$H2,$H2 # accumulate input
  2740. vpaddq $T0,$H0,$H0
  2741. vpaddq $T1,$H1,$H1
  2742. vpxorq $D0lo,$D0lo,$D0lo
  2743. vpmadd52luq $H2,$S1,$D0lo
  2744. vpxorq $D0hi,$D0hi,$D0hi
  2745. vpmadd52huq $H2,$S1,$D0hi
  2746. vpxorq $D1lo,$D1lo,$D1lo
  2747. vpmadd52luq $H2,$S2,$D1lo
  2748. vpxorq $D1hi,$D1hi,$D1hi
  2749. vpmadd52huq $H2,$S2,$D1hi
  2750. vpxorq $D2lo,$D2lo,$D2lo
  2751. vpmadd52luq $H2,$R0,$D2lo
  2752. vpxorq $D2hi,$D2hi,$D2hi
  2753. vpmadd52huq $H2,$R0,$D2hi
  2754. vmovdqu64 16*0($inp),$T2 # load data
  2755. vmovdqu64 16*2($inp),$T3
  2756. lea 16*4($inp),$inp
  2757. vpmadd52luq $H0,$R0,$D0lo
  2758. vpmadd52huq $H0,$R0,$D0hi
  2759. vpmadd52luq $H0,$R1,$D1lo
  2760. vpmadd52huq $H0,$R1,$D1hi
  2761. vpmadd52luq $H0,$R2,$D2lo
  2762. vpmadd52huq $H0,$R2,$D2hi
  2763. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2764. vpunpckhqdq $T3,$T2,$T3
  2765. vpmadd52luq $H1,$S2,$D0lo
  2766. vpmadd52huq $H1,$S2,$D0hi
  2767. vpmadd52luq $H1,$R0,$D1lo
  2768. vpmadd52huq $H1,$R0,$D1hi
  2769. vpmadd52luq $H1,$R1,$D2lo
  2770. vpmadd52huq $H1,$R1,$D2hi
  2771. ################################################################
  2772. # partial reduction (interleaved with data splat)
  2773. vpsrlq \$44,$D0lo,$tmp
  2774. vpsllq \$8,$D0hi,$D0hi
  2775. vpandq $mask44,$D0lo,$H0
  2776. vpaddq $tmp,$D0hi,$D0hi
  2777. vpsrlq \$24,$T3,$T2
  2778. vporq $PAD,$T2,$T2
  2779. vpaddq $D0hi,$D1lo,$D1lo
  2780. vpsrlq \$44,$D1lo,$tmp
  2781. vpsllq \$8,$D1hi,$D1hi
  2782. vpandq $mask44,$D1lo,$H1
  2783. vpaddq $tmp,$D1hi,$D1hi
  2784. vpandq $mask44,$T1,$T0
  2785. vpsrlq \$44,$T1,$T1
  2786. vpsllq \$20,$T3,$T3
  2787. vpaddq $D1hi,$D2lo,$D2lo
  2788. vpsrlq \$42,$D2lo,$tmp
  2789. vpsllq \$10,$D2hi,$D2hi
  2790. vpandq $mask42,$D2lo,$H2
  2791. vpaddq $tmp,$D2hi,$D2hi
  2792. vpaddq $T2,$H2,$H2 # accumulate input
  2793. vpaddq $D2hi,$H0,$H0
  2794. vpsllq \$2,$D2hi,$D2hi
  2795. vpaddq $D2hi,$H0,$H0
  2796. vporq $T3,$T1,$T1
  2797. vpandq $mask44,$T1,$T1
  2798. vpsrlq \$44,$H0,$tmp # additional step
  2799. vpandq $mask44,$H0,$H0
  2800. vpaddq $tmp,$H1,$H1
  2801. sub \$4,$len # len-=64
  2802. jnz .Loop_vpmadd52_4x
  2803. .Ltail_vpmadd52_4x:
  2804. vmovdqu64 128($ctx),$R2 # load all key powers
  2805. vmovdqu64 160($ctx),$S1
  2806. vmovdqu64 64($ctx),$R0
  2807. vmovdqu64 96($ctx),$R1
  2808. .Ltail_vpmadd52_2x:
  2809. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2810. vpaddq $R2,$S2,$S2
  2811. vpsllq \$2,$S2,$S2
  2812. #vpaddq $T2,$H2,$H2 # accumulate input
  2813. vpaddq $T0,$H0,$H0
  2814. vpaddq $T1,$H1,$H1
  2815. vpxorq $D0lo,$D0lo,$D0lo
  2816. vpmadd52luq $H2,$S1,$D0lo
  2817. vpxorq $D0hi,$D0hi,$D0hi
  2818. vpmadd52huq $H2,$S1,$D0hi
  2819. vpxorq $D1lo,$D1lo,$D1lo
  2820. vpmadd52luq $H2,$S2,$D1lo
  2821. vpxorq $D1hi,$D1hi,$D1hi
  2822. vpmadd52huq $H2,$S2,$D1hi
  2823. vpxorq $D2lo,$D2lo,$D2lo
  2824. vpmadd52luq $H2,$R0,$D2lo
  2825. vpxorq $D2hi,$D2hi,$D2hi
  2826. vpmadd52huq $H2,$R0,$D2hi
  2827. vpmadd52luq $H0,$R0,$D0lo
  2828. vpmadd52huq $H0,$R0,$D0hi
  2829. vpmadd52luq $H0,$R1,$D1lo
  2830. vpmadd52huq $H0,$R1,$D1hi
  2831. vpmadd52luq $H0,$R2,$D2lo
  2832. vpmadd52huq $H0,$R2,$D2hi
  2833. vpmadd52luq $H1,$S2,$D0lo
  2834. vpmadd52huq $H1,$S2,$D0hi
  2835. vpmadd52luq $H1,$R0,$D1lo
  2836. vpmadd52huq $H1,$R0,$D1hi
  2837. vpmadd52luq $H1,$R1,$D2lo
  2838. vpmadd52huq $H1,$R1,$D2hi
  2839. ################################################################
  2840. # horizontal addition
  2841. mov \$1,%eax
  2842. kmovw %eax,%k1
  2843. vpsrldq \$8,$D0lo,$T0
  2844. vpsrldq \$8,$D0hi,$H0
  2845. vpsrldq \$8,$D1lo,$T1
  2846. vpsrldq \$8,$D1hi,$H1
  2847. vpaddq $T0,$D0lo,$D0lo
  2848. vpaddq $H0,$D0hi,$D0hi
  2849. vpsrldq \$8,$D2lo,$T2
  2850. vpsrldq \$8,$D2hi,$H2
  2851. vpaddq $T1,$D1lo,$D1lo
  2852. vpaddq $H1,$D1hi,$D1hi
  2853. vpermq \$0x2,$D0lo,$T0
  2854. vpermq \$0x2,$D0hi,$H0
  2855. vpaddq $T2,$D2lo,$D2lo
  2856. vpaddq $H2,$D2hi,$D2hi
  2857. vpermq \$0x2,$D1lo,$T1
  2858. vpermq \$0x2,$D1hi,$H1
  2859. vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
  2860. vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
  2861. vpermq \$0x2,$D2lo,$T2
  2862. vpermq \$0x2,$D2hi,$H2
  2863. vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
  2864. vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
  2865. vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
  2866. vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
  2867. ################################################################
  2868. # partial reduction
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$H0
vpaddq $tmp,$D0hi,$D0hi
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$H1
vpaddq $tmp,$D1hi,$D1hi
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$H2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsrlq \$44,$H0,$tmp # additional step
vpandq $mask44,$H0,$H0
vpaddq $tmp,$H1,$H1
# at this point $len is
# either 4*n+2 or 0...
sub \$2,$len # len-=32
ja .Lblocks_vpmadd52_4x_do
vmovq %x#$H0,0($ctx)
vmovq %x#$H1,8($ctx)
vmovq %x#$H2,16($ctx)
vzeroall
.Lno_data_vpmadd52_4x:
ret
.cfi_endproc
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
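# A minimal scalar reference model (never called by the generator) of the
# base 2^44 partial reduction that the vpmadd52 code above performs per
# lane. Math::BigInt stands in for the wide accumulators; the sub name
# and its calling convention are illustrative only, not part of this
# file's interface.
sub poly1305_reduce_base2_44_ref {
	require Math::BigInt;
	my ($d0,$d1,$d2) = map { Math::BigInt->new("$_") } @_; # unreduced limbs
	my $mask44 = Math::BigInt->new(2)->bpow(44)->bsub(1);
	my $mask42 = Math::BigInt->new(2)->bpow(42)->bsub(1);

	$d1->badd($d0->copy()->brsft(44)); my $h0 = $d0->band($mask44);
	$d2->badd($d1->copy()->brsft(44)); my $h1 = $d1->band($mask44);
	my $c = $d2->copy()->brsft(42);    my $h2 = $d2->band($mask42);

	# 2^130 == 5 (mod 2^130-5): fold the top carry back in as c + 4*c,
	# mirroring the vpaddq/vpsllq(2)/vpaddq triplet above
	$h0->badd($c)->badd($c->copy()->blsft(2));
	$h1->badd($h0->copy()->brsft(44)); # the "additional step"
	$h0->band($mask44);

	return ($h0,$h1,$h2);
}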
}
{
########################################################################
# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
# This is an intermediate version: it is used only in cases when the input
# length is either 8*n, 8*n+1 or 8*n+2...
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
$code.=<<___;
.type poly1305_blocks_vpmadd52_8x,\@function,4
.align 32
poly1305_blocks_vpmadd52_8x:
.cfi_startproc
shr \$4,$len
jz .Lno_data_vpmadd52_8x # too short
shl \$40,$padbit
mov 64($ctx),%r8 # peek on power of the key
vmovdqa64 .Lx_mask44(%rip),$mask44
vmovdqa64 .Lx_mask42(%rip),$mask42
test %r8,%r8 # is power value impossible?
js .Linit_vpmadd52 # if it is, then init R[4]
vmovq 0($ctx),%x#$H0 # load current hash value
vmovq 8($ctx),%x#$H1
vmovq 16($ctx),%x#$H2
.Lblocks_vpmadd52_8x:
################################################################
# first we calculate more key powers
vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
vmovdqu64 160($ctx),$S1
vmovdqu64 64($ctx),$R0
vmovdqu64 96($ctx),$R1
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
vpaddq $R2,$S2,$S2
vpsllq \$2,$S2,$S2
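# S2 = 20*R2: the factor 5 comes from 2^130 == 5 (mod 2^130-5), and the
# factor 4 from the top limb being only 42 of 44 bits wide, so products
# that wrap around re-enter the limb grid shifted left by 2 bits.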
vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
vpbroadcastq %x#$R0,$RR0
vpbroadcastq %x#$R1,$RR1
vpxorq $D0lo,$D0lo,$D0lo
vpmadd52luq $RR2,$S1,$D0lo
vpxorq $D0hi,$D0hi,$D0hi
vpmadd52huq $RR2,$S1,$D0hi
vpxorq $D1lo,$D1lo,$D1lo
vpmadd52luq $RR2,$S2,$D1lo
vpxorq $D1hi,$D1hi,$D1hi
vpmadd52huq $RR2,$S2,$D1hi
vpxorq $D2lo,$D2lo,$D2lo
vpmadd52luq $RR2,$R0,$D2lo
vpxorq $D2hi,$D2hi,$D2hi
vpmadd52huq $RR2,$R0,$D2hi
vpmadd52luq $RR0,$R0,$D0lo
vpmadd52huq $RR0,$R0,$D0hi
vpmadd52luq $RR0,$R1,$D1lo
vpmadd52huq $RR0,$R1,$D1hi
vpmadd52luq $RR0,$R2,$D2lo
vpmadd52huq $RR0,$R2,$D2hi
vpmadd52luq $RR1,$S2,$D0lo
vpmadd52huq $RR1,$S2,$D0hi
vpmadd52luq $RR1,$R0,$D1lo
vpmadd52huq $RR1,$R0,$D1hi
vpmadd52luq $RR1,$R1,$D2lo
vpmadd52huq $RR1,$R1,$D2hi
################################################################
# partial reduction
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$RR0
vpaddq $tmp,$D0hi,$D0hi
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$RR1
vpaddq $tmp,$D1hi,$D1hi
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$RR2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $D2hi,$RR0,$RR0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$RR0,$RR0
vpsrlq \$44,$RR0,$tmp # additional step
vpandq $mask44,$RR0,$RR0
vpaddq $tmp,$RR1,$RR1
################################################################
# At this point Rx holds 1324 powers, RRx - 5768, and the goal
# is 15263748, which reflects how data is loaded...
vpunpcklqdq $R2,$RR2,$T2 # 3748
vpunpckhqdq $R2,$RR2,$R2 # 1526
vpunpcklqdq $R0,$RR0,$T0
vpunpckhqdq $R0,$RR0,$R0
vpunpcklqdq $R1,$RR1,$T1
vpunpckhqdq $R1,$RR1,$R1
___
######## switch to %zmm
map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
$code.=<<___;
vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
vshufi64x2 \$0x44,$R0,$T0,$RR0
vshufi64x2 \$0x44,$R1,$T1,$RR1
vmovdqu64 16*0($inp),$T2 # load data
vmovdqu64 16*4($inp),$T3
lea 16*8($inp),$inp
vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
vpaddq $RR2,$SS2,$SS2
vpaddq $RR1,$SS1,$SS1
vpsllq \$2,$SS2,$SS2
vpsllq \$2,$SS1,$SS1
vpbroadcastq $padbit,$PAD
vpbroadcastq %x#$mask44,$mask44
vpbroadcastq %x#$mask42,$mask42
vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
vpbroadcastq %x#$SS2,$S2
vpbroadcastq %x#$RR0,$R0
vpbroadcastq %x#$RR1,$R1
vpbroadcastq %x#$RR2,$R2
vpunpcklqdq $T3,$T2,$T1 # transpose data
vpunpckhqdq $T3,$T2,$T3
# at this point 64-bit lanes are ordered as 73625140
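# Split each 128-bit block into 44/44/42-bit limbs: T0 gets bits 0-43,
# T1 bits 44-87, T2 bits 88-129, with padbit injected at bit 40 of the
# top limb (i.e. bit 128 of the block).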
vpsrlq \$24,$T3,$T2 # splat the data
vporq $PAD,$T2,$T2
vpaddq $T2,$H2,$H2 # accumulate input
vpandq $mask44,$T1,$T0
vpsrlq \$44,$T1,$T1
vpsllq \$20,$T3,$T3
vporq $T3,$T1,$T1
vpandq $mask44,$T1,$T1
sub \$8,$len
jz .Ltail_vpmadd52_8x
jmp .Loop_vpmadd52_8x
.align 32
.Loop_vpmadd52_8x:
#vpaddq $T2,$H2,$H2 # accumulate input
vpaddq $T0,$H0,$H0
vpaddq $T1,$H1,$H1
vpxorq $D0lo,$D0lo,$D0lo
vpmadd52luq $H2,$S1,$D0lo
vpxorq $D0hi,$D0hi,$D0hi
vpmadd52huq $H2,$S1,$D0hi
vpxorq $D1lo,$D1lo,$D1lo
vpmadd52luq $H2,$S2,$D1lo
vpxorq $D1hi,$D1hi,$D1hi
vpmadd52huq $H2,$S2,$D1hi
vpxorq $D2lo,$D2lo,$D2lo
vpmadd52luq $H2,$R0,$D2lo
vpxorq $D2hi,$D2hi,$D2hi
vpmadd52huq $H2,$R0,$D2hi
vmovdqu64 16*0($inp),$T2 # load data
vmovdqu64 16*4($inp),$T3
lea 16*8($inp),$inp
vpmadd52luq $H0,$R0,$D0lo
vpmadd52huq $H0,$R0,$D0hi
vpmadd52luq $H0,$R1,$D1lo
vpmadd52huq $H0,$R1,$D1hi
vpmadd52luq $H0,$R2,$D2lo
vpmadd52huq $H0,$R2,$D2hi
vpunpcklqdq $T3,$T2,$T1 # transpose data
vpunpckhqdq $T3,$T2,$T3
vpmadd52luq $H1,$S2,$D0lo
vpmadd52huq $H1,$S2,$D0hi
vpmadd52luq $H1,$R0,$D1lo
vpmadd52huq $H1,$R0,$D1hi
vpmadd52luq $H1,$R1,$D2lo
vpmadd52huq $H1,$R1,$D2hi
################################################################
# partial reduction (interleaved with data splat)
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$H0
vpaddq $tmp,$D0hi,$D0hi
vpsrlq \$24,$T3,$T2
vporq $PAD,$T2,$T2
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$H1
vpaddq $tmp,$D1hi,$D1hi
vpandq $mask44,$T1,$T0
vpsrlq \$44,$T1,$T1
vpsllq \$20,$T3,$T3
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$H2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $T2,$H2,$H2 # accumulate input
vpaddq $D2hi,$H0,$H0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vporq $T3,$T1,$T1
vpandq $mask44,$T1,$T1
vpsrlq \$44,$H0,$tmp # additional step
vpandq $mask44,$H0,$H0
vpaddq $tmp,$H1,$H1
sub \$8,$len # len-=128
jnz .Loop_vpmadd52_8x
.Ltail_vpmadd52_8x:
#vpaddq $T2,$H2,$H2 # accumulate input
vpaddq $T0,$H0,$H0
vpaddq $T1,$H1,$H1
vpxorq $D0lo,$D0lo,$D0lo
vpmadd52luq $H2,$SS1,$D0lo
vpxorq $D0hi,$D0hi,$D0hi
vpmadd52huq $H2,$SS1,$D0hi
vpxorq $D1lo,$D1lo,$D1lo
vpmadd52luq $H2,$SS2,$D1lo
vpxorq $D1hi,$D1hi,$D1hi
vpmadd52huq $H2,$SS2,$D1hi
vpxorq $D2lo,$D2lo,$D2lo
vpmadd52luq $H2,$RR0,$D2lo
vpxorq $D2hi,$D2hi,$D2hi
vpmadd52huq $H2,$RR0,$D2hi
vpmadd52luq $H0,$RR0,$D0lo
vpmadd52huq $H0,$RR0,$D0hi
vpmadd52luq $H0,$RR1,$D1lo
vpmadd52huq $H0,$RR1,$D1hi
vpmadd52luq $H0,$RR2,$D2lo
vpmadd52huq $H0,$RR2,$D2hi
vpmadd52luq $H1,$SS2,$D0lo
vpmadd52huq $H1,$SS2,$D0hi
vpmadd52luq $H1,$RR0,$D1lo
vpmadd52huq $H1,$RR0,$D1hi
vpmadd52luq $H1,$RR1,$D2lo
vpmadd52huq $H1,$RR1,$D2hi
################################################################
# horizontal addition
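# Same folding idea as in the 4x tail, but starting from 512-bit
# vectors: fold within each 128-bit unit, then across the 128-bit halves
# of each 256-bit half, then extract the upper 256 bits and drop back to
# ymm registers before the final k1-masked additions.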
mov \$1,%eax
kmovw %eax,%k1
vpsrldq \$8,$D0lo,$T0
vpsrldq \$8,$D0hi,$H0
vpsrldq \$8,$D1lo,$T1
vpsrldq \$8,$D1hi,$H1
vpaddq $T0,$D0lo,$D0lo
vpaddq $H0,$D0hi,$D0hi
vpsrldq \$8,$D2lo,$T2
vpsrldq \$8,$D2hi,$H2
vpaddq $T1,$D1lo,$D1lo
vpaddq $H1,$D1hi,$D1hi
vpermq \$0x2,$D0lo,$T0
vpermq \$0x2,$D0hi,$H0
vpaddq $T2,$D2lo,$D2lo
vpaddq $H2,$D2hi,$D2hi
vpermq \$0x2,$D1lo,$T1
vpermq \$0x2,$D1hi,$H1
vpaddq $T0,$D0lo,$D0lo
vpaddq $H0,$D0hi,$D0hi
vpermq \$0x2,$D2lo,$T2
vpermq \$0x2,$D2hi,$H2
vpaddq $T1,$D1lo,$D1lo
vpaddq $H1,$D1hi,$D1hi
vextracti64x4 \$1,$D0lo,%y#$T0
vextracti64x4 \$1,$D0hi,%y#$H0
vpaddq $T2,$D2lo,$D2lo
vpaddq $H2,$D2hi,$D2hi
vextracti64x4 \$1,$D1lo,%y#$T1
vextracti64x4 \$1,$D1hi,%y#$H1
vextracti64x4 \$1,$D2lo,%y#$T2
vextracti64x4 \$1,$D2hi,%y#$H2
___
######## switch back to %ymm
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
$code.=<<___;
vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
################################################################
# partial reduction
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$H0
vpaddq $tmp,$D0hi,$D0hi
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$H1
vpaddq $tmp,$D1hi,$D1hi
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$H2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsrlq \$44,$H0,$tmp # additional step
vpandq $mask44,$H0,$H0
vpaddq $tmp,$H1,$H1
################################################################
vmovq %x#$H0,0($ctx)
vmovq %x#$H1,8($ctx)
vmovq %x#$H2,16($ctx)
vzeroall
.Lno_data_vpmadd52_8x:
ret
.cfi_endproc
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
$code.=<<___;
.type poly1305_emit_base2_44,\@function,3
.align 32
poly1305_emit_base2_44:
.cfi_startproc
endbranch
mov 0($ctx),%r8 # load hash value
mov 8($ctx),%r9
mov 16($ctx),%r10
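# convert from base 2^44 (44/44/42-bit limbs in r8/r9/r10) back to two
# 64-bit words: r8 += r9<<44 (with carry), r9 becomes (r9>>20)+(r10<<24)
# plus carry, and r10>>40 plus carry keeps the bits at 2^128 and above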
mov %r9,%rax
shr \$20,%r9
shl \$44,%rax
mov %r10,%rcx
shr \$40,%r10
shl \$24,%rcx
add %rax,%r8
adc %rcx,%r9
adc \$0,%r10
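# final reduction: compute h+5; if the sum reaches 2^130, bit 2 of r10
# is set, the shr below leaves a non-zero value, and cmovnz selects h+5
# (whose low 128 bits equal h mod 2^130-5); otherwise h is kept as is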
mov %r8,%rax
add \$5,%r8 # compare to modulus
mov %r9,%rcx
adc \$0,%r9
adc \$0,%r10
shr \$2,%r10 # did 130-bit value overflow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
add 0($nonce),%rax # accumulate nonce
adc 8($nonce),%rcx
mov %rax,0($mac) # write result
mov %rcx,8($mac)
ret
.cfi_endproc
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
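# An illustrative scalar model (never called) of poly1305_emit_base2_44
# above: reassemble the limbs, reduce modulo 2^130-5 with the "+5, test
# bit 130" trick, and add the 128-bit nonce. The sub name and argument
# order are illustrative only.
sub poly1305_emit_base2_44_ref {
	require Math::BigInt;
	my ($h0,$h1,$h2,$n0,$n1) = map { Math::BigInt->new("$_") } @_;
	my $two128 = Math::BigInt->new(2)->bpow(128);
	my $h = $h0->copy()->badd($h1->copy()->blsft(44))
	                   ->badd($h2->copy()->blsft(88)); # full 130-bit value
	my $t = $h->copy()->badd(5);
	$h = $t if $t->copy()->brsft(130)->bcmp(0) != 0;   # h >= 2^130-5
	$h->bmod($two128);                                 # keep low 128 bits
	$h->badd($n0->copy()->badd($n1->copy()->blsft(64)))->bmod($two128);
	return $h; # 128-bit tag as a BigInt
}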
} } }
$code.=<<___;
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64
.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___;
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
{ # chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
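# Both helpers XOR the input with the key stream that the caller left in
# the $otp buffer, write the result to $out, store the ciphertext back
# into the $otp buffer (so Poly1305 can absorb it), zero-pad the final
# partial block to a 16-byte boundary, and return the advanced $otp
# pointer in %rax.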
$code.=<<___;
.globl xor128_encrypt_n_pad
.type xor128_encrypt_n_pad,\@abi-omnipotent
.align 16
xor128_encrypt_n_pad:
.cfi_startproc
sub $otp,$inp
sub $otp,$out
mov $len,%r10 # put len aside
shr \$4,$len # len / 16
jz .Ltail_enc
nop
.Loop_enc_xmm:
movdqu ($inp,$otp),%xmm0
pxor ($otp),%xmm0
movdqu %xmm0,($out,$otp)
movdqa %xmm0,($otp)
lea 16($otp),$otp
dec $len
jnz .Loop_enc_xmm
and \$15,%r10 # len % 16
jz .Ldone_enc
.Ltail_enc:
mov \$16,$len
sub %r10,$len
xor %eax,%eax
.Loop_enc_byte:
mov ($inp,$otp),%al
xor ($otp),%al
mov %al,($out,$otp)
mov %al,($otp)
lea 1($otp),$otp
dec %r10
jnz .Loop_enc_byte
xor %eax,%eax
.Loop_enc_pad:
mov %al,($otp)
lea 1($otp),$otp
dec $len
jnz .Loop_enc_pad
.Ldone_enc:
mov $otp,%rax
ret
.cfi_endproc
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
.globl xor128_decrypt_n_pad
.type xor128_decrypt_n_pad,\@abi-omnipotent
.align 16
xor128_decrypt_n_pad:
.cfi_startproc
sub $otp,$inp
sub $otp,$out
mov $len,%r10 # put len aside
shr \$4,$len # len / 16
jz .Ltail_dec
nop
.Loop_dec_xmm:
movdqu ($inp,$otp),%xmm0
movdqa ($otp),%xmm1
pxor %xmm0,%xmm1
movdqu %xmm1,($out,$otp)
movdqa %xmm0,($otp)
lea 16($otp),$otp
dec $len
jnz .Loop_dec_xmm
pxor %xmm1,%xmm1
and \$15,%r10 # len % 16
jz .Ldone_dec
.Ltail_dec:
mov \$16,$len
sub %r10,$len
xor %eax,%eax
xor %r11,%r11
.Loop_dec_byte:
mov ($inp,$otp),%r11b
mov ($otp),%al
xor %r11b,%al
mov %al,($out,$otp)
mov %r11b,($otp)
lea 1($otp),$otp
dec %r10
jnz .Loop_dec_byte
xor %eax,%eax
.Loop_dec_pad:
mov %al,($otp)
lea 1($otp),$otp
dec $len
jnz .Loop_dec_pad
.Ldone_dec:
mov $otp,%rax
ret
.cfi_endproc
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
.type avx_handler,\@abi-omnipotent
.align 16
avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 208($context),%rax # pull context->R11
lea 0x50(%rax),%rsi
lea 0xf8(%rax),%rax
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
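# the copy above moves the ten saved registers xmm6-xmm15 (20 quadwords)
# from the function's frame into the CONTEXT record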
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size avx_handler,.-avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_poly1305_init
.rva .LSEH_end_poly1305_init
.rva .LSEH_info_poly1305_init
.rva .LSEH_begin_poly1305_blocks
.rva .LSEH_end_poly1305_blocks
.rva .LSEH_info_poly1305_blocks
.rva .LSEH_begin_poly1305_emit
.rva .LSEH_end_poly1305_emit
.rva .LSEH_info_poly1305_emit
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_poly1305_blocks_avx
.rva .Lbase2_64_avx
.rva .LSEH_info_poly1305_blocks_avx_1
.rva .Lbase2_64_avx
.rva .Leven_avx
.rva .LSEH_info_poly1305_blocks_avx_2
.rva .Leven_avx
.rva .LSEH_end_poly1305_blocks_avx
.rva .LSEH_info_poly1305_blocks_avx_3
.rva .LSEH_begin_poly1305_emit_avx
.rva .LSEH_end_poly1305_emit_avx
.rva .LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_poly1305_blocks_avx2
.rva .Lbase2_64_avx2
.rva .LSEH_info_poly1305_blocks_avx2_1
.rva .Lbase2_64_avx2
.rva .Leven_avx2
.rva .LSEH_info_poly1305_blocks_avx2_2
.rva .Leven_avx2
.rva .LSEH_end_poly1305_blocks_avx2
.rva .LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_poly1305_blocks_avx512
.rva .LSEH_end_poly1305_blocks_avx512
.rva .LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_poly1305_init:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
.LSEH_info_poly1305_blocks:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_body,.Lblocks_epilogue
.LSEH_info_poly1305_emit:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_emit_avx:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
___
}
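# Post-process the generated code: evaluate backticked expressions,
# rewrite the "#d" register shorthand into 32-bit register names, and
# collapse the x#/y#/z# markers into the requested SIMD register width.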
foreach (split('\n',$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/%r([a-z]+)#d/%e$1/g;
s/%r([0-9]+)#d/%r$1d/g;
s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";