poly1305-x86_64.pl
  1. #! /usr/bin/env perl
  2. # Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements Poly1305 hash for x86_64.
  17. #
  18. # March 2015
  19. #
  20. # Initial release.
  21. #
  22. # December 2016
  23. #
  24. # Add AVX512F+VL+BW code path.
  25. #
  26. # November 2017
  27. #
  28. # Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
  29. # executed even on Knights Landing. The trigger for the modification was
  30. # the observation that AVX512 code paths can negatively affect overall
  31. # Skylake-X system performance. Since we are likely to suppress the
  32. # AVX512F capability flag [at least on Skylake-X], the conversion serves
  33. # as a kind of "investment protection". Note that the next *lake processor,
  34. # Cannon Lake, has an AVX512IFMA code path to execute...
  35. #
  36. # Numbers are cycles per processed byte with poly1305_blocks alone,
  37. # measured with rdtsc at fixed clock frequency.
  38. #
  39. #               IALU/gcc-4.8(*)  AVX(**)  AVX2   AVX-512
  40. # P4            4.46/+120%       -
  41. # Core 2        2.41/+90%        -
  42. # Westmere      1.88/+120%       -
  43. # Sandy Bridge  1.39/+140%       1.10
  44. # Haswell       1.14/+175%       1.11     0.65
  45. # Skylake[-X]   1.13/+120%       0.96     0.51   [0.35]
  46. # Silvermont    2.83/+95%        -
  47. # Knights L     3.60/?           1.65     1.10   0.41(***)
  48. # Goldmont      1.70/+180%       -
  49. # VIA Nano      1.82/+150%       -
  50. # Sledgehammer  1.38/+160%       -
  51. # Bulldozer     2.30/+130%       0.97
  52. # Ryzen         1.15/+200%       1.08     1.18
  53. #
  54. # (*)   improvement coefficients relative to clang are more modest, ~50%
  55. #       on most processors; in both cases we are comparing against __int128
  56. #       code;
  57. # (**)  an SSE2 implementation was attempted, but among non-AVX processors
  58. #       it was faster than integer-only code only on older Intel P4 and
  59. #       Core processors, by 30-50% (the newer the processor, the smaller the
  60. #       gain), while being slower on contemporary ones, e.g. almost 2x slower
  61. #       on Atom; as the former are naturally disappearing, SSE2 is deemed unnecessary;
  62. # (***) strangely enough, performance seems to vary from core to core;
  63. #       the listed result is the best case;
  64. # $output is the last argument if it looks like a file (it has an extension)
  65. # $flavour is the first argument if it doesn't look like a file
  66. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  67. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  68. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  69. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  70. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  71. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  72. die "can't locate x86_64-xlate.pl";
  73. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  74. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  75. $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
  76. }
  77. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  78. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
  79. $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
  80. $avx += 2 if ($1==2.11 && $2>=8);
  81. }
  82. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  83. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  84. $avx = ($1>=10) + ($1>=12);
  85. }
  86. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  87. $avx = ($2>=3.0) + ($2>3.0);
  88. }
  89. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  90. or die "can't call $xlate: $!";
  91. *STDOUT=*OUT;
  92. my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
  93. my ($mac,$nonce)=($inp,$len); # *_emit arguments
  94. my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
  95. my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
  96. sub poly1305_iteration {
  97. # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
  98. # output: $h0-$h2 *= $r0-$r1
  99. $code.=<<___;
  100. mulq $h0 # h0*r1
  101. mov %rax,$d2
  102. mov $r0,%rax
  103. mov %rdx,$d3
  104. mulq $h0 # h0*r0
  105. mov %rax,$h0 # future $h0
  106. mov $r0,%rax
  107. mov %rdx,$d1
  108. mulq $h1 # h1*r0
  109. add %rax,$d2
  110. mov $s1,%rax
  111. adc %rdx,$d3
  112. mulq $h1 # h1*s1
  113. mov $h2,$h1 # borrow $h1
  114. add %rax,$h0
  115. adc %rdx,$d1
  116. imulq $s1,$h1 # h2*s1
  117. add $h1,$d2
  118. mov $d1,$h1
  119. adc \$0,$d3
  120. imulq $r0,$h2 # h2*r0
  121. add $d2,$h1
  122. mov \$-4,%rax # mask value
  123. adc $h2,$d3
  124. and $d3,%rax # last reduction step
  125. mov $d3,$h2
  126. shr \$2,$d3
  127. and \$3,$h2
  128. add $d3,%rax
  129. add %rax,$h0
  130. adc \$0,$h1
  131. adc \$0,$h2
  132. ___
  133. }
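# What the macro above emits is one Poly1305 multiply step on 64-bit
# limbs: the 130-bit accumulator h is multiplied by r and partially
# reduced modulo 2^130-5, using the precomputed s1 = r1 + (r1>>2) to
# fold the overflow above 2^130 back in as a multiplication by 5.
# A minimal big-integer cross-check (illustrative Perl, never called by
# this file; poly1305_iteration_ref is a hypothetical helper name):
#
#   use Math::BigInt;
#   sub poly1305_iteration_ref {
#       my ($h, $r) = @_;                            # Math::BigInt values
#       my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
#       return $h->copy->bmul($r)->bmod($p);         # h = h*r mod 2^130-5
#   }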
  134. ########################################################################
  135. # The layout of the opaque area is as follows.
  136. #
  137. # unsigned __int64 h[3]; # current hash value base 2^64
  138. # unsigned __int64 r[2]; # key value base 2^64
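#
# For reference, poly1305_init below clamps the key as the Poly1305
# specification requires: the two 64-bit masks 0x0ffffffc0fffffff and
# 0x0ffffffc0ffffffc clear the top four bits of key bytes 3,7,11,15 and
# the bottom two bits of key bytes 4,8,12. A hedged Perl sketch
# (illustrative only, never called by this file; poly1305_clamp_ref is
# a hypothetical helper name):
#
#   sub poly1305_clamp_ref {
#       my ($r0, $r1) = @_;                          # two little-endian 64-bit key halves
#       return ($r0 & 0x0ffffffc0fffffff,            # same masks as used below
#               $r1 & 0x0ffffffc0ffffffc);
#   }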
  139. $code.=<<___;
  140. .text
  141. .extern OPENSSL_ia32cap_P
  142. .globl poly1305_init
  143. .hidden poly1305_init
  144. .globl poly1305_blocks
  145. .hidden poly1305_blocks
  146. .globl poly1305_emit
  147. .hidden poly1305_emit
  148. .type poly1305_init,\@function,3
  149. .align 32
  150. poly1305_init:
  151. .cfi_startproc
  152. xor %rax,%rax
  153. mov %rax,0($ctx) # initialize hash value
  154. mov %rax,8($ctx)
  155. mov %rax,16($ctx)
  156. cmp \$0,$inp
  157. je .Lno_key
  158. lea poly1305_blocks(%rip),%r10
  159. lea poly1305_emit(%rip),%r11
  160. ___
  161. $code.=<<___ if ($avx);
  162. mov OPENSSL_ia32cap_P+4(%rip),%r9
  163. lea poly1305_blocks_avx(%rip),%rax
  164. lea poly1305_emit_avx(%rip),%rcx
  165. bt \$`60-32`,%r9 # AVX?
  166. cmovc %rax,%r10
  167. cmovc %rcx,%r11
  168. ___
  169. $code.=<<___ if ($avx>1);
  170. lea poly1305_blocks_avx2(%rip),%rax
  171. bt \$`5+32`,%r9 # AVX2?
  172. cmovc %rax,%r10
  173. ___
  174. $code.=<<___ if ($avx>3 && !$win64);
  175. mov \$`(1<<31|1<<21|1<<16)`,%rax
  176. shr \$32,%r9
  177. and %rax,%r9
  178. cmp %rax,%r9
  179. je .Linit_base2_44
  180. ___
  181. $code.=<<___;
  182. mov \$0x0ffffffc0fffffff,%rax
  183. mov \$0x0ffffffc0ffffffc,%rcx
  184. and 0($inp),%rax
  185. and 8($inp),%rcx
  186. mov %rax,24($ctx)
  187. mov %rcx,32($ctx)
  188. ___
  189. $code.=<<___ if ($flavour !~ /elf32/);
  190. mov %r10,0(%rdx)
  191. mov %r11,8(%rdx)
  192. ___
  193. $code.=<<___ if ($flavour =~ /elf32/);
  194. mov %r10d,0(%rdx)
  195. mov %r11d,4(%rdx)
  196. ___
  197. $code.=<<___;
  198. mov \$1,%eax
  199. .Lno_key:
  200. ret
  201. .cfi_endproc
  202. .size poly1305_init,.-poly1305_init
  203. .type poly1305_blocks,\@function,4
  204. .align 32
  205. poly1305_blocks:
  206. .cfi_startproc
  207. endbranch
  208. .Lblocks:
  209. shr \$4,$len
  210. jz .Lno_data # too short
  211. push %rbx
  212. .cfi_push %rbx
  213. push %rbp
  214. .cfi_push %rbp
  215. push %r12
  216. .cfi_push %r12
  217. push %r13
  218. .cfi_push %r13
  219. push %r14
  220. .cfi_push %r14
  221. push %r15
  222. .cfi_push %r15
  223. .Lblocks_body:
  224. mov $len,%r15 # reassign $len
  225. mov 24($ctx),$r0 # load r
  226. mov 32($ctx),$s1
  227. mov 0($ctx),$h0 # load hash value
  228. mov 8($ctx),$h1
  229. mov 16($ctx),$h2
  230. mov $s1,$r1
  231. shr \$2,$s1
  232. mov $r1,%rax
  233. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  234. jmp .Loop
  235. .align 32
  236. .Loop:
  237. add 0($inp),$h0 # accumulate input
  238. adc 8($inp),$h1
  239. lea 16($inp),$inp
  240. adc $padbit,$h2
  241. ___
  242. &poly1305_iteration();
  243. $code.=<<___;
  244. mov $r1,%rax
  245. dec %r15 # len-=16
  246. jnz .Loop
  247. mov $h0,0($ctx) # store hash value
  248. mov $h1,8($ctx)
  249. mov $h2,16($ctx)
  250. mov 0(%rsp),%r15
  251. .cfi_restore %r15
  252. mov 8(%rsp),%r14
  253. .cfi_restore %r14
  254. mov 16(%rsp),%r13
  255. .cfi_restore %r13
  256. mov 24(%rsp),%r12
  257. .cfi_restore %r12
  258. mov 32(%rsp),%rbp
  259. .cfi_restore %rbp
  260. mov 40(%rsp),%rbx
  261. .cfi_restore %rbx
  262. lea 48(%rsp),%rsp
  263. .cfi_adjust_cfa_offset -48
  264. .Lno_data:
  265. .Lblocks_epilogue:
  266. ret
  267. .cfi_endproc
  268. .size poly1305_blocks,.-poly1305_blocks
  269. .type poly1305_emit,\@function,3
  270. .align 32
  271. poly1305_emit:
  272. .cfi_startproc
  273. endbranch
  274. .Lemit:
  275. mov 0($ctx),%r8 # load hash value
  276. mov 8($ctx),%r9
  277. mov 16($ctx),%r10
  278. mov %r8,%rax
  279. add \$5,%r8 # compare to modulus
  280. mov %r9,%rcx
  281. adc \$0,%r9
  282. adc \$0,%r10
  283. shr \$2,%r10 # did 130-bit value overflow?
  284. cmovnz %r8,%rax
  285. cmovnz %r9,%rcx
  286. add 0($nonce),%rax # accumulate nonce
  287. adc 8($nonce),%rcx
  288. mov %rax,0($mac) # write result
  289. mov %rcx,8($mac)
  290. ret
  291. .cfi_endproc
  292. .size poly1305_emit,.-poly1305_emit
  293. ___
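# As a cross-check, the emit routine above computes the Poly1305 tag
# (h mod (2^130-5) + nonce) mod 2^128: it folds in any remaining
# overflow, does the conditional final subtraction of the modulus, then
# adds the 128-bit nonce and keeps the low 128 bits. A hedged Perl
# sketch with Math::BigInt (illustrative only, never called by this
# file; poly1305_emit_ref is a hypothetical helper name):
#
#   use Math::BigInt;
#   sub poly1305_emit_ref {
#       my ($h, $nonce) = @_;                        # Math::BigInt values
#       my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
#       my $mask128 = Math::BigInt->new(2)->bpow(128)->bsub(1);
#       return $h->copy->bmod($p)->badd($nonce)->band($mask128);
#   }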
  294. if ($avx) {
  295. ########################################################################
  296. # The layout of the opaque area is as follows.
  297. #
  298. # unsigned __int32 h[5]; # current hash value base 2^26
  299. # unsigned __int32 is_base2_26;
  300. # unsigned __int64 r[2]; # key value base 2^64
  301. # unsigned __int64 pad;
  302. # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
  303. #
  304. # where r^n are the base 2^26 digits of powers of the multiplier key. There
  305. # are 5 digits, but the last four are interleaved with their multiples of 5,
  306. # totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
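#
# As an illustration of this radix (hedged Perl sketch, never called by
# this file; base2_26_digits_ref is a hypothetical helper name): a
# value below 2^130 splits into five 26-bit digits d[0..4] with
# v = d[0] + d[1]*2^26 + d[2]*2^52 + d[3]*2^78 + d[4]*2^104, and the
# 5*r digits exist so that the wrap-around 2^130 mod (2^130-5) = 5
# becomes a plain multiply-accumulate in the vector loops below.
#
#   use Math::BigInt;
#   sub base2_26_digits_ref {
#       my ($v) = @_;                                # Math::BigInt value
#       my $mask = Math::BigInt->new(0x3ffffff);     # 2^26-1
#       my @d;
#       for (0..4) {
#           push @d, $v->copy->band($mask)->numify();
#           $v = $v->copy->brsft(26);
#       }
#       return @d;
#   }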
  307. my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
  308. map("%xmm$_",(0..15));
  309. $code.=<<___;
  310. .type __poly1305_block,\@abi-omnipotent
  311. .align 32
  312. __poly1305_block:
  313. .cfi_startproc
  314. ___
  315. &poly1305_iteration();
  316. $code.=<<___;
  317. ret
  318. .cfi_endproc
  319. .size __poly1305_block,.-__poly1305_block
  320. .type __poly1305_init_avx,\@abi-omnipotent
  321. .align 32
  322. __poly1305_init_avx:
  323. .cfi_startproc
  324. mov $r0,$h0
  325. mov $r1,$h1
  326. xor $h2,$h2
  327. lea 48+64($ctx),$ctx # size optimization
  328. mov $r1,%rax
  329. call __poly1305_block # r^2
  330. mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
  331. mov \$0x3ffffff,%edx
  332. mov $h0,$d1
  333. and $h0#d,%eax
  334. mov $r0,$d2
  335. and $r0#d,%edx
  336. mov %eax,`16*0+0-64`($ctx)
  337. shr \$26,$d1
  338. mov %edx,`16*0+4-64`($ctx)
  339. shr \$26,$d2
  340. mov \$0x3ffffff,%eax
  341. mov \$0x3ffffff,%edx
  342. and $d1#d,%eax
  343. and $d2#d,%edx
  344. mov %eax,`16*1+0-64`($ctx)
  345. lea (%rax,%rax,4),%eax # *5
  346. mov %edx,`16*1+4-64`($ctx)
  347. lea (%rdx,%rdx,4),%edx # *5
  348. mov %eax,`16*2+0-64`($ctx)
  349. shr \$26,$d1
  350. mov %edx,`16*2+4-64`($ctx)
  351. shr \$26,$d2
  352. mov $h1,%rax
  353. mov $r1,%rdx
  354. shl \$12,%rax
  355. shl \$12,%rdx
  356. or $d1,%rax
  357. or $d2,%rdx
  358. and \$0x3ffffff,%eax
  359. and \$0x3ffffff,%edx
  360. mov %eax,`16*3+0-64`($ctx)
  361. lea (%rax,%rax,4),%eax # *5
  362. mov %edx,`16*3+4-64`($ctx)
  363. lea (%rdx,%rdx,4),%edx # *5
  364. mov %eax,`16*4+0-64`($ctx)
  365. mov $h1,$d1
  366. mov %edx,`16*4+4-64`($ctx)
  367. mov $r1,$d2
  368. mov \$0x3ffffff,%eax
  369. mov \$0x3ffffff,%edx
  370. shr \$14,$d1
  371. shr \$14,$d2
  372. and $d1#d,%eax
  373. and $d2#d,%edx
  374. mov %eax,`16*5+0-64`($ctx)
  375. lea (%rax,%rax,4),%eax # *5
  376. mov %edx,`16*5+4-64`($ctx)
  377. lea (%rdx,%rdx,4),%edx # *5
  378. mov %eax,`16*6+0-64`($ctx)
  379. shr \$26,$d1
  380. mov %edx,`16*6+4-64`($ctx)
  381. shr \$26,$d2
  382. mov $h2,%rax
  383. shl \$24,%rax
  384. or %rax,$d1
  385. mov $d1#d,`16*7+0-64`($ctx)
  386. lea ($d1,$d1,4),$d1 # *5
  387. mov $d2#d,`16*7+4-64`($ctx)
  388. lea ($d2,$d2,4),$d2 # *5
  389. mov $d1#d,`16*8+0-64`($ctx)
  390. mov $d2#d,`16*8+4-64`($ctx)
  391. mov $r1,%rax
  392. call __poly1305_block # r^3
  393. mov \$0x3ffffff,%eax # save r^3 base 2^26
  394. mov $h0,$d1
  395. and $h0#d,%eax
  396. shr \$26,$d1
  397. mov %eax,`16*0+12-64`($ctx)
  398. mov \$0x3ffffff,%edx
  399. and $d1#d,%edx
  400. mov %edx,`16*1+12-64`($ctx)
  401. lea (%rdx,%rdx,4),%edx # *5
  402. shr \$26,$d1
  403. mov %edx,`16*2+12-64`($ctx)
  404. mov $h1,%rax
  405. shl \$12,%rax
  406. or $d1,%rax
  407. and \$0x3ffffff,%eax
  408. mov %eax,`16*3+12-64`($ctx)
  409. lea (%rax,%rax,4),%eax # *5
  410. mov $h1,$d1
  411. mov %eax,`16*4+12-64`($ctx)
  412. mov \$0x3ffffff,%edx
  413. shr \$14,$d1
  414. and $d1#d,%edx
  415. mov %edx,`16*5+12-64`($ctx)
  416. lea (%rdx,%rdx,4),%edx # *5
  417. shr \$26,$d1
  418. mov %edx,`16*6+12-64`($ctx)
  419. mov $h2,%rax
  420. shl \$24,%rax
  421. or %rax,$d1
  422. mov $d1#d,`16*7+12-64`($ctx)
  423. lea ($d1,$d1,4),$d1 # *5
  424. mov $d1#d,`16*8+12-64`($ctx)
  425. mov $r1,%rax
  426. call __poly1305_block # r^4
  427. mov \$0x3ffffff,%eax # save r^4 base 2^26
  428. mov $h0,$d1
  429. and $h0#d,%eax
  430. shr \$26,$d1
  431. mov %eax,`16*0+8-64`($ctx)
  432. mov \$0x3ffffff,%edx
  433. and $d1#d,%edx
  434. mov %edx,`16*1+8-64`($ctx)
  435. lea (%rdx,%rdx,4),%edx # *5
  436. shr \$26,$d1
  437. mov %edx,`16*2+8-64`($ctx)
  438. mov $h1,%rax
  439. shl \$12,%rax
  440. or $d1,%rax
  441. and \$0x3ffffff,%eax
  442. mov %eax,`16*3+8-64`($ctx)
  443. lea (%rax,%rax,4),%eax # *5
  444. mov $h1,$d1
  445. mov %eax,`16*4+8-64`($ctx)
  446. mov \$0x3ffffff,%edx
  447. shr \$14,$d1
  448. and $d1#d,%edx
  449. mov %edx,`16*5+8-64`($ctx)
  450. lea (%rdx,%rdx,4),%edx # *5
  451. shr \$26,$d1
  452. mov %edx,`16*6+8-64`($ctx)
  453. mov $h2,%rax
  454. shl \$24,%rax
  455. or %rax,$d1
  456. mov $d1#d,`16*7+8-64`($ctx)
  457. lea ($d1,$d1,4),$d1 # *5
  458. mov $d1#d,`16*8+8-64`($ctx)
  459. lea -48-64($ctx),$ctx # size [de-]optimization
  460. ret
  461. .cfi_endproc
  462. .size __poly1305_init_avx,.-__poly1305_init_avx
  463. .type poly1305_blocks_avx,\@function,4
  464. .align 32
  465. poly1305_blocks_avx:
  466. .cfi_startproc
  467. endbranch
  468. mov 20($ctx),%r8d # is_base2_26
  469. cmp \$128,$len
  470. jae .Lblocks_avx
  471. test %r8d,%r8d
  472. jz .Lblocks
  473. .Lblocks_avx:
  474. and \$-16,$len
  475. jz .Lno_data_avx
  476. vzeroupper
  477. test %r8d,%r8d
  478. jz .Lbase2_64_avx
  479. test \$31,$len
  480. jz .Leven_avx
  481. push %rbx
  482. .cfi_push %rbx
  483. push %rbp
  484. .cfi_push %rbp
  485. push %r12
  486. .cfi_push %r12
  487. push %r13
  488. .cfi_push %r13
  489. push %r14
  490. .cfi_push %r14
  491. push %r15
  492. .cfi_push %r15
  493. .Lblocks_avx_body:
  494. mov $len,%r15 # reassign $len
  495. mov 0($ctx),$d1 # load hash value
  496. mov 8($ctx),$d2
  497. mov 16($ctx),$h2#d
  498. mov 24($ctx),$r0 # load r
  499. mov 32($ctx),$s1
  500. ################################# base 2^26 -> base 2^64
  501. mov $d1#d,$h0#d
  502. and \$`-1*(1<<31)`,$d1
  503. mov $d2,$r1 # borrow $r1
  504. mov $d2#d,$h1#d
  505. and \$`-1*(1<<31)`,$d2
  506. shr \$6,$d1
  507. shl \$52,$r1
  508. add $d1,$h0
  509. shr \$12,$h1
  510. shr \$18,$d2
  511. add $r1,$h0
  512. adc $d2,$h1
  513. mov $h2,$d1
  514. shl \$40,$d1
  515. shr \$24,$h2
  516. add $d1,$h1
  517. adc \$0,$h2 # can be partially reduced...
  518. mov \$-4,$d2 # ... so reduce
  519. mov $h2,$d1
  520. and $h2,$d2
  521. shr \$2,$d1
  522. and \$3,$h2
  523. add $d2,$d1 # =*5
  524. add $d1,$h0
  525. adc \$0,$h1
  526. adc \$0,$h2
  527. mov $s1,$r1
  528. mov $s1,%rax
  529. shr \$2,$s1
  530. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  531. add 0($inp),$h0 # accumulate input
  532. adc 8($inp),$h1
  533. lea 16($inp),$inp
  534. adc $padbit,$h2
  535. call __poly1305_block
  536. test $padbit,$padbit # if $padbit is zero,
  537. jz .Lstore_base2_64_avx # store hash in base 2^64 format
  538. ################################# base 2^64 -> base 2^26
  539. mov $h0,%rax
  540. mov $h0,%rdx
  541. shr \$52,$h0
  542. mov $h1,$r0
  543. mov $h1,$r1
  544. shr \$26,%rdx
  545. and \$0x3ffffff,%rax # h[0]
  546. shl \$12,$r0
  547. and \$0x3ffffff,%rdx # h[1]
  548. shr \$14,$h1
  549. or $r0,$h0
  550. shl \$24,$h2
  551. and \$0x3ffffff,$h0 # h[2]
  552. shr \$40,$r1
  553. and \$0x3ffffff,$h1 # h[3]
  554. or $r1,$h2 # h[4]
  555. sub \$16,%r15
  556. jz .Lstore_base2_26_avx
  557. vmovd %rax#d,$H0
  558. vmovd %rdx#d,$H1
  559. vmovd $h0#d,$H2
  560. vmovd $h1#d,$H3
  561. vmovd $h2#d,$H4
  562. jmp .Lproceed_avx
  563. .align 32
  564. .Lstore_base2_64_avx:
  565. mov $h0,0($ctx)
  566. mov $h1,8($ctx)
  567. mov $h2,16($ctx) # note that is_base2_26 is zeroed
  568. jmp .Ldone_avx
  569. .align 16
  570. .Lstore_base2_26_avx:
  571. mov %rax#d,0($ctx) # store hash value base 2^26
  572. mov %rdx#d,4($ctx)
  573. mov $h0#d,8($ctx)
  574. mov $h1#d,12($ctx)
  575. mov $h2#d,16($ctx)
  576. .align 16
  577. .Ldone_avx:
  578. mov 0(%rsp),%r15
  579. .cfi_restore %r15
  580. mov 8(%rsp),%r14
  581. .cfi_restore %r14
  582. mov 16(%rsp),%r13
  583. .cfi_restore %r13
  584. mov 24(%rsp),%r12
  585. .cfi_restore %r12
  586. mov 32(%rsp),%rbp
  587. .cfi_restore %rbp
  588. mov 40(%rsp),%rbx
  589. .cfi_restore %rbx
  590. lea 48(%rsp),%rsp
  591. .cfi_adjust_cfa_offset -48
  592. .Lno_data_avx:
  593. .Lblocks_avx_epilogue:
  594. ret
  595. .cfi_endproc
  596. .align 32
  597. .Lbase2_64_avx:
  598. .cfi_startproc
  599. push %rbx
  600. .cfi_push %rbx
  601. push %rbp
  602. .cfi_push %rbp
  603. push %r12
  604. .cfi_push %r12
  605. push %r13
  606. .cfi_push %r13
  607. push %r14
  608. .cfi_push %r14
  609. push %r15
  610. .cfi_push %r15
  611. .Lbase2_64_avx_body:
  612. mov $len,%r15 # reassign $len
  613. mov 24($ctx),$r0 # load r
  614. mov 32($ctx),$s1
  615. mov 0($ctx),$h0 # load hash value
  616. mov 8($ctx),$h1
  617. mov 16($ctx),$h2#d
  618. mov $s1,$r1
  619. mov $s1,%rax
  620. shr \$2,$s1
  621. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  622. test \$31,$len
  623. jz .Linit_avx
  624. add 0($inp),$h0 # accumulate input
  625. adc 8($inp),$h1
  626. lea 16($inp),$inp
  627. adc $padbit,$h2
  628. sub \$16,%r15
  629. call __poly1305_block
  630. .Linit_avx:
  631. ################################# base 2^64 -> base 2^26
  632. mov $h0,%rax
  633. mov $h0,%rdx
  634. shr \$52,$h0
  635. mov $h1,$d1
  636. mov $h1,$d2
  637. shr \$26,%rdx
  638. and \$0x3ffffff,%rax # h[0]
  639. shl \$12,$d1
  640. and \$0x3ffffff,%rdx # h[1]
  641. shr \$14,$h1
  642. or $d1,$h0
  643. shl \$24,$h2
  644. and \$0x3ffffff,$h0 # h[2]
  645. shr \$40,$d2
  646. and \$0x3ffffff,$h1 # h[3]
  647. or $d2,$h2 # h[4]
  648. vmovd %rax#d,$H0
  649. vmovd %rdx#d,$H1
  650. vmovd $h0#d,$H2
  651. vmovd $h1#d,$H3
  652. vmovd $h2#d,$H4
  653. movl \$1,20($ctx) # set is_base2_26
  654. call __poly1305_init_avx
  655. .Lproceed_avx:
  656. mov %r15,$len
  657. mov 0(%rsp),%r15
  658. .cfi_restore %r15
  659. mov 8(%rsp),%r14
  660. .cfi_restore %r14
  661. mov 16(%rsp),%r13
  662. .cfi_restore %r13
  663. mov 24(%rsp),%r12
  664. .cfi_restore %r12
  665. mov 32(%rsp),%rbp
  666. .cfi_restore %rbp
  667. mov 40(%rsp),%rbx
  668. .cfi_restore %rbx
  669. lea 48(%rsp),%rax
  670. lea 48(%rsp),%rsp
  671. .cfi_adjust_cfa_offset -48
  672. .Lbase2_64_avx_epilogue:
  673. jmp .Ldo_avx
  674. .cfi_endproc
  675. .align 32
  676. .Leven_avx:
  677. .cfi_startproc
  678. vmovd 4*0($ctx),$H0 # load hash value
  679. vmovd 4*1($ctx),$H1
  680. vmovd 4*2($ctx),$H2
  681. vmovd 4*3($ctx),$H3
  682. vmovd 4*4($ctx),$H4
  683. .Ldo_avx:
  684. ___
  685. $code.=<<___ if (!$win64);
  686. lea -0x58(%rsp),%r11
  687. .cfi_def_cfa %r11,0x60
  688. sub \$0x178,%rsp
  689. ___
  690. $code.=<<___ if ($win64);
  691. lea -0xf8(%rsp),%r11
  692. sub \$0x218,%rsp
  693. vmovdqa %xmm6,0x50(%r11)
  694. vmovdqa %xmm7,0x60(%r11)
  695. vmovdqa %xmm8,0x70(%r11)
  696. vmovdqa %xmm9,0x80(%r11)
  697. vmovdqa %xmm10,0x90(%r11)
  698. vmovdqa %xmm11,0xa0(%r11)
  699. vmovdqa %xmm12,0xb0(%r11)
  700. vmovdqa %xmm13,0xc0(%r11)
  701. vmovdqa %xmm14,0xd0(%r11)
  702. vmovdqa %xmm15,0xe0(%r11)
  703. .Ldo_avx_body:
  704. ___
  705. $code.=<<___;
  706. sub \$64,$len
  707. lea -32($inp),%rax
  708. cmovc %rax,$inp
  709. vmovdqu `16*3`($ctx),$D4 # preload r0^2
  710. lea `16*3+64`($ctx),$ctx # size optimization
  711. lea .Lconst(%rip),%rcx
  712. ################################################################
  713. # load input
  714. vmovdqu 16*2($inp),$T0
  715. vmovdqu 16*3($inp),$T1
  716. vmovdqa 64(%rcx),$MASK # .Lmask26
  717. vpsrldq \$6,$T0,$T2 # splat input
  718. vpsrldq \$6,$T1,$T3
  719. vpunpckhqdq $T1,$T0,$T4 # 4
  720. vpunpcklqdq $T1,$T0,$T0 # 0:1
  721. vpunpcklqdq $T3,$T2,$T3 # 2:3
  722. vpsrlq \$40,$T4,$T4 # 4
  723. vpsrlq \$26,$T0,$T1
  724. vpand $MASK,$T0,$T0 # 0
  725. vpsrlq \$4,$T3,$T2
  726. vpand $MASK,$T1,$T1 # 1
  727. vpsrlq \$30,$T3,$T3
  728. vpand $MASK,$T2,$T2 # 2
  729. vpand $MASK,$T3,$T3 # 3
  730. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  731. jbe .Lskip_loop_avx
  732. # expand and copy pre-calculated table to stack
  733. vmovdqu `16*1-64`($ctx),$D1
  734. vmovdqu `16*2-64`($ctx),$D2
  735. vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
  736. vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
  737. vmovdqa $D3,-0x90(%r11)
  738. vmovdqa $D0,0x00(%rsp)
  739. vpshufd \$0xEE,$D1,$D4
  740. vmovdqu `16*3-64`($ctx),$D0
  741. vpshufd \$0x44,$D1,$D1
  742. vmovdqa $D4,-0x80(%r11)
  743. vmovdqa $D1,0x10(%rsp)
  744. vpshufd \$0xEE,$D2,$D3
  745. vmovdqu `16*4-64`($ctx),$D1
  746. vpshufd \$0x44,$D2,$D2
  747. vmovdqa $D3,-0x70(%r11)
  748. vmovdqa $D2,0x20(%rsp)
  749. vpshufd \$0xEE,$D0,$D4
  750. vmovdqu `16*5-64`($ctx),$D2
  751. vpshufd \$0x44,$D0,$D0
  752. vmovdqa $D4,-0x60(%r11)
  753. vmovdqa $D0,0x30(%rsp)
  754. vpshufd \$0xEE,$D1,$D3
  755. vmovdqu `16*6-64`($ctx),$D0
  756. vpshufd \$0x44,$D1,$D1
  757. vmovdqa $D3,-0x50(%r11)
  758. vmovdqa $D1,0x40(%rsp)
  759. vpshufd \$0xEE,$D2,$D4
  760. vmovdqu `16*7-64`($ctx),$D1
  761. vpshufd \$0x44,$D2,$D2
  762. vmovdqa $D4,-0x40(%r11)
  763. vmovdqa $D2,0x50(%rsp)
  764. vpshufd \$0xEE,$D0,$D3
  765. vmovdqu `16*8-64`($ctx),$D2
  766. vpshufd \$0x44,$D0,$D0
  767. vmovdqa $D3,-0x30(%r11)
  768. vmovdqa $D0,0x60(%rsp)
  769. vpshufd \$0xEE,$D1,$D4
  770. vpshufd \$0x44,$D1,$D1
  771. vmovdqa $D4,-0x20(%r11)
  772. vmovdqa $D1,0x70(%rsp)
  773. vpshufd \$0xEE,$D2,$D3
  774. vmovdqa 0x00(%rsp),$D4 # preload r0^2
  775. vpshufd \$0x44,$D2,$D2
  776. vmovdqa $D3,-0x10(%r11)
  777. vmovdqa $D2,0x80(%rsp)
  778. jmp .Loop_avx
  779. .align 32
  780. .Loop_avx:
  781. ################################################################
  782. # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
  783. # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
  784. # \___________________/
  785. # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
  786. # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
  787. # \___________________/ \____________________/
  788. #
  789. # Note that we start with inp[2:3]*r^2. This is because it
  790. # doesn't depend on the reduction in the previous iteration.
  791. ################################################################
  792. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  793. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  794. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  795. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  796. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  797. #
  798. # though note that $Tx and $Hx are "reversed" in this section,
  799. # and $D4 is preloaded with r0^2...
  800. vpmuludq $T0,$D4,$D0 # d0 = h0*r0
  801. vpmuludq $T1,$D4,$D1 # d1 = h1*r0
  802. vmovdqa $H2,0x20(%r11) # offload hash
  803. vpmuludq $T2,$D4,$D2 # d2 = h2*r0
  804. vmovdqa 0x10(%rsp),$H2 # r1^2
  805. vpmuludq $T3,$D4,$D3 # d3 = h3*r0
  806. vpmuludq $T4,$D4,$D4 # d4 = h4*r0
  807. vmovdqa $H0,0x00(%r11) #
  808. vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
  809. vmovdqa $H1,0x10(%r11) #
  810. vpmuludq $T3,$H2,$H1 # h3*r1
  811. vpaddq $H0,$D0,$D0 # d0 += h4*s1
  812. vpaddq $H1,$D4,$D4 # d4 += h3*r1
  813. vmovdqa $H3,0x30(%r11) #
  814. vpmuludq $T2,$H2,$H0 # h2*r1
  815. vpmuludq $T1,$H2,$H1 # h1*r1
  816. vpaddq $H0,$D3,$D3 # d3 += h2*r1
  817. vmovdqa 0x30(%rsp),$H3 # r2^2
  818. vpaddq $H1,$D2,$D2 # d2 += h1*r1
  819. vmovdqa $H4,0x40(%r11) #
  820. vpmuludq $T0,$H2,$H2 # h0*r1
  821. vpmuludq $T2,$H3,$H0 # h2*r2
  822. vpaddq $H2,$D1,$D1 # d1 += h0*r1
  823. vmovdqa 0x40(%rsp),$H4 # s2^2
  824. vpaddq $H0,$D4,$D4 # d4 += h2*r2
  825. vpmuludq $T1,$H3,$H1 # h1*r2
  826. vpmuludq $T0,$H3,$H3 # h0*r2
  827. vpaddq $H1,$D3,$D3 # d3 += h1*r2
  828. vmovdqa 0x50(%rsp),$H2 # r3^2
  829. vpaddq $H3,$D2,$D2 # d2 += h0*r2
  830. vpmuludq $T4,$H4,$H0 # h4*s2
  831. vpmuludq $T3,$H4,$H4 # h3*s2
  832. vpaddq $H0,$D1,$D1 # d1 += h4*s2
  833. vmovdqa 0x60(%rsp),$H3 # s3^2
  834. vpaddq $H4,$D0,$D0 # d0 += h3*s2
  835. vmovdqa 0x80(%rsp),$H4 # s4^2
  836. vpmuludq $T1,$H2,$H1 # h1*r3
  837. vpmuludq $T0,$H2,$H2 # h0*r3
  838. vpaddq $H1,$D4,$D4 # d4 += h1*r3
  839. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  840. vpmuludq $T4,$H3,$H0 # h4*s3
  841. vpmuludq $T3,$H3,$H1 # h3*s3
  842. vpaddq $H0,$D2,$D2 # d2 += h4*s3
  843. vmovdqu 16*0($inp),$H0 # load input
  844. vpaddq $H1,$D1,$D1 # d1 += h3*s3
  845. vpmuludq $T2,$H3,$H3 # h2*s3
  846. vpmuludq $T2,$H4,$T2 # h2*s4
  847. vpaddq $H3,$D0,$D0 # d0 += h2*s3
  848. vmovdqu 16*1($inp),$H1 #
  849. vpaddq $T2,$D1,$D1 # d1 += h2*s4
  850. vpmuludq $T3,$H4,$T3 # h3*s4
  851. vpmuludq $T4,$H4,$T4 # h4*s4
  852. vpsrldq \$6,$H0,$H2 # splat input
  853. vpaddq $T3,$D2,$D2 # d2 += h3*s4
  854. vpaddq $T4,$D3,$D3 # d3 += h4*s4
  855. vpsrldq \$6,$H1,$H3 #
  856. vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
  857. vpmuludq $T1,$H4,$T0 # h1*s4
  858. vpunpckhqdq $H1,$H0,$H4 # 4
  859. vpaddq $T4,$D4,$D4 # d4 += h0*r4
  860. vmovdqa -0x90(%r11),$T4 # r0^4
  861. vpaddq $T0,$D0,$D0 # d0 += h1*s4
  862. vpunpcklqdq $H1,$H0,$H0 # 0:1
  863. vpunpcklqdq $H3,$H2,$H3 # 2:3
  864. #vpsrlq \$40,$H4,$H4 # 4
  865. vpsrldq \$`40/8`,$H4,$H4 # 4
  866. vpsrlq \$26,$H0,$H1
  867. vpand $MASK,$H0,$H0 # 0
  868. vpsrlq \$4,$H3,$H2
  869. vpand $MASK,$H1,$H1 # 1
  870. vpand 0(%rcx),$H4,$H4 # .Lmask24
  871. vpsrlq \$30,$H3,$H3
  872. vpand $MASK,$H2,$H2 # 2
  873. vpand $MASK,$H3,$H3 # 3
  874. vpor 32(%rcx),$H4,$H4 # padbit, yes, always
  875. vpaddq 0x00(%r11),$H0,$H0 # add hash value
  876. vpaddq 0x10(%r11),$H1,$H1
  877. vpaddq 0x20(%r11),$H2,$H2
  878. vpaddq 0x30(%r11),$H3,$H3
  879. vpaddq 0x40(%r11),$H4,$H4
  880. lea 16*2($inp),%rax
  881. lea 16*4($inp),$inp
  882. sub \$64,$len
  883. cmovc %rax,$inp
  884. ################################################################
  885. # Now we accumulate (inp[0:1]+hash)*r^4
  886. ################################################################
  887. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  888. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  889. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  890. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  891. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  892. vpmuludq $H0,$T4,$T0 # h0*r0
  893. vpmuludq $H1,$T4,$T1 # h1*r0
  894. vpaddq $T0,$D0,$D0
  895. vpaddq $T1,$D1,$D1
  896. vmovdqa -0x80(%r11),$T2 # r1^4
  897. vpmuludq $H2,$T4,$T0 # h2*r0
  898. vpmuludq $H3,$T4,$T1 # h3*r0
  899. vpaddq $T0,$D2,$D2
  900. vpaddq $T1,$D3,$D3
  901. vpmuludq $H4,$T4,$T4 # h4*r0
  902. vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
  903. vpaddq $T4,$D4,$D4
  904. vpaddq $T0,$D0,$D0 # d0 += h4*s1
  905. vpmuludq $H2,$T2,$T1 # h2*r1
  906. vpmuludq $H3,$T2,$T0 # h3*r1
  907. vpaddq $T1,$D3,$D3 # d3 += h2*r1
  908. vmovdqa -0x60(%r11),$T3 # r2^4
  909. vpaddq $T0,$D4,$D4 # d4 += h3*r1
  910. vpmuludq $H1,$T2,$T1 # h1*r1
  911. vpmuludq $H0,$T2,$T2 # h0*r1
  912. vpaddq $T1,$D2,$D2 # d2 += h1*r1
  913. vpaddq $T2,$D1,$D1 # d1 += h0*r1
  914. vmovdqa -0x50(%r11),$T4 # s2^4
  915. vpmuludq $H2,$T3,$T0 # h2*r2
  916. vpmuludq $H1,$T3,$T1 # h1*r2
  917. vpaddq $T0,$D4,$D4 # d4 += h2*r2
  918. vpaddq $T1,$D3,$D3 # d3 += h1*r2
  919. vmovdqa -0x40(%r11),$T2 # r3^4
  920. vpmuludq $H0,$T3,$T3 # h0*r2
  921. vpmuludq $H4,$T4,$T0 # h4*s2
  922. vpaddq $T3,$D2,$D2 # d2 += h0*r2
  923. vpaddq $T0,$D1,$D1 # d1 += h4*s2
  924. vmovdqa -0x30(%r11),$T3 # s3^4
  925. vpmuludq $H3,$T4,$T4 # h3*s2
  926. vpmuludq $H1,$T2,$T1 # h1*r3
  927. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  928. vmovdqa -0x10(%r11),$T4 # s4^4
  929. vpaddq $T1,$D4,$D4 # d4 += h1*r3
  930. vpmuludq $H0,$T2,$T2 # h0*r3
  931. vpmuludq $H4,$T3,$T0 # h4*s3
  932. vpaddq $T2,$D3,$D3 # d3 += h0*r3
  933. vpaddq $T0,$D2,$D2 # d2 += h4*s3
  934. vmovdqu 16*2($inp),$T0 # load input
  935. vpmuludq $H3,$T3,$T2 # h3*s3
  936. vpmuludq $H2,$T3,$T3 # h2*s3
  937. vpaddq $T2,$D1,$D1 # d1 += h3*s3
  938. vmovdqu 16*3($inp),$T1 #
  939. vpaddq $T3,$D0,$D0 # d0 += h2*s3
  940. vpmuludq $H2,$T4,$H2 # h2*s4
  941. vpmuludq $H3,$T4,$H3 # h3*s4
  942. vpsrldq \$6,$T0,$T2 # splat input
  943. vpaddq $H2,$D1,$D1 # d1 += h2*s4
  944. vpmuludq $H4,$T4,$H4 # h4*s4
  945. vpsrldq \$6,$T1,$T3 #
  946. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
  947. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
  948. vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
  949. vpmuludq $H1,$T4,$H0
  950. vpunpckhqdq $T1,$T0,$T4 # 4
  951. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  952. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  953. vpunpcklqdq $T1,$T0,$T0 # 0:1
  954. vpunpcklqdq $T3,$T2,$T3 # 2:3
  955. #vpsrlq \$40,$T4,$T4 # 4
  956. vpsrldq \$`40/8`,$T4,$T4 # 4
  957. vpsrlq \$26,$T0,$T1
  958. vmovdqa 0x00(%rsp),$D4 # preload r0^2
  959. vpand $MASK,$T0,$T0 # 0
  960. vpsrlq \$4,$T3,$T2
  961. vpand $MASK,$T1,$T1 # 1
  962. vpand 0(%rcx),$T4,$T4 # .Lmask24
  963. vpsrlq \$30,$T3,$T3
  964. vpand $MASK,$T2,$T2 # 2
  965. vpand $MASK,$T3,$T3 # 3
  966. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  967. ################################################################
  968. # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  969. # and P. Schwabe
  970. vpsrlq \$26,$H3,$D3
  971. vpand $MASK,$H3,$H3
  972. vpaddq $D3,$H4,$H4 # h3 -> h4
  973. vpsrlq \$26,$H0,$D0
  974. vpand $MASK,$H0,$H0
  975. vpaddq $D0,$D1,$H1 # h0 -> h1
  976. vpsrlq \$26,$H4,$D0
  977. vpand $MASK,$H4,$H4
  978. vpsrlq \$26,$H1,$D1
  979. vpand $MASK,$H1,$H1
  980. vpaddq $D1,$H2,$H2 # h1 -> h2
  981. vpaddq $D0,$H0,$H0
  982. vpsllq \$2,$D0,$D0
  983. vpaddq $D0,$H0,$H0 # h4 -> h0
  984. vpsrlq \$26,$H2,$D2
  985. vpand $MASK,$H2,$H2
  986. vpaddq $D2,$H3,$H3 # h2 -> h3
  987. vpsrlq \$26,$H0,$D0
  988. vpand $MASK,$H0,$H0
  989. vpaddq $D0,$H1,$H1 # h0 -> h1
  990. vpsrlq \$26,$H3,$D3
  991. vpand $MASK,$H3,$H3
  992. vpaddq $D3,$H4,$H4 # h3 -> h4
  993. ja .Loop_avx
  994. .Lskip_loop_avx:
  995. ################################################################
  996. # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
  997. vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
  998. add \$32,$len
  999. jnz .Long_tail_avx
  1000. vpaddq $H2,$T2,$T2
  1001. vpaddq $H0,$T0,$T0
  1002. vpaddq $H1,$T1,$T1
  1003. vpaddq $H3,$T3,$T3
  1004. vpaddq $H4,$T4,$T4
  1005. .Long_tail_avx:
  1006. vmovdqa $H2,0x20(%r11)
  1007. vmovdqa $H0,0x00(%r11)
  1008. vmovdqa $H1,0x10(%r11)
  1009. vmovdqa $H3,0x30(%r11)
  1010. vmovdqa $H4,0x40(%r11)
  1011. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1012. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1013. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1014. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1015. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1016. vpmuludq $T2,$D4,$D2 # d2 = h2*r0
  1017. vpmuludq $T0,$D4,$D0 # d0 = h0*r0
  1018. vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
  1019. vpmuludq $T1,$D4,$D1 # d1 = h1*r0
  1020. vpmuludq $T3,$D4,$D3 # d3 = h3*r0
  1021. vpmuludq $T4,$D4,$D4 # d4 = h4*r0
  1022. vpmuludq $T3,$H2,$H0 # h3*r1
  1023. vpaddq $H0,$D4,$D4 # d4 += h3*r1
  1024. vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
  1025. vpmuludq $T2,$H2,$H1 # h2*r1
  1026. vpaddq $H1,$D3,$D3 # d3 += h2*r1
  1027. vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
  1028. vpmuludq $T1,$H2,$H0 # h1*r1
  1029. vpaddq $H0,$D2,$D2 # d2 += h1*r1
  1030. vpmuludq $T0,$H2,$H2 # h0*r1
  1031. vpaddq $H2,$D1,$D1 # d1 += h0*r1
  1032. vpmuludq $T4,$H3,$H3 # h4*s1
  1033. vpaddq $H3,$D0,$D0 # d0 += h4*s1
  1034. vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
  1035. vpmuludq $T2,$H4,$H1 # h2*r2
  1036. vpaddq $H1,$D4,$D4 # d4 += h2*r2
  1037. vpmuludq $T1,$H4,$H0 # h1*r2
  1038. vpaddq $H0,$D3,$D3 # d3 += h1*r2
  1039. vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
  1040. vpmuludq $T0,$H4,$H4 # h0*r2
  1041. vpaddq $H4,$D2,$D2 # d2 += h0*r2
  1042. vpmuludq $T4,$H2,$H1 # h4*s2
  1043. vpaddq $H1,$D1,$D1 # d1 += h4*s2
  1044. vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
  1045. vpmuludq $T3,$H2,$H2 # h3*s2
  1046. vpaddq $H2,$D0,$D0 # d0 += h3*s2
  1047. vpmuludq $T1,$H3,$H0 # h1*r3
  1048. vpaddq $H0,$D4,$D4 # d4 += h1*r3
  1049. vpmuludq $T0,$H3,$H3 # h0*r3
  1050. vpaddq $H3,$D3,$D3 # d3 += h0*r3
  1051. vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
  1052. vpmuludq $T4,$H4,$H1 # h4*s3
  1053. vpaddq $H1,$D2,$D2 # d2 += h4*s3
  1054. vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
  1055. vpmuludq $T3,$H4,$H0 # h3*s3
  1056. vpaddq $H0,$D1,$D1 # d1 += h3*s3
  1057. vpmuludq $T2,$H4,$H4 # h2*s3
  1058. vpaddq $H4,$D0,$D0 # d0 += h2*s3
  1059. vpmuludq $T0,$H2,$H2 # h0*r4
  1060. vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
  1061. vpmuludq $T4,$H3,$H1 # h4*s4
  1062. vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
  1063. vpmuludq $T3,$H3,$H0 # h3*s4
  1064. vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
  1065. vpmuludq $T2,$H3,$H1 # h2*s4
  1066. vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
  1067. vpmuludq $T1,$H3,$H3 # h1*s4
  1068. vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
  1069. jz .Lshort_tail_avx
  1070. vmovdqu 16*0($inp),$H0 # load input
  1071. vmovdqu 16*1($inp),$H1
  1072. vpsrldq \$6,$H0,$H2 # splat input
  1073. vpsrldq \$6,$H1,$H3
  1074. vpunpckhqdq $H1,$H0,$H4 # 4
  1075. vpunpcklqdq $H1,$H0,$H0 # 0:1
  1076. vpunpcklqdq $H3,$H2,$H3 # 2:3
  1077. vpsrlq \$40,$H4,$H4 # 4
  1078. vpsrlq \$26,$H0,$H1
  1079. vpand $MASK,$H0,$H0 # 0
  1080. vpsrlq \$4,$H3,$H2
  1081. vpand $MASK,$H1,$H1 # 1
  1082. vpsrlq \$30,$H3,$H3
  1083. vpand $MASK,$H2,$H2 # 2
  1084. vpand $MASK,$H3,$H3 # 3
  1085. vpor 32(%rcx),$H4,$H4 # padbit, yes, always
  1086. vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
  1087. vpaddq 0x00(%r11),$H0,$H0
  1088. vpaddq 0x10(%r11),$H1,$H1
  1089. vpaddq 0x20(%r11),$H2,$H2
  1090. vpaddq 0x30(%r11),$H3,$H3
  1091. vpaddq 0x40(%r11),$H4,$H4
  1092. ################################################################
  1093. # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
  1094. vpmuludq $H0,$T4,$T0 # h0*r0
  1095. vpaddq $T0,$D0,$D0 # d0 += h0*r0
  1096. vpmuludq $H1,$T4,$T1 # h1*r0
  1097. vpaddq $T1,$D1,$D1 # d1 += h1*r0
  1098. vpmuludq $H2,$T4,$T0 # h2*r0
  1099. vpaddq $T0,$D2,$D2 # d2 += h2*r0
  1100. vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
  1101. vpmuludq $H3,$T4,$T1 # h3*r0
  1102. vpaddq $T1,$D3,$D3 # d3 += h3*r0
  1103. vpmuludq $H4,$T4,$T4 # h4*r0
  1104. vpaddq $T4,$D4,$D4 # d4 += h4*r0
  1105. vpmuludq $H3,$T2,$T0 # h3*r1
  1106. vpaddq $T0,$D4,$D4 # d4 += h3*r1
  1107. vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
  1108. vpmuludq $H2,$T2,$T1 # h2*r1
  1109. vpaddq $T1,$D3,$D3 # d3 += h2*r1
  1110. vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
  1111. vpmuludq $H1,$T2,$T0 # h1*r1
  1112. vpaddq $T0,$D2,$D2 # d2 += h1*r1
  1113. vpmuludq $H0,$T2,$T2 # h0*r1
  1114. vpaddq $T2,$D1,$D1 # d1 += h0*r1
  1115. vpmuludq $H4,$T3,$T3 # h4*s1
  1116. vpaddq $T3,$D0,$D0 # d0 += h4*s1
  1117. vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
  1118. vpmuludq $H2,$T4,$T1 # h2*r2
  1119. vpaddq $T1,$D4,$D4 # d4 += h2*r2
  1120. vpmuludq $H1,$T4,$T0 # h1*r2
  1121. vpaddq $T0,$D3,$D3 # d3 += h1*r2
  1122. vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
  1123. vpmuludq $H0,$T4,$T4 # h0*r2
  1124. vpaddq $T4,$D2,$D2 # d2 += h0*r2
  1125. vpmuludq $H4,$T2,$T1 # h4*s2
  1126. vpaddq $T1,$D1,$D1 # d1 += h4*s2
  1127. vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
  1128. vpmuludq $H3,$T2,$T2 # h3*s2
  1129. vpaddq $T2,$D0,$D0 # d0 += h3*s2
  1130. vpmuludq $H1,$T3,$T0 # h1*r3
  1131. vpaddq $T0,$D4,$D4 # d4 += h1*r3
  1132. vpmuludq $H0,$T3,$T3 # h0*r3
  1133. vpaddq $T3,$D3,$D3 # d3 += h0*r3
  1134. vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
  1135. vpmuludq $H4,$T4,$T1 # h4*s3
  1136. vpaddq $T1,$D2,$D2 # d2 += h4*s3
  1137. vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
  1138. vpmuludq $H3,$T4,$T0 # h3*s3
  1139. vpaddq $T0,$D1,$D1 # d1 += h3*s3
  1140. vpmuludq $H2,$T4,$T4 # h2*s3
  1141. vpaddq $T4,$D0,$D0 # d0 += h2*s3
  1142. vpmuludq $H0,$T2,$T2 # h0*r4
  1143. vpaddq $T2,$D4,$D4 # d4 += h0*r4
  1144. vpmuludq $H4,$T3,$T1 # h4*s4
  1145. vpaddq $T1,$D3,$D3 # d3 += h4*s4
  1146. vpmuludq $H3,$T3,$T0 # h3*s4
  1147. vpaddq $T0,$D2,$D2 # d2 += h3*s4
  1148. vpmuludq $H2,$T3,$T1 # h2*s4
  1149. vpaddq $T1,$D1,$D1 # d1 += h2*s4
  1150. vpmuludq $H1,$T3,$T3 # h1*s4
  1151. vpaddq $T3,$D0,$D0 # d0 += h1*s4
  1152. .Lshort_tail_avx:
  1153. ################################################################
  1154. # horizontal addition
  1155. vpsrldq \$8,$D4,$T4
  1156. vpsrldq \$8,$D3,$T3
  1157. vpsrldq \$8,$D1,$T1
  1158. vpsrldq \$8,$D0,$T0
  1159. vpsrldq \$8,$D2,$T2
  1160. vpaddq $T3,$D3,$D3
  1161. vpaddq $T4,$D4,$D4
  1162. vpaddq $T0,$D0,$D0
  1163. vpaddq $T1,$D1,$D1
  1164. vpaddq $T2,$D2,$D2
  1165. ################################################################
  1166. # lazy reduction
  1167. vpsrlq \$26,$D3,$H3
  1168. vpand $MASK,$D3,$D3
  1169. vpaddq $H3,$D4,$D4 # h3 -> h4
  1170. vpsrlq \$26,$D0,$H0
  1171. vpand $MASK,$D0,$D0
  1172. vpaddq $H0,$D1,$D1 # h0 -> h1
  1173. vpsrlq \$26,$D4,$H4
  1174. vpand $MASK,$D4,$D4
  1175. vpsrlq \$26,$D1,$H1
  1176. vpand $MASK,$D1,$D1
  1177. vpaddq $H1,$D2,$D2 # h1 -> h2
  1178. vpaddq $H4,$D0,$D0
  1179. vpsllq \$2,$H4,$H4
  1180. vpaddq $H4,$D0,$D0 # h4 -> h0
  1181. vpsrlq \$26,$D2,$H2
  1182. vpand $MASK,$D2,$D2
  1183. vpaddq $H2,$D3,$D3 # h2 -> h3
  1184. vpsrlq \$26,$D0,$H0
  1185. vpand $MASK,$D0,$D0
  1186. vpaddq $H0,$D1,$D1 # h0 -> h1
  1187. vpsrlq \$26,$D3,$H3
  1188. vpand $MASK,$D3,$D3
  1189. vpaddq $H3,$D4,$D4 # h3 -> h4
  1190. vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
  1191. vmovd $D1,`4*1-48-64`($ctx)
  1192. vmovd $D2,`4*2-48-64`($ctx)
  1193. vmovd $D3,`4*3-48-64`($ctx)
  1194. vmovd $D4,`4*4-48-64`($ctx)
  1195. ___
  1196. $code.=<<___ if ($win64);
  1197. vmovdqa 0x50(%r11),%xmm6
  1198. vmovdqa 0x60(%r11),%xmm7
  1199. vmovdqa 0x70(%r11),%xmm8
  1200. vmovdqa 0x80(%r11),%xmm9
  1201. vmovdqa 0x90(%r11),%xmm10
  1202. vmovdqa 0xa0(%r11),%xmm11
  1203. vmovdqa 0xb0(%r11),%xmm12
  1204. vmovdqa 0xc0(%r11),%xmm13
  1205. vmovdqa 0xd0(%r11),%xmm14
  1206. vmovdqa 0xe0(%r11),%xmm15
  1207. lea 0xf8(%r11),%rsp
  1208. .Ldo_avx_epilogue:
  1209. ___
  1210. $code.=<<___ if (!$win64);
  1211. lea 0x58(%r11),%rsp
  1212. .cfi_def_cfa %rsp,8
  1213. ___
  1214. $code.=<<___;
  1215. vzeroupper
  1216. ret
  1217. .cfi_endproc
  1218. .size poly1305_blocks_avx,.-poly1305_blocks_avx
  1219. .type poly1305_emit_avx,\@function,3
  1220. .align 32
  1221. poly1305_emit_avx:
  1222. .cfi_startproc
  1223. endbranch
  1224. cmpl \$0,20($ctx) # is_base2_26?
  1225. je .Lemit
  1226. mov 0($ctx),%eax # load hash value base 2^26
  1227. mov 4($ctx),%ecx
  1228. mov 8($ctx),%r8d
  1229. mov 12($ctx),%r11d
  1230. mov 16($ctx),%r10d
  1231. shl \$26,%rcx # base 2^26 -> base 2^64
  1232. mov %r8,%r9
  1233. shl \$52,%r8
  1234. add %rcx,%rax
  1235. shr \$12,%r9
  1236. add %rax,%r8 # h0
  1237. adc \$0,%r9
  1238. shl \$14,%r11
  1239. mov %r10,%rax
  1240. shr \$24,%r10
  1241. add %r11,%r9
  1242. shl \$40,%rax
  1243. add %rax,%r9 # h1
  1244. adc \$0,%r10 # h2
  1245. mov %r10,%rax # could be partially reduced, so reduce
  1246. mov %r10,%rcx
  1247. and \$3,%r10
  1248. shr \$2,%rax
  1249. and \$-4,%rcx
  1250. add %rcx,%rax
  1251. add %rax,%r8
  1252. adc \$0,%r9
  1253. adc \$0,%r10
  1254. mov %r8,%rax
  1255. add \$5,%r8 # compare to modulus
  1256. mov %r9,%rcx
  1257. adc \$0,%r9
  1258. adc \$0,%r10
  1259. shr \$2,%r10 # did 130-bit value overflow?
  1260. cmovnz %r8,%rax
  1261. cmovnz %r9,%rcx
  1262. add 0($nonce),%rax # accumulate nonce
  1263. adc 8($nonce),%rcx
  1264. mov %rax,0($mac) # write result
  1265. mov %rcx,8($mac)
  1266. ret
  1267. .cfi_endproc
  1268. .size poly1305_emit_avx,.-poly1305_emit_avx
  1269. ___
  1270. if ($avx>1) {
  1271. my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
  1272. map("%ymm$_",(0..15));
  1273. my $S4=$MASK;
  1274. $code.=<<___;
  1275. .type poly1305_blocks_avx2,\@function,4
  1276. .align 32
  1277. poly1305_blocks_avx2:
  1278. .cfi_startproc
  1279. endbranch
  1280. mov 20($ctx),%r8d # is_base2_26
  1281. cmp \$128,$len
  1282. jae .Lblocks_avx2
  1283. test %r8d,%r8d
  1284. jz .Lblocks
  1285. .Lblocks_avx2:
  1286. and \$-16,$len
  1287. jz .Lno_data_avx2
  1288. vzeroupper
  1289. test %r8d,%r8d
  1290. jz .Lbase2_64_avx2
  1291. test \$63,$len
  1292. jz .Leven_avx2
  1293. push %rbx
  1294. .cfi_push %rbx
  1295. push %rbp
  1296. .cfi_push %rbp
  1297. push %r12
  1298. .cfi_push %r12
  1299. push %r13
  1300. .cfi_push %r13
  1301. push %r14
  1302. .cfi_push %r14
  1303. push %r15
  1304. .cfi_push %r15
  1305. .Lblocks_avx2_body:
  1306. mov $len,%r15 # reassign $len
  1307. mov 0($ctx),$d1 # load hash value
  1308. mov 8($ctx),$d2
  1309. mov 16($ctx),$h2#d
  1310. mov 24($ctx),$r0 # load r
  1311. mov 32($ctx),$s1
  1312. ################################# base 2^26 -> base 2^64
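# Roughly, with h[0..4] denoting the 26-bit limbs stored as five
# 32-bit words:
#	h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104
# so limb i has to land at bit 26*i of the 64-bit words below; the
# shift counts are just 26*i reduced modulo 64 (e.g. h[3] goes in at
# bit 78 = 64+14, h[4] at bit 104 = 64+40 with its top bits spilling
# into the third word).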
  1313. mov $d1#d,$h0#d
  1314. and \$`-1*(1<<31)`,$d1
  1315. mov $d2,$r1 # borrow $r1
  1316. mov $d2#d,$h1#d
  1317. and \$`-1*(1<<31)`,$d2
  1318. shr \$6,$d1
  1319. shl \$52,$r1
  1320. add $d1,$h0
  1321. shr \$12,$h1
  1322. shr \$18,$d2
  1323. add $r1,$h0
  1324. adc $d2,$h1
  1325. mov $h2,$d1
  1326. shl \$40,$d1
  1327. shr \$24,$h2
  1328. add $d1,$h1
  1329. adc \$0,$h2 # can be partially reduced...
  1330. mov \$-4,$d2 # ... so reduce
  1331. mov $h2,$d1
  1332. and $h2,$d2
  1333. shr \$2,$d1
  1334. and \$3,$h2
  1335. add $d2,$d1 # =*5
  1336. add $d1,$h0
  1337. adc \$0,$h1
  1338. adc \$0,$h2
  1339. mov $s1,$r1
  1340. mov $s1,%rax
  1341. shr \$2,$s1
  1342. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  1343. .Lbase2_26_pre_avx2:
  1344. add 0($inp),$h0 # accumulate input
  1345. adc 8($inp),$h1
  1346. lea 16($inp),$inp
  1347. adc $padbit,$h2
  1348. sub \$16,%r15
  1349. call __poly1305_block
  1350. mov $r1,%rax
  1351. test \$63,%r15
  1352. jnz .Lbase2_26_pre_avx2
  1353. test $padbit,$padbit # if $padbit is zero,
  1354. jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
  1355. ################################# base 2^64 -> base 2^26
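# Roughly the inverse split of the value held in h0:h1:h2:
#	h[0] =  h0                   mod 2^26
#	h[1] = (h0 >> 26)            mod 2^26
#	h[2] = (h0 >> 52 | h1 << 12) mod 2^26
#	h[3] = (h1 >> 14)            mod 2^26
#	h[4] =  h1 >> 40 | h2 << 24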
  1356. mov $h0,%rax
  1357. mov $h0,%rdx
  1358. shr \$52,$h0
  1359. mov $h1,$r0
  1360. mov $h1,$r1
  1361. shr \$26,%rdx
  1362. and \$0x3ffffff,%rax # h[0]
  1363. shl \$12,$r0
  1364. and \$0x3ffffff,%rdx # h[1]
  1365. shr \$14,$h1
  1366. or $r0,$h0
  1367. shl \$24,$h2
  1368. and \$0x3ffffff,$h0 # h[2]
  1369. shr \$40,$r1
  1370. and \$0x3ffffff,$h1 # h[3]
  1371. or $r1,$h2 # h[4]
  1372. test %r15,%r15
  1373. jz .Lstore_base2_26_avx2
  1374. vmovd %rax#d,%x#$H0
  1375. vmovd %rdx#d,%x#$H1
  1376. vmovd $h0#d,%x#$H2
  1377. vmovd $h1#d,%x#$H3
  1378. vmovd $h2#d,%x#$H4
  1379. jmp .Lproceed_avx2
  1380. .align 32
  1381. .Lstore_base2_64_avx2:
  1382. mov $h0,0($ctx)
  1383. mov $h1,8($ctx)
  1384. mov $h2,16($ctx) # note that is_base2_26 is zeroed
  1385. jmp .Ldone_avx2
  1386. .align 16
  1387. .Lstore_base2_26_avx2:
  1388. mov %rax#d,0($ctx) # store hash value base 2^26
  1389. mov %rdx#d,4($ctx)
  1390. mov $h0#d,8($ctx)
  1391. mov $h1#d,12($ctx)
  1392. mov $h2#d,16($ctx)
  1393. .align 16
  1394. .Ldone_avx2:
  1395. mov 0(%rsp),%r15
  1396. .cfi_restore %r15
  1397. mov 8(%rsp),%r14
  1398. .cfi_restore %r14
  1399. mov 16(%rsp),%r13
  1400. .cfi_restore %r13
  1401. mov 24(%rsp),%r12
  1402. .cfi_restore %r12
  1403. mov 32(%rsp),%rbp
  1404. .cfi_restore %rbp
  1405. mov 40(%rsp),%rbx
  1406. .cfi_restore %rbx
  1407. lea 48(%rsp),%rsp
  1408. .cfi_adjust_cfa_offset -48
  1409. .Lno_data_avx2:
  1410. .Lblocks_avx2_epilogue:
  1411. ret
  1412. .cfi_endproc
  1413. .align 32
  1414. .Lbase2_64_avx2:
  1415. .cfi_startproc
  1416. push %rbx
  1417. .cfi_push %rbx
  1418. push %rbp
  1419. .cfi_push %rbp
  1420. push %r12
  1421. .cfi_push %r12
  1422. push %r13
  1423. .cfi_push %r13
  1424. push %r14
  1425. .cfi_push %r14
  1426. push %r15
  1427. .cfi_push %r15
  1428. .Lbase2_64_avx2_body:
  1429. mov $len,%r15 # reassign $len
  1430. mov 24($ctx),$r0 # load r
  1431. mov 32($ctx),$s1
  1432. mov 0($ctx),$h0 # load hash value
  1433. mov 8($ctx),$h1
  1434. mov 16($ctx),$h2#d
  1435. mov $s1,$r1
  1436. mov $s1,%rax
  1437. shr \$2,$s1
  1438. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  1439. test \$63,$len
  1440. jz .Linit_avx2
  1441. .Lbase2_64_pre_avx2:
  1442. add 0($inp),$h0 # accumulate input
  1443. adc 8($inp),$h1
  1444. lea 16($inp),$inp
  1445. adc $padbit,$h2
  1446. sub \$16,%r15
  1447. call __poly1305_block
  1448. mov $r1,%rax
  1449. test \$63,%r15
  1450. jnz .Lbase2_64_pre_avx2
  1451. .Linit_avx2:
  1452. ################################# base 2^64 -> base 2^26
  1453. mov $h0,%rax
  1454. mov $h0,%rdx
  1455. shr \$52,$h0
  1456. mov $h1,$d1
  1457. mov $h1,$d2
  1458. shr \$26,%rdx
  1459. and \$0x3ffffff,%rax # h[0]
  1460. shl \$12,$d1
  1461. and \$0x3ffffff,%rdx # h[1]
  1462. shr \$14,$h1
  1463. or $d1,$h0
  1464. shl \$24,$h2
  1465. and \$0x3ffffff,$h0 # h[2]
  1466. shr \$40,$d2
  1467. and \$0x3ffffff,$h1 # h[3]
  1468. or $d2,$h2 # h[4]
  1469. vmovd %rax#d,%x#$H0
  1470. vmovd %rdx#d,%x#$H1
  1471. vmovd $h0#d,%x#$H2
  1472. vmovd $h1#d,%x#$H3
  1473. vmovd $h2#d,%x#$H4
  1474. movl \$1,20($ctx) # set is_base2_26
  1475. call __poly1305_init_avx
  1476. .Lproceed_avx2:
  1477. mov %r15,$len # restore $len
  1478. mov OPENSSL_ia32cap_P+8(%rip),%r10d
  1479. mov \$`(1<<31|1<<30|1<<16)`,%r11d
  1480. mov 0(%rsp),%r15
  1481. .cfi_restore %r15
  1482. mov 8(%rsp),%r14
  1483. .cfi_restore %r14
  1484. mov 16(%rsp),%r13
  1485. .cfi_restore %r13
  1486. mov 24(%rsp),%r12
  1487. .cfi_restore %r12
  1488. mov 32(%rsp),%rbp
  1489. .cfi_restore %rbp
  1490. mov 40(%rsp),%rbx
  1491. .cfi_restore %rbx
  1492. lea 48(%rsp),%rax
  1493. lea 48(%rsp),%rsp
  1494. .cfi_adjust_cfa_offset -48
  1495. .Lbase2_64_avx2_epilogue:
  1496. jmp .Ldo_avx2
  1497. .cfi_endproc
  1498. .align 32
  1499. .Leven_avx2:
  1500. .cfi_startproc
  1501. mov OPENSSL_ia32cap_P+8(%rip),%r10d
  1502. vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
  1503. vmovd 4*1($ctx),%x#$H1
  1504. vmovd 4*2($ctx),%x#$H2
  1505. vmovd 4*3($ctx),%x#$H3
  1506. vmovd 4*4($ctx),%x#$H4
  1507. .Ldo_avx2:
  1508. ___
  1509. $code.=<<___ if ($avx>2);
  1510. cmp \$512,$len
  1511. jb .Lskip_avx512
  1512. and %r11d,%r10d
  1513. test \$`1<<16`,%r10d # check for AVX512F
  1514. jnz .Lblocks_avx512
  1515. .Lskip_avx512:
  1516. ___
  1517. $code.=<<___ if (!$win64);
  1518. lea -8(%rsp),%r11
  1519. .cfi_def_cfa %r11,16
  1520. sub \$0x128,%rsp
  1521. ___
  1522. $code.=<<___ if ($win64);
  1523. lea -0xf8(%rsp),%r11
  1524. sub \$0x1c8,%rsp
  1525. vmovdqa %xmm6,0x50(%r11)
  1526. vmovdqa %xmm7,0x60(%r11)
  1527. vmovdqa %xmm8,0x70(%r11)
  1528. vmovdqa %xmm9,0x80(%r11)
  1529. vmovdqa %xmm10,0x90(%r11)
  1530. vmovdqa %xmm11,0xa0(%r11)
  1531. vmovdqa %xmm12,0xb0(%r11)
  1532. vmovdqa %xmm13,0xc0(%r11)
  1533. vmovdqa %xmm14,0xd0(%r11)
  1534. vmovdqa %xmm15,0xe0(%r11)
  1535. .Ldo_avx2_body:
  1536. ___
  1537. $code.=<<___;
  1538. lea .Lconst(%rip),%rcx
  1539. lea 48+64($ctx),$ctx # size optimization
  1540. vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
  1541. # expand and copy pre-calculated table to stack
  1542. vmovdqu `16*0-64`($ctx),%x#$T2
  1543. and \$-512,%rsp
  1544. vmovdqu `16*1-64`($ctx),%x#$T3
  1545. vmovdqu `16*2-64`($ctx),%x#$T4
  1546. vmovdqu `16*3-64`($ctx),%x#$D0
  1547. vmovdqu `16*4-64`($ctx),%x#$D1
  1548. vmovdqu `16*5-64`($ctx),%x#$D2
  1549. lea 0x90(%rsp),%rax # size optimization
  1550. vmovdqu `16*6-64`($ctx),%x#$D3
  1551. vpermd $T2,$T0,$T2 # 00003412 -> 14243444
  1552. vmovdqu `16*7-64`($ctx),%x#$D4
  1553. vpermd $T3,$T0,$T3
  1554. vmovdqu `16*8-64`($ctx),%x#$MASK
  1555. vpermd $T4,$T0,$T4
  1556. vmovdqa $T2,0x00(%rsp)
  1557. vpermd $D0,$T0,$D0
  1558. vmovdqa $T3,0x20-0x90(%rax)
  1559. vpermd $D1,$T0,$D1
  1560. vmovdqa $T4,0x40-0x90(%rax)
  1561. vpermd $D2,$T0,$D2
  1562. vmovdqa $D0,0x60-0x90(%rax)
  1563. vpermd $D3,$T0,$D3
  1564. vmovdqa $D1,0x80-0x90(%rax)
  1565. vpermd $D4,$T0,$D4
  1566. vmovdqa $D2,0xa0-0x90(%rax)
  1567. vpermd $MASK,$T0,$MASK
  1568. vmovdqa $D3,0xc0-0x90(%rax)
  1569. vmovdqa $D4,0xe0-0x90(%rax)
  1570. vmovdqa $MASK,0x100-0x90(%rax)
  1571. vmovdqa 64(%rcx),$MASK # .Lmask26
  1572. ################################################################
  1573. # load input
  1574. vmovdqu 16*0($inp),%x#$T0
  1575. vmovdqu 16*1($inp),%x#$T1
  1576. vinserti128 \$1,16*2($inp),$T0,$T0
  1577. vinserti128 \$1,16*3($inp),$T1,$T1
  1578. lea 16*4($inp),$inp
  1579. vpsrldq \$6,$T0,$T2 # splat input
  1580. vpsrldq \$6,$T1,$T3
  1581. vpunpckhqdq $T1,$T0,$T4 # 4
  1582. vpunpcklqdq $T3,$T2,$T2 # 2:3
  1583. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1584. vpsrlq \$30,$T2,$T3
  1585. vpsrlq \$4,$T2,$T2
  1586. vpsrlq \$26,$T0,$T1
  1587. vpsrlq \$40,$T4,$T4 # 4
  1588. vpand $MASK,$T2,$T2 # 2
  1589. vpand $MASK,$T0,$T0 # 0
  1590. vpand $MASK,$T1,$T1 # 1
  1591. vpand $MASK,$T3,$T3 # 3
  1592. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1593. vpaddq $H2,$T2,$H2 # accumulate input
  1594. sub \$64,$len
  1595. jz .Ltail_avx2
  1596. jmp .Loop_avx2
  1597. .align 32
  1598. .Loop_avx2:
  1599. ################################################################
  1600. # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
  1601. # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
  1602. # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
  1603. # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
  1604. # \________/\__________/
  1605. ################################################################
  1606. #vpaddq $H2,$T2,$H2 # accumulate input
  1607. vpaddq $H0,$T0,$H0
  1608. vmovdqa `32*0`(%rsp),$T0 # r0^4
  1609. vpaddq $H1,$T1,$H1
  1610. vmovdqa `32*1`(%rsp),$T1 # r1^4
  1611. vpaddq $H3,$T3,$H3
  1612. vmovdqa `32*3`(%rsp),$T2 # r2^4
  1613. vpaddq $H4,$T4,$H4
  1614. vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
  1615. vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
  1616. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1617. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1618. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1619. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1620. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1621. #
1622. # however, as h2 is "chronologically" the first one available, pull the
1623. # corresponding operations up, so it's
  1624. #
  1625. # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
  1626. # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
  1627. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1628. # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
  1629. # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
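# (Here sN is shorthand for the precomputed 5*rN: whenever a partial
# product crosses 2^130 it is folded back using 2^130 == 5 modulo
# 2^130-5, so e.g. the h4*r1 contribution reappears in d0 as h4*s1.)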
  1630. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1631. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1632. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1633. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1634. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1635. vpmuludq $H0,$T1,$T4 # h0*r1
  1636. vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
  1637. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1638. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1639. vpmuludq $H3,$T1,$T4 # h3*r1
  1640. vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
  1641. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1642. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1643. vmovdqa `32*4-0x90`(%rax),$T1 # s2
  1644. vpmuludq $H0,$T0,$T4 # h0*r0
  1645. vpmuludq $H1,$T0,$H2 # h1*r0
  1646. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1647. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1648. vpmuludq $H3,$T0,$T4 # h3*r0
  1649. vpmuludq $H4,$T0,$H2 # h4*r0
  1650. vmovdqu 16*0($inp),%x#$T0 # load input
  1651. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1652. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1653. vinserti128 \$1,16*2($inp),$T0,$T0
  1654. vpmuludq $H3,$T1,$T4 # h3*s2
  1655. vpmuludq $H4,$T1,$H2 # h4*s2
  1656. vmovdqu 16*1($inp),%x#$T1
  1657. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1658. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1659. vmovdqa `32*5-0x90`(%rax),$H2 # r3
  1660. vpmuludq $H1,$T2,$T4 # h1*r2
  1661. vpmuludq $H0,$T2,$T2 # h0*r2
  1662. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1663. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1664. vinserti128 \$1,16*3($inp),$T1,$T1
  1665. lea 16*4($inp),$inp
  1666. vpmuludq $H1,$H2,$T4 # h1*r3
  1667. vpmuludq $H0,$H2,$H2 # h0*r3
  1668. vpsrldq \$6,$T0,$T2 # splat input
  1669. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1670. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1671. vpmuludq $H3,$T3,$T4 # h3*s3
  1672. vpmuludq $H4,$T3,$H2 # h4*s3
  1673. vpsrldq \$6,$T1,$T3
  1674. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1675. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1676. vpunpckhqdq $T1,$T0,$T4 # 4
  1677. vpmuludq $H3,$S4,$H3 # h3*s4
  1678. vpmuludq $H4,$S4,$H4 # h4*s4
  1679. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1680. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
  1681. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
  1682. vpunpcklqdq $T3,$T2,$T3 # 2:3
  1683. vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
  1684. vpmuludq $H1,$S4,$H0 # h1*s4
  1685. vmovdqa 64(%rcx),$MASK # .Lmask26
  1686. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1687. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1688. ################################################################
  1689. # lazy reduction (interleaved with tail of input splat)
  1690. vpsrlq \$26,$H3,$D3
  1691. vpand $MASK,$H3,$H3
  1692. vpaddq $D3,$H4,$H4 # h3 -> h4
  1693. vpsrlq \$26,$H0,$D0
  1694. vpand $MASK,$H0,$H0
  1695. vpaddq $D0,$D1,$H1 # h0 -> h1
  1696. vpsrlq \$26,$H4,$D4
  1697. vpand $MASK,$H4,$H4
  1698. vpsrlq \$4,$T3,$T2
  1699. vpsrlq \$26,$H1,$D1
  1700. vpand $MASK,$H1,$H1
  1701. vpaddq $D1,$H2,$H2 # h1 -> h2
  1702. vpaddq $D4,$H0,$H0
  1703. vpsllq \$2,$D4,$D4
  1704. vpaddq $D4,$H0,$H0 # h4 -> h0
  1705. vpand $MASK,$T2,$T2 # 2
  1706. vpsrlq \$26,$T0,$T1
  1707. vpsrlq \$26,$H2,$D2
  1708. vpand $MASK,$H2,$H2
  1709. vpaddq $D2,$H3,$H3 # h2 -> h3
  1710. vpaddq $T2,$H2,$H2 # modulo-scheduled
  1711. vpsrlq \$30,$T3,$T3
  1712. vpsrlq \$26,$H0,$D0
  1713. vpand $MASK,$H0,$H0
  1714. vpaddq $D0,$H1,$H1 # h0 -> h1
  1715. vpsrlq \$40,$T4,$T4 # 4
  1716. vpsrlq \$26,$H3,$D3
  1717. vpand $MASK,$H3,$H3
  1718. vpaddq $D3,$H4,$H4 # h3 -> h4
  1719. vpand $MASK,$T0,$T0 # 0
  1720. vpand $MASK,$T1,$T1 # 1
  1721. vpand $MASK,$T3,$T3 # 3
  1722. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1723. sub \$64,$len
  1724. jnz .Loop_avx2
  1725. .byte 0x66,0x90
  1726. .Ltail_avx2:
  1727. ################################################################
1728. # while the above multiplications were by r^4 in all lanes, in the last
1729. # iteration we multiply the least significant lane by r^4 and the most
1730. # significant one by r, so this is a copy of the above except that
1731. # references to the precomputed table are displaced by 4...
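# In other words, lane k of this last iteration has to be multiplied
# by r^(4-k) instead of r^4, and the table was laid out so that
# starting the loads 4 bytes further in delivers exactly that set of
# multipliers, with no separate code path needed.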
  1732. #vpaddq $H2,$T2,$H2 # accumulate input
  1733. vpaddq $H0,$T0,$H0
  1734. vmovdqu `32*0+4`(%rsp),$T0 # r0^4
  1735. vpaddq $H1,$T1,$H1
  1736. vmovdqu `32*1+4`(%rsp),$T1 # r1^4
  1737. vpaddq $H3,$T3,$H3
  1738. vmovdqu `32*3+4`(%rsp),$T2 # r2^4
  1739. vpaddq $H4,$T4,$H4
  1740. vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
  1741. vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
  1742. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1743. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1744. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1745. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1746. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1747. vpmuludq $H0,$T1,$T4 # h0*r1
  1748. vpmuludq $H1,$T1,$H2 # h1*r1
  1749. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1750. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1751. vpmuludq $H3,$T1,$T4 # h3*r1
  1752. vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
  1753. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1754. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1755. vpmuludq $H0,$T0,$T4 # h0*r0
  1756. vpmuludq $H1,$T0,$H2 # h1*r0
  1757. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1758. vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
  1759. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1760. vpmuludq $H3,$T0,$T4 # h3*r0
  1761. vpmuludq $H4,$T0,$H2 # h4*r0
  1762. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1763. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1764. vpmuludq $H3,$T1,$T4 # h3*s2
  1765. vpmuludq $H4,$T1,$H2 # h4*s2
  1766. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1767. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1768. vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
  1769. vpmuludq $H1,$T2,$T4 # h1*r2
  1770. vpmuludq $H0,$T2,$T2 # h0*r2
  1771. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1772. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1773. vpmuludq $H1,$H2,$T4 # h1*r3
  1774. vpmuludq $H0,$H2,$H2 # h0*r3
  1775. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1776. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1777. vpmuludq $H3,$T3,$T4 # h3*s3
  1778. vpmuludq $H4,$T3,$H2 # h4*s3
  1779. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1780. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1781. vpmuludq $H3,$S4,$H3 # h3*s4
  1782. vpmuludq $H4,$S4,$H4 # h4*s4
  1783. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
  1784. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
  1785. vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
  1786. vpmuludq $H1,$S4,$H0 # h1*s4
  1787. vmovdqa 64(%rcx),$MASK # .Lmask26
  1788. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1789. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1790. ################################################################
  1791. # horizontal addition
  1792. vpsrldq \$8,$D1,$T1
  1793. vpsrldq \$8,$H2,$T2
  1794. vpsrldq \$8,$H3,$T3
  1795. vpsrldq \$8,$H4,$T4
  1796. vpsrldq \$8,$H0,$T0
  1797. vpaddq $T1,$D1,$D1
  1798. vpaddq $T2,$H2,$H2
  1799. vpaddq $T3,$H3,$H3
  1800. vpaddq $T4,$H4,$H4
  1801. vpaddq $T0,$H0,$H0
  1802. vpermq \$0x2,$H3,$T3
  1803. vpermq \$0x2,$H4,$T4
  1804. vpermq \$0x2,$H0,$T0
  1805. vpermq \$0x2,$D1,$T1
  1806. vpermq \$0x2,$H2,$T2
  1807. vpaddq $T3,$H3,$H3
  1808. vpaddq $T4,$H4,$H4
  1809. vpaddq $T0,$H0,$H0
  1810. vpaddq $T1,$D1,$D1
  1811. vpaddq $T2,$H2,$H2
  1812. ################################################################
  1813. # lazy reduction
  1814. vpsrlq \$26,$H3,$D3
  1815. vpand $MASK,$H3,$H3
  1816. vpaddq $D3,$H4,$H4 # h3 -> h4
  1817. vpsrlq \$26,$H0,$D0
  1818. vpand $MASK,$H0,$H0
  1819. vpaddq $D0,$D1,$H1 # h0 -> h1
  1820. vpsrlq \$26,$H4,$D4
  1821. vpand $MASK,$H4,$H4
  1822. vpsrlq \$26,$H1,$D1
  1823. vpand $MASK,$H1,$H1
  1824. vpaddq $D1,$H2,$H2 # h1 -> h2
  1825. vpaddq $D4,$H0,$H0
  1826. vpsllq \$2,$D4,$D4
  1827. vpaddq $D4,$H0,$H0 # h4 -> h0
  1828. vpsrlq \$26,$H2,$D2
  1829. vpand $MASK,$H2,$H2
  1830. vpaddq $D2,$H3,$H3 # h2 -> h3
  1831. vpsrlq \$26,$H0,$D0
  1832. vpand $MASK,$H0,$H0
  1833. vpaddq $D0,$H1,$H1 # h0 -> h1
  1834. vpsrlq \$26,$H3,$D3
  1835. vpand $MASK,$H3,$H3
  1836. vpaddq $D3,$H4,$H4 # h3 -> h4
  1837. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  1838. vmovd %x#$H1,`4*1-48-64`($ctx)
  1839. vmovd %x#$H2,`4*2-48-64`($ctx)
  1840. vmovd %x#$H3,`4*3-48-64`($ctx)
  1841. vmovd %x#$H4,`4*4-48-64`($ctx)
  1842. ___
  1843. $code.=<<___ if ($win64);
  1844. vmovdqa 0x50(%r11),%xmm6
  1845. vmovdqa 0x60(%r11),%xmm7
  1846. vmovdqa 0x70(%r11),%xmm8
  1847. vmovdqa 0x80(%r11),%xmm9
  1848. vmovdqa 0x90(%r11),%xmm10
  1849. vmovdqa 0xa0(%r11),%xmm11
  1850. vmovdqa 0xb0(%r11),%xmm12
  1851. vmovdqa 0xc0(%r11),%xmm13
  1852. vmovdqa 0xd0(%r11),%xmm14
  1853. vmovdqa 0xe0(%r11),%xmm15
  1854. lea 0xf8(%r11),%rsp
  1855. .Ldo_avx2_epilogue:
  1856. ___
  1857. $code.=<<___ if (!$win64);
  1858. lea 8(%r11),%rsp
  1859. .cfi_def_cfa %rsp,8
  1860. ___
  1861. $code.=<<___;
  1862. vzeroupper
  1863. ret
  1864. .cfi_endproc
  1865. .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
  1866. ___
  1867. #######################################################################
  1868. if ($avx>2) {
1869. # On entry we have an input length divisible by 64. But since the inner
1870. # loop processes 128 bytes per iteration, cases when the length is not
1871. # divisible by 128 are handled by passing the tail 64 bytes to .Ltail_avx2.
1872. # For this reason the stack layout is kept identical to poly1305_blocks_avx2.
1873. # If not for this tail, we wouldn't even have to allocate a stack frame...
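# In practice a length that is an odd multiple of 64 runs the 128-byte
# loop as far as it can and then jumps into .Ltail_avx2 with the final
# 64 bytes, reusing the AVX2 tail and hence the same on-stack table.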
  1874. my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
  1875. my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
  1876. my $PADBIT="%zmm30";
  1877. map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
  1878. map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
  1879. map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
  1880. map(s/%y/%z/,($MASK));
  1881. $code.=<<___;
  1882. .type poly1305_blocks_avx512,\@function,4
  1883. .align 32
  1884. poly1305_blocks_avx512:
  1885. .cfi_startproc
  1886. endbranch
  1887. .Lblocks_avx512:
  1888. mov \$15,%eax
  1889. kmovw %eax,%k2
  1890. ___
  1891. $code.=<<___ if (!$win64);
  1892. lea -8(%rsp),%r11
  1893. .cfi_def_cfa %r11,16
  1894. sub \$0x128,%rsp
  1895. ___
  1896. $code.=<<___ if ($win64);
  1897. lea -0xf8(%rsp),%r11
  1898. sub \$0x1c8,%rsp
  1899. vmovdqa %xmm6,0x50(%r11)
  1900. vmovdqa %xmm7,0x60(%r11)
  1901. vmovdqa %xmm8,0x70(%r11)
  1902. vmovdqa %xmm9,0x80(%r11)
  1903. vmovdqa %xmm10,0x90(%r11)
  1904. vmovdqa %xmm11,0xa0(%r11)
  1905. vmovdqa %xmm12,0xb0(%r11)
  1906. vmovdqa %xmm13,0xc0(%r11)
  1907. vmovdqa %xmm14,0xd0(%r11)
  1908. vmovdqa %xmm15,0xe0(%r11)
  1909. .Ldo_avx512_body:
  1910. ___
  1911. $code.=<<___;
  1912. lea .Lconst(%rip),%rcx
  1913. lea 48+64($ctx),$ctx # size optimization
  1914. vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
  1915. # expand pre-calculated table
  1916. vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
  1917. and \$-512,%rsp
  1918. vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
  1919. mov \$0x20,%rax
  1920. vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
  1921. vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
  1922. vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
  1923. vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
  1924. vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
  1925. vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
  1926. vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
  1927. vpermd $D0,$T2,$R0 # 00003412 -> 14243444
  1928. vpbroadcastq 64(%rcx),$MASK # .Lmask26
  1929. vpermd $D1,$T2,$R1
  1930. vpermd $T0,$T2,$S1
  1931. vpermd $D2,$T2,$R2
  1932. vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
  1933. vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
  1934. vpermd $T1,$T2,$S2
  1935. vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
  1936. vpsrlq \$32,$R1,$T1
  1937. vpermd $D3,$T2,$R3
  1938. vmovdqa64 $S1,0x40(%rsp){%k2}
  1939. vpermd $T3,$T2,$S3
  1940. vpermd $D4,$T2,$R4
  1941. vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
  1942. vpermd $T4,$T2,$S4
  1943. vmovdqa64 $S2,0x80(%rsp){%k2}
  1944. vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
  1945. vmovdqa64 $S3,0xc0(%rsp){%k2}
  1946. vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
  1947. vmovdqa64 $S4,0x100(%rsp){%k2}
  1948. ################################################################
  1949. # calculate 5th through 8th powers of the key
  1950. #
  1951. # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
  1952. # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
  1953. # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
  1954. # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
  1955. # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
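# Roughly: the 32-bit right shifts leave rN' holding the limbs of
# r^1, r^2, r^3 and r^4, one power per 64-bit lane, while R0-R4 still
# supply r^4 in every lane, so this one extra multiplication produces
# r^5..r^8, i.e. the 05060708 vector referred to further down.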
  1956. vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
  1957. vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
  1958. vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
  1959. vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
  1960. vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
  1961. vpsrlq \$32,$R2,$T2
  1962. vpmuludq $T1,$S4,$M0
  1963. vpmuludq $T1,$R0,$M1
  1964. vpmuludq $T1,$R1,$M2
  1965. vpmuludq $T1,$R2,$M3
  1966. vpmuludq $T1,$R3,$M4
  1967. vpsrlq \$32,$R3,$T3
  1968. vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
  1969. vpaddq $M1,$D1,$D1 # d1 += r1'*r0
  1970. vpaddq $M2,$D2,$D2 # d2 += r1'*r1
  1971. vpaddq $M3,$D3,$D3 # d3 += r1'*r2
  1972. vpaddq $M4,$D4,$D4 # d4 += r1'*r3
  1973. vpmuludq $T2,$S3,$M0
  1974. vpmuludq $T2,$S4,$M1
  1975. vpmuludq $T2,$R1,$M3
  1976. vpmuludq $T2,$R2,$M4
  1977. vpmuludq $T2,$R0,$M2
  1978. vpsrlq \$32,$R4,$T4
  1979. vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
  1980. vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
  1981. vpaddq $M3,$D3,$D3 # d3 += r2'*r1
  1982. vpaddq $M4,$D4,$D4 # d4 += r2'*r2
  1983. vpaddq $M2,$D2,$D2 # d2 += r2'*r0
  1984. vpmuludq $T3,$S2,$M0
  1985. vpmuludq $T3,$R0,$M3
  1986. vpmuludq $T3,$R1,$M4
  1987. vpmuludq $T3,$S3,$M1
  1988. vpmuludq $T3,$S4,$M2
  1989. vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
  1990. vpaddq $M3,$D3,$D3 # d3 += r3'*r0
  1991. vpaddq $M4,$D4,$D4 # d4 += r3'*r1
  1992. vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
  1993. vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
  1994. vpmuludq $T4,$S4,$M3
  1995. vpmuludq $T4,$R0,$M4
  1996. vpmuludq $T4,$S1,$M0
  1997. vpmuludq $T4,$S2,$M1
  1998. vpmuludq $T4,$S3,$M2
1999. vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2000. vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2001. vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2002. vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2003. vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
  2004. ################################################################
  2005. # load input
  2006. vmovdqu64 16*0($inp),%z#$T3
  2007. vmovdqu64 16*4($inp),%z#$T4
  2008. lea 16*8($inp),$inp
  2009. ################################################################
  2010. # lazy reduction
  2011. vpsrlq \$26,$D3,$M3
  2012. vpandq $MASK,$D3,$D3
  2013. vpaddq $M3,$D4,$D4 # d3 -> d4
  2014. vpsrlq \$26,$D0,$M0
  2015. vpandq $MASK,$D0,$D0
  2016. vpaddq $M0,$D1,$D1 # d0 -> d1
  2017. vpsrlq \$26,$D4,$M4
  2018. vpandq $MASK,$D4,$D4
  2019. vpsrlq \$26,$D1,$M1
  2020. vpandq $MASK,$D1,$D1
  2021. vpaddq $M1,$D2,$D2 # d1 -> d2
  2022. vpaddq $M4,$D0,$D0
  2023. vpsllq \$2,$M4,$M4
  2024. vpaddq $M4,$D0,$D0 # d4 -> d0
  2025. vpsrlq \$26,$D2,$M2
  2026. vpandq $MASK,$D2,$D2
  2027. vpaddq $M2,$D3,$D3 # d2 -> d3
  2028. vpsrlq \$26,$D0,$M0
  2029. vpandq $MASK,$D0,$D0
  2030. vpaddq $M0,$D1,$D1 # d0 -> d1
  2031. vpsrlq \$26,$D3,$M3
  2032. vpandq $MASK,$D3,$D3
  2033. vpaddq $M3,$D4,$D4 # d3 -> d4
  2034. ################################################################
  2035. # at this point we have 14243444 in $R0-$S4 and 05060708 in
  2036. # $D0-$D4, ...
  2037. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2038. vpunpckhqdq $T4,$T3,$T4
  2039. # ... since input 64-bit lanes are ordered as 73625140, we could
  2040. # "vperm" it to 76543210 (here and in each loop iteration), *or*
  2041. # we could just flow along, hence the goal for $R0-$S4 is
  2042. # 1858286838784888 ...
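# Roughly: .Lpermd_avx512 spreads the 14243444 dwords out to every
# fourth slot (1---2---3---4---), and the masked vpermd with
# %k1 = 0x7777 then drops the 5th..8th powers from D0-D4 into the
# remaining slots, so each 64-bit lane of R0-S4 ends up holding the
# power its input lane will be multiplied by.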
  2043. vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
  2044. mov \$0x7777,%eax
  2045. kmovw %eax,%k1
  2046. vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
  2047. vpermd $R1,$M0,$R1
  2048. vpermd $R2,$M0,$R2
  2049. vpermd $R3,$M0,$R3
  2050. vpermd $R4,$M0,$R4
  2051. vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
  2052. vpermd $D1,$M0,${R1}{%k1}
  2053. vpermd $D2,$M0,${R2}{%k1}
  2054. vpermd $D3,$M0,${R3}{%k1}
  2055. vpermd $D4,$M0,${R4}{%k1}
  2056. vpslld \$2,$R1,$S1 # *5
  2057. vpslld \$2,$R2,$S2
  2058. vpslld \$2,$R3,$S3
  2059. vpslld \$2,$R4,$S4
  2060. vpaddd $R1,$S1,$S1
  2061. vpaddd $R2,$S2,$S2
  2062. vpaddd $R3,$S3,$S3
  2063. vpaddd $R4,$S4,$S4
  2064. vpbroadcastq 32(%rcx),$PADBIT # .L129
  2065. vpsrlq \$52,$T0,$T2 # splat input
  2066. vpsllq \$12,$T4,$T3
  2067. vporq $T3,$T2,$T2
  2068. vpsrlq \$26,$T0,$T1
  2069. vpsrlq \$14,$T4,$T3
  2070. vpsrlq \$40,$T4,$T4 # 4
  2071. vpandq $MASK,$T2,$T2 # 2
  2072. vpandq $MASK,$T0,$T0 # 0
  2073. #vpandq $MASK,$T1,$T1 # 1
  2074. #vpandq $MASK,$T3,$T3 # 3
  2075. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2076. vpaddq $H2,$T2,$H2 # accumulate input
  2077. sub \$192,$len
  2078. jbe .Ltail_avx512
  2079. jmp .Loop_avx512
  2080. .align 32
  2081. .Loop_avx512:
  2082. ################################################################
  2083. # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
  2084. # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
  2085. # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
  2086. # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
  2087. # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
  2088. # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
  2089. # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
  2090. # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
  2091. # \________/\___________/
  2092. ################################################################
  2093. #vpaddq $H2,$T2,$H2 # accumulate input
  2094. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  2095. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  2096. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  2097. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  2098. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  2099. #
2100. # however, as h2 is "chronologically" the first one available, pull the
2101. # corresponding operations up, so it's
  2102. #
  2103. # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
  2104. # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
  2105. # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
  2106. # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
  2107. # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
  2108. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2109. vpaddq $H0,$T0,$H0
  2110. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2111. vpandq $MASK,$T1,$T1 # 1
  2112. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2113. vpandq $MASK,$T3,$T3 # 3
  2114. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2115. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2116. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2117. vpaddq $H1,$T1,$H1 # accumulate input
  2118. vpaddq $H3,$T3,$H3
  2119. vpaddq $H4,$T4,$H4
  2120. vmovdqu64 16*0($inp),$T3 # load input
  2121. vmovdqu64 16*4($inp),$T4
  2122. lea 16*8($inp),$inp
  2123. vpmuludq $H0,$R3,$M3
  2124. vpmuludq $H0,$R4,$M4
  2125. vpmuludq $H0,$R0,$M0
  2126. vpmuludq $H0,$R1,$M1
  2127. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2128. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2129. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2130. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2131. vpmuludq $H1,$R2,$M3
  2132. vpmuludq $H1,$R3,$M4
  2133. vpmuludq $H1,$S4,$M0
  2134. vpmuludq $H0,$R2,$M2
  2135. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2136. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2137. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2138. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2139. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2140. vpunpckhqdq $T4,$T3,$T4
  2141. vpmuludq $H3,$R0,$M3
  2142. vpmuludq $H3,$R1,$M4
  2143. vpmuludq $H1,$R0,$M1
  2144. vpmuludq $H1,$R1,$M2
  2145. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2146. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2147. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2148. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2149. vpmuludq $H4,$S4,$M3
  2150. vpmuludq $H4,$R0,$M4
  2151. vpmuludq $H3,$S2,$M0
  2152. vpmuludq $H3,$S3,$M1
  2153. vpaddq $M3,$D3,$D3 # d3 += h4*s4
  2154. vpmuludq $H3,$S4,$M2
  2155. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2156. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2157. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2158. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2159. vpmuludq $H4,$S1,$M0
  2160. vpmuludq $H4,$S2,$M1
  2161. vpmuludq $H4,$S3,$M2
  2162. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2163. vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2164. vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2165. ################################################################
  2166. # lazy reduction (interleaved with input splat)
  2167. vpsrlq \$52,$T0,$T2 # splat input
  2168. vpsllq \$12,$T4,$T3
  2169. vpsrlq \$26,$D3,$H3
  2170. vpandq $MASK,$D3,$D3
  2171. vpaddq $H3,$D4,$H4 # h3 -> h4
  2172. vporq $T3,$T2,$T2
  2173. vpsrlq \$26,$H0,$D0
  2174. vpandq $MASK,$H0,$H0
  2175. vpaddq $D0,$H1,$H1 # h0 -> h1
  2176. vpandq $MASK,$T2,$T2 # 2
  2177. vpsrlq \$26,$H4,$D4
  2178. vpandq $MASK,$H4,$H4
  2179. vpsrlq \$26,$H1,$D1
  2180. vpandq $MASK,$H1,$H1
  2181. vpaddq $D1,$H2,$H2 # h1 -> h2
  2182. vpaddq $D4,$H0,$H0
  2183. vpsllq \$2,$D4,$D4
  2184. vpaddq $D4,$H0,$H0 # h4 -> h0
  2185. vpaddq $T2,$H2,$H2 # modulo-scheduled
  2186. vpsrlq \$26,$T0,$T1
  2187. vpsrlq \$26,$H2,$D2
  2188. vpandq $MASK,$H2,$H2
  2189. vpaddq $D2,$D3,$H3 # h2 -> h3
  2190. vpsrlq \$14,$T4,$T3
  2191. vpsrlq \$26,$H0,$D0
  2192. vpandq $MASK,$H0,$H0
  2193. vpaddq $D0,$H1,$H1 # h0 -> h1
  2194. vpsrlq \$40,$T4,$T4 # 4
  2195. vpsrlq \$26,$H3,$D3
  2196. vpandq $MASK,$H3,$H3
  2197. vpaddq $D3,$H4,$H4 # h3 -> h4
  2198. vpandq $MASK,$T0,$T0 # 0
  2199. #vpandq $MASK,$T1,$T1 # 1
  2200. #vpandq $MASK,$T3,$T3 # 3
  2201. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2202. sub \$128,$len
  2203. ja .Loop_avx512
  2204. .Ltail_avx512:
  2205. ################################################################
2206. # while the above multiplications were by r^8 in all lanes, in the last
2207. # iteration we multiply the least significant lane by r^8 and the most
2208. # significant one by r, which is why the table gets shifted...
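# In effect the 32-bit shift swaps the uniform r^8 multiplier for the
# per-lane powers r^8,r^4,r^7,r^3,r^6,r^2,r^5,r^1, matching the
# 73625140 ordering of the input lanes, so the oldest block still
# gets r^8 and the newest gets r^1.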
  2209. vpsrlq \$32,$R0,$R0 # 0105020603070408
  2210. vpsrlq \$32,$R1,$R1
  2211. vpsrlq \$32,$R2,$R2
  2212. vpsrlq \$32,$S3,$S3
  2213. vpsrlq \$32,$S4,$S4
  2214. vpsrlq \$32,$R3,$R3
  2215. vpsrlq \$32,$R4,$R4
  2216. vpsrlq \$32,$S1,$S1
  2217. vpsrlq \$32,$S2,$S2
  2218. ################################################################
2219. # load either the next or the last 64 bytes of input
  2220. lea ($inp,$len),$inp
  2221. #vpaddq $H2,$T2,$H2 # accumulate input
  2222. vpaddq $H0,$T0,$H0
  2223. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2224. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2225. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2226. vpandq $MASK,$T1,$T1 # 1
  2227. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2228. vpandq $MASK,$T3,$T3 # 3
  2229. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2230. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2231. vpaddq $H1,$T1,$H1 # accumulate input
  2232. vpaddq $H3,$T3,$H3
  2233. vpaddq $H4,$T4,$H4
  2234. vmovdqu 16*0($inp),%x#$T0
  2235. vpmuludq $H0,$R3,$M3
  2236. vpmuludq $H0,$R4,$M4
  2237. vpmuludq $H0,$R0,$M0
  2238. vpmuludq $H0,$R1,$M1
  2239. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2240. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2241. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2242. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2243. vmovdqu 16*1($inp),%x#$T1
  2244. vpmuludq $H1,$R2,$M3
  2245. vpmuludq $H1,$R3,$M4
  2246. vpmuludq $H1,$S4,$M0
  2247. vpmuludq $H0,$R2,$M2
  2248. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2249. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2250. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2251. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2252. vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
  2253. vpmuludq $H3,$R0,$M3
  2254. vpmuludq $H3,$R1,$M4
  2255. vpmuludq $H1,$R0,$M1
  2256. vpmuludq $H1,$R1,$M2
  2257. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2258. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2259. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2260. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2261. vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
  2262. vpmuludq $H4,$S4,$M3
  2263. vpmuludq $H4,$R0,$M4
  2264. vpmuludq $H3,$S2,$M0
  2265. vpmuludq $H3,$S3,$M1
  2266. vpmuludq $H3,$S4,$M2
  2267. vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
  2268. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2269. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2270. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2271. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2272. vpmuludq $H4,$S1,$M0
  2273. vpmuludq $H4,$S2,$M1
  2274. vpmuludq $H4,$S3,$M2
  2275. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2276. vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2277. vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2278. ################################################################
  2279. # horizontal addition
  2280. mov \$1,%eax
  2281. vpermq \$0xb1,$H3,$D3
  2282. vpermq \$0xb1,$D4,$H4
  2283. vpermq \$0xb1,$H0,$D0
  2284. vpermq \$0xb1,$H1,$D1
  2285. vpermq \$0xb1,$H2,$D2
  2286. vpaddq $D3,$H3,$H3
  2287. vpaddq $D4,$H4,$H4
  2288. vpaddq $D0,$H0,$H0
  2289. vpaddq $D1,$H1,$H1
  2290. vpaddq $D2,$H2,$H2
  2291. kmovw %eax,%k3
  2292. vpermq \$0x2,$H3,$D3
  2293. vpermq \$0x2,$H4,$D4
  2294. vpermq \$0x2,$H0,$D0
  2295. vpermq \$0x2,$H1,$D1
  2296. vpermq \$0x2,$H2,$D2
  2297. vpaddq $D3,$H3,$H3
  2298. vpaddq $D4,$H4,$H4
  2299. vpaddq $D0,$H0,$H0
  2300. vpaddq $D1,$H1,$H1
  2301. vpaddq $D2,$H2,$H2
  2302. vextracti64x4 \$0x1,$H3,%y#$D3
  2303. vextracti64x4 \$0x1,$H4,%y#$D4
  2304. vextracti64x4 \$0x1,$H0,%y#$D0
  2305. vextracti64x4 \$0x1,$H1,%y#$D1
  2306. vextracti64x4 \$0x1,$H2,%y#$D2
  2307. vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
  2308. vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
  2309. vpaddq $D0,$H0,${H0}{%k3}{z}
  2310. vpaddq $D1,$H1,${H1}{%k3}{z}
  2311. vpaddq $D2,$H2,${H2}{%k3}{z}
  2312. ___
  2313. map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
  2314. map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
  2315. $code.=<<___;
  2316. ################################################################
  2317. # lazy reduction (interleaved with input splat)
  2318. vpsrlq \$26,$H3,$D3
  2319. vpand $MASK,$H3,$H3
  2320. vpsrldq \$6,$T0,$T2 # splat input
  2321. vpsrldq \$6,$T1,$T3
  2322. vpunpckhqdq $T1,$T0,$T4 # 4
  2323. vpaddq $D3,$H4,$H4 # h3 -> h4
  2324. vpsrlq \$26,$H0,$D0
  2325. vpand $MASK,$H0,$H0
  2326. vpunpcklqdq $T3,$T2,$T2 # 2:3
  2327. vpunpcklqdq $T1,$T0,$T0 # 0:1
  2328. vpaddq $D0,$H1,$H1 # h0 -> h1
  2329. vpsrlq \$26,$H4,$D4
  2330. vpand $MASK,$H4,$H4
  2331. vpsrlq \$26,$H1,$D1
  2332. vpand $MASK,$H1,$H1
  2333. vpsrlq \$30,$T2,$T3
  2334. vpsrlq \$4,$T2,$T2
  2335. vpaddq $D1,$H2,$H2 # h1 -> h2
  2336. vpaddq $D4,$H0,$H0
  2337. vpsllq \$2,$D4,$D4
  2338. vpsrlq \$26,$T0,$T1
  2339. vpsrlq \$40,$T4,$T4 # 4
  2340. vpaddq $D4,$H0,$H0 # h4 -> h0
  2341. vpsrlq \$26,$H2,$D2
  2342. vpand $MASK,$H2,$H2
  2343. vpand $MASK,$T2,$T2 # 2
  2344. vpand $MASK,$T0,$T0 # 0
  2345. vpaddq $D2,$H3,$H3 # h2 -> h3
  2346. vpsrlq \$26,$H0,$D0
  2347. vpand $MASK,$H0,$H0
  2348. vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
  2349. vpand $MASK,$T1,$T1 # 1
  2350. vpaddq $D0,$H1,$H1 # h0 -> h1
  2351. vpsrlq \$26,$H3,$D3
  2352. vpand $MASK,$H3,$H3
  2353. vpand $MASK,$T3,$T3 # 3
  2354. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  2355. vpaddq $D3,$H4,$H4 # h3 -> h4
  2356. lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
  2357. add \$64,$len
  2358. jnz .Ltail_avx2
  2359. vpsubq $T2,$H2,$H2 # undo input accumulation
  2360. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  2361. vmovd %x#$H1,`4*1-48-64`($ctx)
  2362. vmovd %x#$H2,`4*2-48-64`($ctx)
  2363. vmovd %x#$H3,`4*3-48-64`($ctx)
  2364. vmovd %x#$H4,`4*4-48-64`($ctx)
  2365. vzeroall
  2366. ___
  2367. $code.=<<___ if ($win64);
  2368. movdqa 0x50(%r11),%xmm6
  2369. movdqa 0x60(%r11),%xmm7
  2370. movdqa 0x70(%r11),%xmm8
  2371. movdqa 0x80(%r11),%xmm9
  2372. movdqa 0x90(%r11),%xmm10
  2373. movdqa 0xa0(%r11),%xmm11
  2374. movdqa 0xb0(%r11),%xmm12
  2375. movdqa 0xc0(%r11),%xmm13
  2376. movdqa 0xd0(%r11),%xmm14
  2377. movdqa 0xe0(%r11),%xmm15
  2378. lea 0xf8(%r11),%rsp
  2379. .Ldo_avx512_epilogue:
  2380. ___
  2381. $code.=<<___ if (!$win64);
  2382. lea 8(%r11),%rsp
  2383. .cfi_def_cfa %rsp,8
  2384. ___
  2385. $code.=<<___;
  2386. ret
  2387. .cfi_endproc
  2388. .size poly1305_blocks_avx512,.-poly1305_blocks_avx512
  2389. ___
  2390. if ($avx>3 && !$win64) {
  2391. ########################################################################
  2392. # VPMADD52 version using 2^44 radix.
  2393. #
2394. # One can argue that base 2^52 would be more natural. Well, even though
2395. # some operations would be more natural, one has to recognize a couple
2396. # of things. First, base 2^52 provides no advantage over base 2^44 in
2397. # the number of multiply-and-accumulate operations. Secondly, it makes
2398. # it impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2399. # reference implementations], which means that more such operations
2400. # would have to be performed in the inner loop, which in turn makes the
2401. # critical path longer. In other words, even though base 2^44 reduction
2402. # might look less elegant, the overall critical path is actually shorter...
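# Roughly, the accumulator is kept as
#	h = h[0] + h[1]*2^44 + h[2]*2^88
# with 44+44+42-bit limbs (hence the mask44/mask42 constants below),
# and a carry c out of the top limb folds back into h[0] as
# 5*c = c + (c<<2), again because 2^130 == 5 modulo 2^130-5.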
  2403. ########################################################################
2404. # The layout of the opaque area is as follows.
  2405. #
  2406. # unsigned __int64 h[3]; # current hash value base 2^44
  2407. # unsigned __int64 s[2]; # key value*20 base 2^44
  2408. # unsigned __int64 r[3]; # key value base 2^44
  2409. # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
  2410. # # r^n positions reflect
  2411. # # placement in register, not
  2412. # # memory, R[3] is R[1]*20
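# In byte offsets that is, roughly: h[0..2] at 0..23, s1/s2 (i.e.
# 20*r1 and 20*r2) at 24..39, r0..r2 at 40..63, and the R[4] power
# table from 64 on, with the qword at offset 64 doubling as the
# "powers not computed yet" marker (init writes -1 there).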
  2413. $code.=<<___;
  2414. .type poly1305_init_base2_44,\@function,3
  2415. .align 32
  2416. poly1305_init_base2_44:
  2417. .cfi_startproc
  2418. xor %rax,%rax
  2419. mov %rax,0($ctx) # initialize hash value
  2420. mov %rax,8($ctx)
  2421. mov %rax,16($ctx)
  2422. .Linit_base2_44:
  2423. lea poly1305_blocks_vpmadd52(%rip),%r10
  2424. lea poly1305_emit_base2_44(%rip),%r11
  2425. mov \$0x0ffffffc0fffffff,%rax
  2426. mov \$0x0ffffffc0ffffffc,%rcx
  2427. and 0($inp),%rax
  2428. mov \$0x00000fffffffffff,%r8
  2429. and 8($inp),%rcx
  2430. mov \$0x00000fffffffffff,%r9
  2431. and %rax,%r8
  2432. shrd \$44,%rcx,%rax
  2433. mov %r8,40($ctx) # r0
  2434. and %r9,%rax
  2435. shr \$24,%rcx
  2436. mov %rax,48($ctx) # r1
  2437. lea (%rax,%rax,4),%rax # *5
  2438. mov %rcx,56($ctx) # r2
  2439. shl \$2,%rax # magic <<2
  2440. lea (%rcx,%rcx,4),%rcx # *5
  2441. shl \$2,%rcx # magic <<2
  2442. mov %rax,24($ctx) # s1
  2443. mov %rcx,32($ctx) # s2
  2444. movq \$-1,64($ctx) # write impossible value
  2445. ___
  2446. $code.=<<___ if ($flavour !~ /elf32/);
  2447. mov %r10,0(%rdx)
  2448. mov %r11,8(%rdx)
  2449. ___
  2450. $code.=<<___ if ($flavour =~ /elf32/);
  2451. mov %r10d,0(%rdx)
  2452. mov %r11d,4(%rdx)
  2453. ___
  2454. $code.=<<___;
  2455. mov \$1,%eax
  2456. ret
  2457. .cfi_endproc
  2458. .size poly1305_init_base2_44,.-poly1305_init_base2_44
  2459. ___
  2460. {
  2461. my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
  2462. my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
  2463. my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
  2464. $code.=<<___;
  2465. .type poly1305_blocks_vpmadd52,\@function,4
  2466. .align 32
  2467. poly1305_blocks_vpmadd52:
  2468. .cfi_startproc
  2469. endbranch
  2470. shr \$4,$len
  2471. jz .Lno_data_vpmadd52 # too short
  2472. shl \$40,$padbit
  2473. mov 64($ctx),%r8 # peek on power of the key
2474. # if the powers of the key are not calculated yet, process up to 3
2475. # blocks with this single-block subroutine; otherwise ensure that the
2476. # length is divisible by 2 blocks and pass the rest down to the next
2477. # subroutine...
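# In effect: if the powers are not ready and fewer than 4 blocks
# remain, everything is handled here one block at a time (rax = len);
# otherwise at most one odd leading block is peeled off (rax = len&1)
# so that an even block count is passed down to the 4x code.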
  2478. mov \$3,%rax
  2479. mov \$1,%r10
  2480. cmp \$4,$len # is input long
  2481. cmovae %r10,%rax
  2482. test %r8,%r8 # is power value impossible?
  2483. cmovns %r10,%rax
  2484. and $len,%rax # is input of favourable length?
  2485. jz .Lblocks_vpmadd52_4x
  2486. sub %rax,$len
  2487. mov \$7,%r10d
  2488. mov \$1,%r11d
  2489. kmovw %r10d,%k7
  2490. lea .L2_44_inp_permd(%rip),%r10
  2491. kmovw %r11d,%k1
  2492. vmovq $padbit,%x#$PAD
  2493. vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
  2494. vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
  2495. vpermq \$0xcf,$PAD,$PAD
  2496. vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
  2497. vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
  2498. vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
  2499. vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
  2500. vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
  2501. vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
  2502. vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
  2503. jmp .Loop_vpmadd52
  2504. .align 32
  2505. .Loop_vpmadd52:
  2506. vmovdqu32 0($inp),%x#$T0 # load input as ----3210
  2507. lea 16($inp),$inp
  2508. vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
  2509. vpsrlvq $inp_shift,$T0,$T0
  2510. vpandq $reduc_mask,$T0,$T0
  2511. vporq $PAD,$T0,$T0
  2512. vpaddq $T0,$Dlo,$Dlo # accumulate input
  2513. vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
  2514. vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
  2515. vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
  2516. vpxord $Dlo,$Dlo,$Dlo
  2517. vpxord $Dhi,$Dhi,$Dhi
  2518. vpmadd52luq $r2r1r0,$H0,$Dlo
  2519. vpmadd52huq $r2r1r0,$H0,$Dhi
  2520. vpmadd52luq $r1r0s2,$H1,$Dlo
  2521. vpmadd52huq $r1r0s2,$H1,$Dhi
  2522. vpmadd52luq $r0s2s1,$H2,$Dlo
  2523. vpmadd52huq $r0s2s1,$H2,$Dhi
  2524. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
  2525. vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
  2526. vpandq $reduc_mask,$Dlo,$Dlo
  2527. vpaddq $T0,$Dhi,$Dhi
  2528. vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
  2529. vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
  2530. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
  2531. vpandq $reduc_mask,$Dlo,$Dlo
  2532. vpermq \$0b10010011,$T0,$T0
  2533. vpaddq $T0,$Dlo,$Dlo
  2534. vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
  2535. vpaddq $T0,$Dlo,$Dlo
  2536. vpsllq \$2,$T0,$T0
  2537. vpaddq $T0,$Dlo,$Dlo
  2538. dec %rax # len-=16
  2539. jnz .Loop_vpmadd52
  2540. vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
  2541. test $len,$len
  2542. jnz .Lblocks_vpmadd52_4x
  2543. .Lno_data_vpmadd52:
  2544. ret
  2545. .cfi_endproc
  2546. .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
  2547. ___
  2548. }
  2549. {
  2550. ########################################################################
2551. # As implied by its name, the 4x subroutine processes 4 blocks in
2552. # parallel (but also handles lengths of 4*n+2 blocks). It takes up to
2553. # the 4th key power and operates in 256-bit %ymm registers.
  2554. my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
  2555. my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
  2556. my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
  2557. $code.=<<___;
  2558. .type poly1305_blocks_vpmadd52_4x,\@function,4
  2559. .align 32
  2560. poly1305_blocks_vpmadd52_4x:
  2561. .cfi_startproc
  2562. shr \$4,$len
  2563. jz .Lno_data_vpmadd52_4x # too short
  2564. shl \$40,$padbit
  2565. mov 64($ctx),%r8 # peek on power of the key
  2566. .Lblocks_vpmadd52_4x:
  2567. vpbroadcastq $padbit,$PAD
  2568. vmovdqa64 .Lx_mask44(%rip),$mask44
  2569. mov \$5,%eax
  2570. vmovdqa64 .Lx_mask42(%rip),$mask42
  2571. kmovw %eax,%k1 # used in 2x path
  2572. test %r8,%r8 # is power value impossible?
  2573. js .Linit_vpmadd52 # if it is, then init R[4]
  2574. vmovq 0($ctx),%x#$H0 # load current hash value
  2575. vmovq 8($ctx),%x#$H1
  2576. vmovq 16($ctx),%x#$H2
  2577. test \$3,$len # is length 4*n+2?
  2578. jnz .Lblocks_vpmadd52_2x_do
  2579. .Lblocks_vpmadd52_4x_do:
  2580. vpbroadcastq 64($ctx),$R0 # load 4th power of the key
  2581. vpbroadcastq 96($ctx),$R1
  2582. vpbroadcastq 128($ctx),$R2
  2583. vpbroadcastq 160($ctx),$S1
  2584. .Lblocks_vpmadd52_4x_key_loaded:
  2585. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2586. vpaddq $R2,$S2,$S2
  2587. vpsllq \$2,$S2,$S2
  2588. test \$7,$len # is len 8*n?
  2589. jz .Lblocks_vpmadd52_8x
  2590. vmovdqu64 16*0($inp),$T2 # load data
  2591. vmovdqu64 16*2($inp),$T3
  2592. lea 16*4($inp),$inp
  2593. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2594. vpunpckhqdq $T3,$T2,$T3
  2595. # at this point 64-bit lanes are ordered as 3-1-2-0
  2596. vpsrlq \$24,$T3,$T2 # splat the data
  2597. vporq $PAD,$T2,$T2
  2598. vpaddq $T2,$H2,$H2 # accumulate input
  2599. vpandq $mask44,$T1,$T0
  2600. vpsrlq \$44,$T1,$T1
  2601. vpsllq \$20,$T3,$T3
  2602. vporq $T3,$T1,$T1
  2603. vpandq $mask44,$T1,$T1
  2604. sub \$4,$len
  2605. jz .Ltail_vpmadd52_4x
  2606. jmp .Loop_vpmadd52_4x
  2607. ud2
  2608. .align 32
  2609. .Linit_vpmadd52:
  2610. vmovq 24($ctx),%x#$S1 # load key
  2611. vmovq 56($ctx),%x#$H2
  2612. vmovq 32($ctx),%x#$S2
  2613. vmovq 40($ctx),%x#$R0
  2614. vmovq 48($ctx),%x#$R1
  2615. vmovdqa $R0,$H0
  2616. vmovdqa $R1,$H1
  2617. vmovdqa $H2,$R2
  2618. mov \$2,%eax
  2619. .Lmul_init_vpmadd52:
  2620. vpxorq $D0lo,$D0lo,$D0lo
  2621. vpmadd52luq $H2,$S1,$D0lo
  2622. vpxorq $D0hi,$D0hi,$D0hi
  2623. vpmadd52huq $H2,$S1,$D0hi
  2624. vpxorq $D1lo,$D1lo,$D1lo
  2625. vpmadd52luq $H2,$S2,$D1lo
  2626. vpxorq $D1hi,$D1hi,$D1hi
  2627. vpmadd52huq $H2,$S2,$D1hi
  2628. vpxorq $D2lo,$D2lo,$D2lo
  2629. vpmadd52luq $H2,$R0,$D2lo
  2630. vpxorq $D2hi,$D2hi,$D2hi
  2631. vpmadd52huq $H2,$R0,$D2hi
  2632. vpmadd52luq $H0,$R0,$D0lo
  2633. vpmadd52huq $H0,$R0,$D0hi
  2634. vpmadd52luq $H0,$R1,$D1lo
  2635. vpmadd52huq $H0,$R1,$D1hi
  2636. vpmadd52luq $H0,$R2,$D2lo
  2637. vpmadd52huq $H0,$R2,$D2hi
  2638. vpmadd52luq $H1,$S2,$D0lo
  2639. vpmadd52huq $H1,$S2,$D0hi
  2640. vpmadd52luq $H1,$R0,$D1lo
  2641. vpmadd52huq $H1,$R0,$D1hi
  2642. vpmadd52luq $H1,$R1,$D2lo
  2643. vpmadd52huq $H1,$R1,$D2hi
  2644. ################################################################
  2645. # partial reduction
  2646. vpsrlq \$44,$D0lo,$tmp
  2647. vpsllq \$8,$D0hi,$D0hi
  2648. vpandq $mask44,$D0lo,$H0
  2649. vpaddq $tmp,$D0hi,$D0hi
  2650. vpaddq $D0hi,$D1lo,$D1lo
  2651. vpsrlq \$44,$D1lo,$tmp
  2652. vpsllq \$8,$D1hi,$D1hi
  2653. vpandq $mask44,$D1lo,$H1
  2654. vpaddq $tmp,$D1hi,$D1hi
  2655. vpaddq $D1hi,$D2lo,$D2lo
  2656. vpsrlq \$42,$D2lo,$tmp
  2657. vpsllq \$10,$D2hi,$D2hi
  2658. vpandq $mask42,$D2lo,$H2
  2659. vpaddq $tmp,$D2hi,$D2hi
  2660. vpaddq $D2hi,$H0,$H0
  2661. vpsllq \$2,$D2hi,$D2hi
  2662. vpaddq $D2hi,$H0,$H0
  2663. vpsrlq \$44,$H0,$tmp # additional step
  2664. vpandq $mask44,$H0,$H0
  2665. vpaddq $tmp,$H1,$H1
  2666. dec %eax
  2667. jz .Ldone_init_vpmadd52
  2668. vpunpcklqdq $R1,$H1,$R1 # 1,2
  2669. vpbroadcastq %x#$H1,%x#$H1 # 2,2
  2670. vpunpcklqdq $R2,$H2,$R2
  2671. vpbroadcastq %x#$H2,%x#$H2
  2672. vpunpcklqdq $R0,$H0,$R0
  2673. vpbroadcastq %x#$H0,%x#$H0
  2674. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2675. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2676. vpaddq $R1,$S1,$S1
  2677. vpaddq $R2,$S2,$S2
  2678. vpsllq \$2,$S1,$S1
  2679. vpsllq \$2,$S2,$S2
  2680. jmp .Lmul_init_vpmadd52
  2681. ud2
  2682. .align 32
  2683. .Ldone_init_vpmadd52:
  2684. vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
  2685. vinserti128 \$1,%x#$R2,$H2,$R2
  2686. vinserti128 \$1,%x#$R0,$H0,$R0
  2687. vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
  2688. vpermq \$0b11011000,$R2,$R2
  2689. vpermq \$0b11011000,$R0,$R0
  2690. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2691. vpaddq $R1,$S1,$S1
  2692. vpsllq \$2,$S1,$S1
  2693. vmovq 0($ctx),%x#$H0 # load current hash value
  2694. vmovq 8($ctx),%x#$H1
  2695. vmovq 16($ctx),%x#$H2
  2696. test \$3,$len # is length 4*n+2?
  2697. jnz .Ldone_init_vpmadd52_2x
  2698. vmovdqu64 $R0,64($ctx) # save key powers
  2699. vpbroadcastq %x#$R0,$R0 # broadcast 4th power
  2700. vmovdqu64 $R1,96($ctx)
  2701. vpbroadcastq %x#$R1,$R1
  2702. vmovdqu64 $R2,128($ctx)
  2703. vpbroadcastq %x#$R2,$R2
  2704. vmovdqu64 $S1,160($ctx)
  2705. vpbroadcastq %x#$S1,$S1
  2706. jmp .Lblocks_vpmadd52_4x_key_loaded
  2707. ud2
  2708. .align 32
  2709. .Ldone_init_vpmadd52_2x:
  2710. vmovdqu64 $R0,64($ctx) # save key powers
  2711. vpsrldq \$8,$R0,$R0 # 0-1-0-2
  2712. vmovdqu64 $R1,96($ctx)
  2713. vpsrldq \$8,$R1,$R1
  2714. vmovdqu64 $R2,128($ctx)
  2715. vpsrldq \$8,$R2,$R2
  2716. vmovdqu64 $S1,160($ctx)
  2717. vpsrldq \$8,$S1,$S1
  2718. jmp .Lblocks_vpmadd52_2x_key_loaded
  2719. ud2
  2720. .align 32
  2721. .Lblocks_vpmadd52_2x_do:
  2722. vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
  2723. vmovdqu64 160+8($ctx),${S1}{%k1}{z}
  2724. vmovdqu64 64+8($ctx),${R0}{%k1}{z}
  2725. vmovdqu64 96+8($ctx),${R1}{%k1}{z}
  2726. .Lblocks_vpmadd52_2x_key_loaded:
  2727. vmovdqu64 16*0($inp),$T2 # load data
  2728. vpxorq $T3,$T3,$T3
  2729. lea 16*2($inp),$inp
  2730. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2731. vpunpckhqdq $T3,$T2,$T3
  2732. # at this point 64-bit lanes are ordered as x-1-x-0
  2733. vpsrlq \$24,$T3,$T2 # splat the data
  2734. vporq $PAD,$T2,$T2
  2735. vpaddq $T2,$H2,$H2 # accumulate input
  2736. vpandq $mask44,$T1,$T0
  2737. vpsrlq \$44,$T1,$T1
  2738. vpsllq \$20,$T3,$T3
  2739. vporq $T3,$T1,$T1
  2740. vpandq $mask44,$T1,$T1
  2741. jmp .Ltail_vpmadd52_2x
  2742. ud2
  2743. .align 32
  2744. .Loop_vpmadd52_4x:
  2745. #vpaddq $T2,$H2,$H2 # accumulate input
  2746. vpaddq $T0,$H0,$H0
  2747. vpaddq $T1,$H1,$H1
  2748. vpxorq $D0lo,$D0lo,$D0lo
  2749. vpmadd52luq $H2,$S1,$D0lo
  2750. vpxorq $D0hi,$D0hi,$D0hi
  2751. vpmadd52huq $H2,$S1,$D0hi
  2752. vpxorq $D1lo,$D1lo,$D1lo
  2753. vpmadd52luq $H2,$S2,$D1lo
  2754. vpxorq $D1hi,$D1hi,$D1hi
  2755. vpmadd52huq $H2,$S2,$D1hi
  2756. vpxorq $D2lo,$D2lo,$D2lo
  2757. vpmadd52luq $H2,$R0,$D2lo
  2758. vpxorq $D2hi,$D2hi,$D2hi
  2759. vpmadd52huq $H2,$R0,$D2hi
  2760. vmovdqu64 16*0($inp),$T2 # load data
  2761. vmovdqu64 16*2($inp),$T3
  2762. lea 16*4($inp),$inp
  2763. vpmadd52luq $H0,$R0,$D0lo
  2764. vpmadd52huq $H0,$R0,$D0hi
  2765. vpmadd52luq $H0,$R1,$D1lo
  2766. vpmadd52huq $H0,$R1,$D1hi
  2767. vpmadd52luq $H0,$R2,$D2lo
  2768. vpmadd52huq $H0,$R2,$D2hi
  2769. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2770. vpunpckhqdq $T3,$T2,$T3
  2771. vpmadd52luq $H1,$S2,$D0lo
  2772. vpmadd52huq $H1,$S2,$D0hi
  2773. vpmadd52luq $H1,$R0,$D1lo
  2774. vpmadd52huq $H1,$R0,$D1hi
  2775. vpmadd52luq $H1,$R1,$D2lo
  2776. vpmadd52huq $H1,$R1,$D2hi
  2777. ################################################################
  2778. # partial reduction (interleaved with data splat)
  2779. vpsrlq \$44,$D0lo,$tmp
  2780. vpsllq \$8,$D0hi,$D0hi
  2781. vpandq $mask44,$D0lo,$H0
  2782. vpaddq $tmp,$D0hi,$D0hi
  2783. vpsrlq \$24,$T3,$T2
  2784. vporq $PAD,$T2,$T2
  2785. vpaddq $D0hi,$D1lo,$D1lo
  2786. vpsrlq \$44,$D1lo,$tmp
  2787. vpsllq \$8,$D1hi,$D1hi
  2788. vpandq $mask44,$D1lo,$H1
  2789. vpaddq $tmp,$D1hi,$D1hi
  2790. vpandq $mask44,$T1,$T0
  2791. vpsrlq \$44,$T1,$T1
  2792. vpsllq \$20,$T3,$T3
  2793. vpaddq $D1hi,$D2lo,$D2lo
  2794. vpsrlq \$42,$D2lo,$tmp
  2795. vpsllq \$10,$D2hi,$D2hi
  2796. vpandq $mask42,$D2lo,$H2
  2797. vpaddq $tmp,$D2hi,$D2hi
  2798. vpaddq $T2,$H2,$H2 # accumulate input
  2799. vpaddq $D2hi,$H0,$H0
  2800. vpsllq \$2,$D2hi,$D2hi
  2801. vpaddq $D2hi,$H0,$H0
  2802. vporq $T3,$T1,$T1
  2803. vpandq $mask44,$T1,$T1
  2804. vpsrlq \$44,$H0,$tmp # additional step
  2805. vpandq $mask44,$H0,$H0
  2806. vpaddq $tmp,$H1,$H1
  2807. sub \$4,$len # len-=64
  2808. jnz .Loop_vpmadd52_4x
  2809. .Ltail_vpmadd52_4x:
  2810. vmovdqu64 128($ctx),$R2 # load all key powers
  2811. vmovdqu64 160($ctx),$S1
  2812. vmovdqu64 64($ctx),$R0
  2813. vmovdqu64 96($ctx),$R1
  2814. .Ltail_vpmadd52_2x:
  2815. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2816. vpaddq $R2,$S2,$S2
  2817. vpsllq \$2,$S2,$S2
  2818. #vpaddq $T2,$H2,$H2 # accumulate input
  2819. vpaddq $T0,$H0,$H0
  2820. vpaddq $T1,$H1,$H1
  2821. vpxorq $D0lo,$D0lo,$D0lo
  2822. vpmadd52luq $H2,$S1,$D0lo
  2823. vpxorq $D0hi,$D0hi,$D0hi
  2824. vpmadd52huq $H2,$S1,$D0hi
  2825. vpxorq $D1lo,$D1lo,$D1lo
  2826. vpmadd52luq $H2,$S2,$D1lo
  2827. vpxorq $D1hi,$D1hi,$D1hi
  2828. vpmadd52huq $H2,$S2,$D1hi
  2829. vpxorq $D2lo,$D2lo,$D2lo
  2830. vpmadd52luq $H2,$R0,$D2lo
  2831. vpxorq $D2hi,$D2hi,$D2hi
  2832. vpmadd52huq $H2,$R0,$D2hi
  2833. vpmadd52luq $H0,$R0,$D0lo
  2834. vpmadd52huq $H0,$R0,$D0hi
  2835. vpmadd52luq $H0,$R1,$D1lo
  2836. vpmadd52huq $H0,$R1,$D1hi
  2837. vpmadd52luq $H0,$R2,$D2lo
  2838. vpmadd52huq $H0,$R2,$D2hi
  2839. vpmadd52luq $H1,$S2,$D0lo
  2840. vpmadd52huq $H1,$S2,$D0hi
  2841. vpmadd52luq $H1,$R0,$D1lo
  2842. vpmadd52huq $H1,$R0,$D1hi
  2843. vpmadd52luq $H1,$R1,$D2lo
  2844. vpmadd52huq $H1,$R1,$D2hi
  2845. ################################################################
  2846. # horizontal addition
  2847. mov \$1,%eax
  2848. kmovw %eax,%k1
  2849. vpsrldq \$8,$D0lo,$T0
  2850. vpsrldq \$8,$D0hi,$H0
  2851. vpsrldq \$8,$D1lo,$T1
  2852. vpsrldq \$8,$D1hi,$H1
  2853. vpaddq $T0,$D0lo,$D0lo
  2854. vpaddq $H0,$D0hi,$D0hi
  2855. vpsrldq \$8,$D2lo,$T2
  2856. vpsrldq \$8,$D2hi,$H2
  2857. vpaddq $T1,$D1lo,$D1lo
  2858. vpaddq $H1,$D1hi,$D1hi
  2859. vpermq \$0x2,$D0lo,$T0
  2860. vpermq \$0x2,$D0hi,$H0
  2861. vpaddq $T2,$D2lo,$D2lo
  2862. vpaddq $H2,$D2hi,$D2hi
  2863. vpermq \$0x2,$D1lo,$T1
  2864. vpermq \$0x2,$D1hi,$H1
  2865. vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
  2866. vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
  2867. vpermq \$0x2,$D2lo,$T2
  2868. vpermq \$0x2,$D2hi,$H2
  2869. vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
  2870. vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
  2871. vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
  2872. vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
  2873. ################################################################
  2874. # partial reduction
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$H0
vpaddq $tmp,$D0hi,$D0hi
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$H1
vpaddq $tmp,$D1hi,$D1hi
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$H2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsrlq \$44,$H0,$tmp # additional step
vpandq $mask44,$H0,$H0
vpaddq $tmp,$H1,$H1
# at this point $len is
# either 4*n+2 or 0...
sub \$2,$len # len-=32
ja .Lblocks_vpmadd52_4x_do
vmovq %x#$H0,0($ctx)
vmovq %x#$H1,8($ctx)
vmovq %x#$H2,16($ctx)
vzeroall
.Lno_data_vpmadd52_4x:
ret
.cfi_endproc
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
}
{
########################################################################
# As implied by its name, the 8x subroutine processes 8 blocks in parallel.
# This is an intermediate version: it's used only when the input length is
# 8*n, 8*n+1 or 8*n+2...
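# The underlying scheme (a rough summary, not part of the original
# comments): with the key powers r^1..r^8 at hand the hash is advanced
# over eight blocks at once,
#   H = (H + m[0])*r^8 + m[1]*r^7 + ... + m[6]*r^2 + m[7]*r,
# with each vector lane carrying one of the eight independent products
# until the horizontal addition at the tail.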
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
$code.=<<___;
.type poly1305_blocks_vpmadd52_8x,\@function,4
.align 32
poly1305_blocks_vpmadd52_8x:
.cfi_startproc
shr \$4,$len
jz .Lno_data_vpmadd52_8x # too short
shl \$40,$padbit
mov 64($ctx),%r8 # peek on power of the key
vmovdqa64 .Lx_mask44(%rip),$mask44
vmovdqa64 .Lx_mask42(%rip),$mask42
test %r8,%r8 # is power value impossible?
js .Linit_vpmadd52 # if it is, then init R[4]
vmovq 0($ctx),%x#$H0 # load current hash value
vmovq 8($ctx),%x#$H1
vmovq 16($ctx),%x#$H2
.Lblocks_vpmadd52_8x:
################################################################
# first we calculate more key powers
vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
vmovdqu64 160($ctx),$S1
vmovdqu64 64($ctx),$R0
vmovdqu64 96($ctx),$R1
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
vpaddq $R2,$S2,$S2
vpsllq \$2,$S2,$S2
vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
vpbroadcastq %x#$R0,$RR0
vpbroadcastq %x#$R1,$RR1
vpxorq $D0lo,$D0lo,$D0lo
vpmadd52luq $RR2,$S1,$D0lo
vpxorq $D0hi,$D0hi,$D0hi
vpmadd52huq $RR2,$S1,$D0hi
vpxorq $D1lo,$D1lo,$D1lo
vpmadd52luq $RR2,$S2,$D1lo
vpxorq $D1hi,$D1hi,$D1hi
vpmadd52huq $RR2,$S2,$D1hi
vpxorq $D2lo,$D2lo,$D2lo
vpmadd52luq $RR2,$R0,$D2lo
vpxorq $D2hi,$D2hi,$D2hi
vpmadd52huq $RR2,$R0,$D2hi
vpmadd52luq $RR0,$R0,$D0lo
vpmadd52huq $RR0,$R0,$D0hi
vpmadd52luq $RR0,$R1,$D1lo
vpmadd52huq $RR0,$R1,$D1hi
vpmadd52luq $RR0,$R2,$D2lo
vpmadd52huq $RR0,$R2,$D2hi
vpmadd52luq $RR1,$S2,$D0lo
vpmadd52huq $RR1,$S2,$D0hi
vpmadd52luq $RR1,$R0,$D1lo
vpmadd52huq $RR1,$R0,$D1hi
vpmadd52luq $RR1,$R1,$D2lo
vpmadd52huq $RR1,$R1,$D2hi
################################################################
# partial reduction
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$RR0
vpaddq $tmp,$D0hi,$D0hi
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$RR1
vpaddq $tmp,$D1hi,$D1hi
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$RR2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $D2hi,$RR0,$RR0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$RR0,$RR0
vpsrlq \$44,$RR0,$tmp # additional step
vpandq $mask44,$RR0,$RR0
vpaddq $tmp,$RR1,$RR1
################################################################
# At this point Rx holds 1324 powers, RRx - 5768, and the goal
# is 15263748, which reflects how data is loaded...
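# (That is, the vpunpck{l,h}qdq below interleave the two sets of powers
# qword-wise, and after the switch to %zmm the vshufi64x2 recombines the
# 128-bit halves, so that every key power lands in the lane of the data
# word it will be multiplied with - compare the 73625140 note further
# down.)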
vpunpcklqdq $R2,$RR2,$T2 # 3748
vpunpckhqdq $R2,$RR2,$R2 # 1526
vpunpcklqdq $R0,$RR0,$T0
vpunpckhqdq $R0,$RR0,$R0
vpunpcklqdq $R1,$RR1,$T1
vpunpckhqdq $R1,$RR1,$R1
___
######## switch to %zmm
map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
$code.=<<___;
vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
vshufi64x2 \$0x44,$R0,$T0,$RR0
vshufi64x2 \$0x44,$R1,$T1,$RR1
vmovdqu64 16*0($inp),$T2 # load data
vmovdqu64 16*4($inp),$T3
lea 16*8($inp),$inp
vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
vpaddq $RR2,$SS2,$SS2
vpaddq $RR1,$SS1,$SS1
vpsllq \$2,$SS2,$SS2
vpsllq \$2,$SS1,$SS1
vpbroadcastq $padbit,$PAD
vpbroadcastq %x#$mask44,$mask44
vpbroadcastq %x#$mask42,$mask42
vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
vpbroadcastq %x#$SS2,$S2
vpbroadcastq %x#$RR0,$R0
vpbroadcastq %x#$RR1,$R1
vpbroadcastq %x#$RR2,$R2
vpunpcklqdq $T3,$T2,$T1 # transpose data
vpunpckhqdq $T3,$T2,$T3
# at this point 64-bit lanes are ordered as 73625140
vpsrlq \$24,$T3,$T2 # splat the data
vporq $PAD,$T2,$T2
vpaddq $T2,$H2,$H2 # accumulate input
vpandq $mask44,$T1,$T0
vpsrlq \$44,$T1,$T1
vpsllq \$20,$T3,$T3
vporq $T3,$T1,$T1
vpandq $mask44,$T1,$T1
sub \$8,$len
jz .Ltail_vpmadd52_8x
jmp .Loop_vpmadd52_8x
.align 32
.Loop_vpmadd52_8x:
#vpaddq $T2,$H2,$H2 # accumulate input
vpaddq $T0,$H0,$H0
vpaddq $T1,$H1,$H1
vpxorq $D0lo,$D0lo,$D0lo
vpmadd52luq $H2,$S1,$D0lo
vpxorq $D0hi,$D0hi,$D0hi
vpmadd52huq $H2,$S1,$D0hi
vpxorq $D1lo,$D1lo,$D1lo
vpmadd52luq $H2,$S2,$D1lo
vpxorq $D1hi,$D1hi,$D1hi
vpmadd52huq $H2,$S2,$D1hi
vpxorq $D2lo,$D2lo,$D2lo
vpmadd52luq $H2,$R0,$D2lo
vpxorq $D2hi,$D2hi,$D2hi
vpmadd52huq $H2,$R0,$D2hi
vmovdqu64 16*0($inp),$T2 # load data
vmovdqu64 16*4($inp),$T3
lea 16*8($inp),$inp
vpmadd52luq $H0,$R0,$D0lo
vpmadd52huq $H0,$R0,$D0hi
vpmadd52luq $H0,$R1,$D1lo
vpmadd52huq $H0,$R1,$D1hi
vpmadd52luq $H0,$R2,$D2lo
vpmadd52huq $H0,$R2,$D2hi
vpunpcklqdq $T3,$T2,$T1 # transpose data
vpunpckhqdq $T3,$T2,$T3
vpmadd52luq $H1,$S2,$D0lo
vpmadd52huq $H1,$S2,$D0hi
vpmadd52luq $H1,$R0,$D1lo
vpmadd52huq $H1,$R0,$D1hi
vpmadd52luq $H1,$R1,$D2lo
vpmadd52huq $H1,$R1,$D2hi
################################################################
# partial reduction (interleaved with data splat)
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$H0
vpaddq $tmp,$D0hi,$D0hi
vpsrlq \$24,$T3,$T2
vporq $PAD,$T2,$T2
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$H1
vpaddq $tmp,$D1hi,$D1hi
vpandq $mask44,$T1,$T0
vpsrlq \$44,$T1,$T1
vpsllq \$20,$T3,$T3
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$H2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $T2,$H2,$H2 # accumulate input
vpaddq $D2hi,$H0,$H0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vporq $T3,$T1,$T1
vpandq $mask44,$T1,$T1
vpsrlq \$44,$H0,$tmp # additional step
vpandq $mask44,$H0,$H0
vpaddq $tmp,$H1,$H1
sub \$8,$len # len-=128
jnz .Loop_vpmadd52_8x
.Ltail_vpmadd52_8x:
#vpaddq $T2,$H2,$H2 # accumulate input
vpaddq $T0,$H0,$H0
vpaddq $T1,$H1,$H1
vpxorq $D0lo,$D0lo,$D0lo
vpmadd52luq $H2,$SS1,$D0lo
vpxorq $D0hi,$D0hi,$D0hi
vpmadd52huq $H2,$SS1,$D0hi
vpxorq $D1lo,$D1lo,$D1lo
vpmadd52luq $H2,$SS2,$D1lo
vpxorq $D1hi,$D1hi,$D1hi
vpmadd52huq $H2,$SS2,$D1hi
vpxorq $D2lo,$D2lo,$D2lo
vpmadd52luq $H2,$RR0,$D2lo
vpxorq $D2hi,$D2hi,$D2hi
vpmadd52huq $H2,$RR0,$D2hi
vpmadd52luq $H0,$RR0,$D0lo
vpmadd52huq $H0,$RR0,$D0hi
vpmadd52luq $H0,$RR1,$D1lo
vpmadd52huq $H0,$RR1,$D1hi
vpmadd52luq $H0,$RR2,$D2lo
vpmadd52huq $H0,$RR2,$D2hi
vpmadd52luq $H1,$SS2,$D0lo
vpmadd52huq $H1,$SS2,$D0hi
vpmadd52luq $H1,$RR0,$D1lo
vpmadd52huq $H1,$RR0,$D1hi
vpmadd52luq $H1,$RR1,$D2lo
vpmadd52huq $H1,$RR1,$D2hi
################################################################
# horizontal addition
mov \$1,%eax
kmovw %eax,%k1
vpsrldq \$8,$D0lo,$T0
vpsrldq \$8,$D0hi,$H0
vpsrldq \$8,$D1lo,$T1
vpsrldq \$8,$D1hi,$H1
vpaddq $T0,$D0lo,$D0lo
vpaddq $H0,$D0hi,$D0hi
vpsrldq \$8,$D2lo,$T2
vpsrldq \$8,$D2hi,$H2
vpaddq $T1,$D1lo,$D1lo
vpaddq $H1,$D1hi,$D1hi
vpermq \$0x2,$D0lo,$T0
vpermq \$0x2,$D0hi,$H0
vpaddq $T2,$D2lo,$D2lo
vpaddq $H2,$D2hi,$D2hi
vpermq \$0x2,$D1lo,$T1
vpermq \$0x2,$D1hi,$H1
vpaddq $T0,$D0lo,$D0lo
vpaddq $H0,$D0hi,$D0hi
vpermq \$0x2,$D2lo,$T2
vpermq \$0x2,$D2hi,$H2
vpaddq $T1,$D1lo,$D1lo
vpaddq $H1,$D1hi,$D1hi
vextracti64x4 \$1,$D0lo,%y#$T0
vextracti64x4 \$1,$D0hi,%y#$H0
vpaddq $T2,$D2lo,$D2lo
vpaddq $H2,$D2hi,$D2hi
vextracti64x4 \$1,$D1lo,%y#$T1
vextracti64x4 \$1,$D1hi,%y#$H1
vextracti64x4 \$1,$D2lo,%y#$T2
vextracti64x4 \$1,$D2hi,%y#$H2
___
######## switch back to %ymm
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
$code.=<<___;
vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
################################################################
# partial reduction
vpsrlq \$44,$D0lo,$tmp
vpsllq \$8,$D0hi,$D0hi
vpandq $mask44,$D0lo,$H0
vpaddq $tmp,$D0hi,$D0hi
vpaddq $D0hi,$D1lo,$D1lo
vpsrlq \$44,$D1lo,$tmp
vpsllq \$8,$D1hi,$D1hi
vpandq $mask44,$D1lo,$H1
vpaddq $tmp,$D1hi,$D1hi
vpaddq $D1hi,$D2lo,$D2lo
vpsrlq \$42,$D2lo,$tmp
vpsllq \$10,$D2hi,$D2hi
vpandq $mask42,$D2lo,$H2
vpaddq $tmp,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsllq \$2,$D2hi,$D2hi
vpaddq $D2hi,$H0,$H0
vpsrlq \$44,$H0,$tmp # additional step
vpandq $mask44,$H0,$H0
vpaddq $tmp,$H1,$H1
################################################################
vmovq %x#$H0,0($ctx)
vmovq %x#$H1,8($ctx)
vmovq %x#$H2,16($ctx)
vzeroall
.Lno_data_vpmadd52_8x:
ret
.cfi_endproc
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
$code.=<<___;
.type poly1305_emit_base2_44,\@function,3
.align 32
poly1305_emit_base2_44:
.cfi_startproc
endbranch
mov 0($ctx),%r8 # load hash value
mov 8($ctx),%r9
mov 16($ctx),%r10
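# Convert the base 2^44 limbs back to two 64-bit words plus two top bits
# (h = h0 + h1*2^44 + h2*2^88):
#   bits   0..63  = h0 | h1<<44
#   bits  64..127 = h1>>20 | h2<<24
#   bits 128..129 = h2>>40
# which is what the shift/add/adc sequence below assembles in %r8:%r9:%r10.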
mov %r9,%rax
shr \$20,%r9
shl \$44,%rax
mov %r10,%rcx
shr \$40,%r10
shl \$24,%rcx
add %rax,%r8
adc %rcx,%r9
adc \$0,%r10
mov %r8,%rax
add \$5,%r8 # compare to modulus
mov %r9,%rcx
adc \$0,%r9
adc \$0,%r10
shr \$2,%r10 # did 130-bit value overflow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
add 0($nonce),%rax # accumulate nonce
adc 8($nonce),%rcx
mov %rax,0($mac) # write result
mov %rcx,8($mac)
ret
.cfi_endproc
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
} } }
$code.=<<___;
.section .rodata align=64
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64
.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
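# (.Lx_mask44 and .Lx_mask42 are 2^44-1 and 2^42-1 broadcast across eight
# lanes, matching the 44/44/42-bit limb split used by the VPMADD52 paths.)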
.previous
___
}
$code.=<<___;
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
{ # chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code.=<<___;
.globl xor128_encrypt_n_pad
.type xor128_encrypt_n_pad,\@abi-omnipotent
.align 16
xor128_encrypt_n_pad:
.cfi_startproc
sub $otp,$inp
sub $otp,$out
mov $len,%r10 # put len aside
shr \$4,$len # len / 16
jz .Ltail_enc
nop
.Loop_enc_xmm:
movdqu ($inp,$otp),%xmm0
pxor ($otp),%xmm0
movdqu %xmm0,($out,$otp)
movdqa %xmm0,($otp)
lea 16($otp),$otp
dec $len
jnz .Loop_enc_xmm
and \$15,%r10 # len % 16
jz .Ldone_enc
.Ltail_enc:
mov \$16,$len
sub %r10,$len
xor %eax,%eax
.Loop_enc_byte:
mov ($inp,$otp),%al
xor ($otp),%al
mov %al,($out,$otp)
mov %al,($otp)
lea 1($otp),$otp
dec %r10
jnz .Loop_enc_byte
xor %eax,%eax
.Loop_enc_pad:
mov %al,($otp)
lea 1($otp),$otp
dec $len
jnz .Loop_enc_pad
.Ldone_enc:
mov $otp,%rax
ret
.cfi_endproc
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
.globl xor128_decrypt_n_pad
.type xor128_decrypt_n_pad,\@abi-omnipotent
.align 16
xor128_decrypt_n_pad:
.cfi_startproc
sub $otp,$inp
sub $otp,$out
mov $len,%r10 # put len aside
shr \$4,$len # len / 16
jz .Ltail_dec
nop
.Loop_dec_xmm:
movdqu ($inp,$otp),%xmm0
movdqa ($otp),%xmm1
pxor %xmm0,%xmm1
movdqu %xmm1,($out,$otp)
movdqa %xmm0,($otp)
lea 16($otp),$otp
dec $len
jnz .Loop_dec_xmm
pxor %xmm1,%xmm1
and \$15,%r10 # len % 16
jz .Ldone_dec
.Ltail_dec:
mov \$16,$len
sub %r10,$len
xor %eax,%eax
xor %r11,%r11
.Loop_dec_byte:
mov ($inp,$otp),%r11b
mov ($otp),%al
xor %r11b,%al
mov %al,($out,$otp)
mov %r11b,($otp)
lea 1($otp),$otp
dec %r10
jnz .Loop_dec_byte
xor %eax,%eax
.Loop_dec_pad:
mov %al,($otp)
lea 1($otp),$otp
dec $len
jnz .Loop_dec_pad
.Ldone_dec:
mov $otp,%rax
ret
.cfi_endproc
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
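# (se_handler below restores the general-purpose registers saved by the
# integer code paths, avx_handler additionally copies the stashed
# %xmm6-%xmm15 back into the CONTEXT record, and both then hand the
# frame over to RtlVirtualUnwind.)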
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
.type avx_handler,\@abi-omnipotent
.align 16
avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 208($context),%rax # pull context->R11
lea 0x50(%rax),%rsi
lea 0xf8(%rax),%rax
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT) in qwords
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size avx_handler,.-avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_poly1305_init
.rva .LSEH_end_poly1305_init
.rva .LSEH_info_poly1305_init
.rva .LSEH_begin_poly1305_blocks
.rva .LSEH_end_poly1305_blocks
.rva .LSEH_info_poly1305_blocks
.rva .LSEH_begin_poly1305_emit
.rva .LSEH_end_poly1305_emit
.rva .LSEH_info_poly1305_emit
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_poly1305_blocks_avx
.rva .Lbase2_64_avx
.rva .LSEH_info_poly1305_blocks_avx_1
.rva .Lbase2_64_avx
.rva .Leven_avx
.rva .LSEH_info_poly1305_blocks_avx_2
.rva .Leven_avx
.rva .LSEH_end_poly1305_blocks_avx
.rva .LSEH_info_poly1305_blocks_avx_3
.rva .LSEH_begin_poly1305_emit_avx
.rva .LSEH_end_poly1305_emit_avx
.rva .LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_poly1305_blocks_avx2
.rva .Lbase2_64_avx2
.rva .LSEH_info_poly1305_blocks_avx2_1
.rva .Lbase2_64_avx2
.rva .Leven_avx2
.rva .LSEH_info_poly1305_blocks_avx2_2
.rva .Leven_avx2
.rva .LSEH_end_poly1305_blocks_avx2
.rva .LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_poly1305_blocks_avx512
.rva .LSEH_end_poly1305_blocks_avx512
.rva .LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_poly1305_init:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
.LSEH_info_poly1305_blocks:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_body,.Lblocks_epilogue
.LSEH_info_poly1305_emit:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_emit_avx:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
___
}
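# Final pass over the generated text: backtick expressions are eval()ed,
# the CRYPTOGAMS "#d" suffix is turned into the proper 32-bit register
# name (%rax#d -> %eax, %r8#d -> %r8d), and the %x#/%y#/%z# width
# overrides are resolved to the XMM/YMM/ZMM form of the register.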
foreach (split('\n',$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/%r([a-z]+)#d/%e$1/g;
s/%r([0-9]+)#d/%r$1d/g;
s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";